In [1]:
import numpy as np #For generating random numbers
import pandas as pd #for working with data frames or say random numbers
import scipy.stats as stats #for performing statistical tests
from scipy.stats import t

In [2]:
np.random.seed(42) #Set a random seed for reproducibility meaning giving a number to initialize a number generator
n = 10000

# Here we are generating a large datset or say the population we intend to test our business hunch on
1. Below dataset is a single column dataset of people. We are assigning random values to each people.
2. This population has equal representation of men and women
3. **loc**: This represents the mean of normal distribution
4. **scale**: This represents the standard deviation of normal distribution
5. **size**: This determines the number of random values we want to generate
6. Generally a population has all of these characteristics inherent, but since we are creating one we need to apply these values to be able to create a population of some kind
# Why do we need normally distributed sample?
1. Because we assume that the population variable is continuous in nature and normally distributed since that is the primary assumption of T test
2. It is a common choice of distribution for modelling continous variables making it convenient to generate random values that approximate real worl data
# What is normal distibution
1. bell curve
2. peak is at mean
3. mean is arithmatic average of all values
4. standard deviation shows average spread of values around the mean.
5. higher deviation larger bell.lower deviation smaller bell
6. -for a normally distributed population; 68% values of population lie under one standard deviation, 95% under 2 and 99.7% under 3 standard deviation
7. real life scenarios involving height, test scores, clicks, impressions, sales, errors etc. roughly follow normal distribution
8. it simplifies calculations and allows for using powerful statistical calculations easily

In [3]:
population = pd.DataFrame({'value':np.random.normal(loc=50, scale=10,size=n)})
population.head()

Unnamed: 0,value
0,54.967142
1,48.617357
2,56.476885
3,65.230299
4,47.658466


# Here we will perform stratified random sampling to ensure that test and control roughly represent similar male and female distribution as populations
1. Assigning genders to populations
# How np.random.choice works
1. takes population['value'] column as an array of values to choose from
2. size specifies the output as in how many choices do we need to make; in our case it is n i.e. equal to population size p specifies the proportion associated with each elemtn in the array. here we choose 50-50 split of both

In [4]:
population['gender']=np.random.choice(['male','female'],size=n,p=[0.5,0.5])
population.head()

Unnamed: 0,value,gender
0,54.967142,male
1,48.617357,male
2,56.476885,male
3,65.230299,male
4,47.658466,female


# performing stratified random sampling
1. We group by the data by gender column
2. we take group_keys = False meaning we do not want to form an extra index in new dataframe
3. apply function applies the mentioned function in its bracket to each element of the dataframe, in this case to each group sample will randomly choose 20% from each group of male and female
4. Here we choose 20% females from female records and similarly 20% male from male records and form a 20% of entire population sample size

In [5]:
stratified_sample = population.groupby('gender',group_keys=False).apply(lambda x:x.sample(frac=0.2))
stratified_sample.head()

Unnamed: 0,value,gender
3963,57.688397,female
3935,53.88312,female
7792,40.749154,female
2263,39.289463,female
6025,51.165785,female


# We get the info of the new DataFrame to ensure we are getting what we need

In [6]:
stratified_sample.info

<bound method DataFrame.info of           value  gender
3963  57.688397  female
3935  53.883120  female
7792  40.749154  female
2263  39.289463  female
6025  51.165785  female
...         ...     ...
8531  35.572852    male
808   62.557561    male
9841  63.345061    male
3110  38.382162    male
4374  59.944474    male

[2000 rows x 2 columns]>

# Now we split the same into test and control groups

In [7]:
groupby_object = stratified_sample.groupby('gender',group_keys=False)
control_sample = groupby_object.apply(lambda x:x.sample(frac=0.5))
control_sample.head()

Unnamed: 0,value,gender
3655,53.641403,female
1096,50.786352,female
1385,52.992926,female
8309,47.34935,female
1487,32.391912,female


In [8]:
control_sample.info

<bound method DataFrame.info of           value  gender
3655  53.641403  female
1096  50.786352  female
1385  52.992926  female
8309  47.349350  female
1487  32.391912  female
...         ...     ...
930   44.448005    male
1681  49.758044    male
1276  73.193295    male
3983  38.765060    male
9070  59.229776    male

[1000 rows x 2 columns]>

In [9]:
test_sample = stratified_sample.drop(index = control_sample.index)
test_sample.head()

Unnamed: 0,value,gender
3935,53.88312,female
2263,39.289463,female
6025,51.165785,female
229,56.795977,female
1630,54.847328,female


In [10]:
test_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 3935 to 9841
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   value   1000 non-null   float64
 1   gender  1000 non-null   object 
dtypes: float64(1), object(1)
memory usage: 23.4+ KB


# Here just test the type of object we just ended up creating to ensure we created what we needed i.e. a DataFrame

In [11]:
type(control_sample),type(test_sample)

(pandas.core.frame.DataFrame, pandas.core.frame.DataFrame)

# Defining Hypothesis
1. H0 = There is no significant difference in the mean of 2 groups
2. H1 = There is significant difference in the mean of 2 groups

In [12]:
t_stat,p_value = stats.ttest_ind(control_sample['value'],test_sample['value'])
print('t_stat: ',t_stat,' p_value: ',p_value)

t_stat:  0.6876953016959468  p_value:  0.49172450195546025


#  Alpha = significane = 0.05%

# Results Interpretation
1. If P_value < Alpha then reject the null hypothesis >> there is a significant difference
2. If P_Value > Alpha then fail to reject the null hypothesis >> There is no significant difference in the two

# Getting minimum and maximum t statistic for given samples
# Steps will be as follows
1. Calculate degrees of freedom
2. calculate the minimum and maximum t statistic

In [13]:
degrees_of_freedom = len(control_sample) + len(control_sample) - 2
degrees_of_freedom

1998

In [14]:
t_min = t.ppf(0.001,degrees_of_freedom)
t_max = t.ppf(0.999,degrees_of_freedom)
t_min,t_max

(-3.094316387347393, 3.0943163873473924)

# t-statistic interpretation
Higher the t value greater the difference between the means