## Statistical Tests

## 1. Chi-Squared Test

* To find is there any significant relation/association between two variables

In [1]:
import seaborn as sns
import pandas as pd
from scipy import stats
import numpy as np

In [2]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'geyser',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'tips',
 'titanic']

In [3]:
tips_data=sns.load_dataset('tips')
tips_data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## 1. Perform Initial Analysis


In [4]:
tips_data.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [5]:
tips_data.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [6]:
tips_data

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [7]:
tips_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [8]:
sex_smoker_data=pd.crosstab(index=tips_data.sex,columns=tips_data.smoker)
sex_smoker_data

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


In [9]:
chi_square_tscore,pval,df,exp_table=stats.chi2_contingency(sex_smoker_data)

In [10]:
chi_square_tscore,pval,df,exp_table = stats.chi2_contingency(sex_smoker_data)
print('Chi-square score is {}.\np value is {}.\ndf is {}.\nExpected table is \n{}'.format(chi_square_tscore,pval,df,exp_table))

Chi-square score is 0.008763290531773594.
p value is 0.925417020494423.
df is 1.
Expected table is 
[[59.84016393 97.15983607]
 [33.15983607 53.84016393]]


In [11]:
if pval<0.05:
    print("We reject the null hypothesis saying there is a significant relationship b/w smoker and sex")
else:
    print("We do not reject the null hypothesis saying that there is no significant relationship b/w smoker and sex")

We do not reject the null hypothesis saying that there is no significant relationship b/w smoker and sex


## 2. T-Test

* 1 Sample T-Test
* 2 Sample T-Test
* Paired Test 
* 3 Sample Paired Test - Anova


In [12]:
ages=[20,25,30,35,20,19,20,25,27,28,40,35,40,20,23,26,27,28,29,30,31,33,36,21,22]
len(ages)

25

In [13]:
round(np.mean(ages)) #Average age of entire Population

28

In [14]:
random_selection=np.random.choice(ages,size=5)
random_selection

array([20, 20, 29, 25, 21])

In [15]:
from scipy.stats import ttest_1samp #to perform 1 sample ttest

In [16]:
t_value,pval=ttest_1samp(random_selection,28)

In [17]:
if pval<0.05:
    print("We reject the null hypothesis saying there is a significant difference b/w avg age of pop and sample")
else:
    print("We do not reject the null hypothesis saying that there is no ignificant difference b/w avg age of pop and sample")

We reject the null hypothesis saying there is a significant difference b/w avg age of pop and sample


## 2 Sample T-Test

In [18]:
from scipy.stats import ttest_ind

In [19]:
farm_a=[100,120,95,90,105,110,99,92,119,112]
farm_b=[90,98,82,73,110,120,118,120,100,90]
t_stat,pval=ttest_ind(farm_a,farm_b)
print("The t-stat is {} and pval is {}".format(t_stat,pval))


if pval<0.05:
    print("We reject the null hypothesis saying there is a significant difference in weight(gms) b/w Apples from Farm A compared to Farm B")
else:
    print("We do not reject the null hypothesis saying that there is no significant difference in weight(gms) b/w Apples from Farm A compared to Farm B")

The t-stat is 0.6551735466451463 and pval is 0.5206419412775647
We do not reject the null hypothesis saying that there is no significant difference in weight(gms) b/w Apples from Farm A compared to Farm B


## Paired T-Test

In [20]:
from scipy.stats import ttest_rel

In [21]:
before_prog=[100,105,80,75,98]
after_prog=[95,100,70,70,90]
t_stat,pval=ttest_rel(before_prog,after_prog)
print("The t-stat is {} and pval is {}".format(t_stat,pval))

if pval<0.05:
    print("We reject the null hypothesis saying there is a significant difference in weight(Kg's) before and after weight loss program")
else:
    print("We do not reject the null hypothesis saying that there is no significant difference in weight(Kg's) before and after weight loss program")

The t-stat is 6.410486691557944 and pval is 0.003042448497193783
We reject the null hypothesis saying there is a significant difference in weight(Kg's) before and after weight loss program


## ANOVA - more than 2 Samples

In [22]:
from scipy.stats import f_oneway

In [23]:
school_a=[100,98,95,90,89]
school_b=[99,98,96,35,75]
school_c=[70,55,98,92,89]
f_stat,pval=f_oneway(school_a,school_b,school_c)
print("The f-stat is {} and pval is {}".format(f_stat,pval))

if pval<0.05:
    print("We reject the null hypothesis saying there is a significant difference in scores b/w the Students from School A,B and C")
else:
    print("We do not reject the null hypothesis saying that there is no significant difference in scores b/w the Students from School A,B and C")


The f-stat is 0.8624460167233299 and pval is 0.4467184149873274
We do not reject the null hypothesis saying that there is no significant difference in scores b/w the Students from School A,B and C


## 2 Sample Proportion

In [2]:
from scipy import stats
from scipy.stats import ttest_ind
import numpy as np

In [3]:
state1=np.random.binomial(n=1,p=0.37,size=247) #Bernoulli random numbers
state1

array([0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0])

In [4]:
state2=np.random.binomial(n=1,p=0.39,size=308)
state2

array([0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1,

In [1]:
import statsmodels.api as sm

In [10]:
sm.stats.ttest_ind(state1,state2)

(1.2094666332366666, 0.22700042344854038, 553.0)

##### As p_value=0.227 > (assuming α = 0.05) Accept Null Hypothesis p1 not equal to p2 There is significant differnce in population proportions of state1 and state2 who report that they have been placed immediately after education. 