# Tests of hypothesis
- T test (1 sample, 2 samples, paired)
- F-test
- Chi-square test (Goodness of fit, Independence of attributes)

In [1]:
import pandas as pd
import numpy as np
from scipy import stats

# T-test: One Sample
- To test if given sample is coming from population with specified mean mu0

In [2]:
# Wt of 11 people recorded as 76,46,55,70,65,63,64,68,52,53,59. 
# Test if these people are coming from population of people with 
# average wt 65 kg.
d=np.array([76,46,55,70,65,63,54,48,52,53,59])
t=stats.ttest_1samp(d, 65)
print(t[0])
print(t[1])
print("Test Statistics is {} and P-value is {}".format(t[0],t[1]))

-2.3890864363636837
0.038017880640506756
Test Statistics is -2.3890864363636837 and P-value is 0.038017880640506756


In [3]:
from scipy.stats import t
t.cdf(-2.3890864363636837,10)*2

0.038017880640506756

# T-test: Two Sample
- To test if given samples are coming from population with same mean.

In [7]:
# Test if time taken to solve a math problem by school A and school B's students is same or not
# Are these students belongs to same population characterised by Math problem solving ability.
A_time=np.array([120,129,90,108,95,85,109,115,117,102])
B_time=np.array([122,100,119,195,103,129,118,135,118,134,128])
stats.ttest_ind(A_time,B_time)

Ttest_indResult(statistic=-2.2627502063926648, pvalue=0.035558576865823865)

# T-test: Paired
Each value of one group corresponds directly to a value in the other group, 
before and after values in an experiment. Subtract two values and perform a 
one-sample t-test with null mean set to 0.

In [5]:
# Check if there is significant effect of health drink on swimming time.
Before = np.array([302,306,350,342,310,298,285,360,341,360])
After = np.array([342,360,369,380,350,360,380,390,385,390])
stats.ttest_rel(Before,After)

Ttest_relResult(statistic=-6.700402968583348, pvalue=8.849631829519826e-05)

In [6]:
After.mean() - Before.mean()

45.200000000000045

# Chi-squre test
- For checking goodness of fit
- For testing independence of attributes
A general rule is to have cell frequencies more than 5

In [8]:
data = pd.read_csv('Titanic.csv')
data.head()

Unnamed: 0,Sr.No.,class,age,sex,survived
0,1,1st class,adults,man,yes
1,2,1st class,adults,man,yes
2,3,1st class,adults,man,yes
3,4,1st class,adults,man,yes
4,5,1st class,adults,man,yes


In [9]:
count_data=np.array([340,350,345])
print(count_data)
print(np.sum(count_data))
print(stats.chisquare(count_data))

[340 350 345]
1035
Power_divergenceResult(statistic=0.14492753623188406, pvalue=0.9300994453050568)


-Test if following data ('class' of passangers in Titanic data) coming from uniform distribution 
or values are equqlli likely.

In [16]:
count_data=np.array(pd.value_counts(data['class']))
print(count_data)
print(np.sum(count_data))
print(stats.chisquare(count_data))
stats.chisquare(count_data,f_exp=np.repeat(round(len(data['class'])/3),3))

[706 325 285]
1316
Power_divergenceResult(statistic=246.20212765957444, pvalue=3.4505415574260565e-54)


Power_divergenceResult(statistic=246.01594533029612, pvalue=3.7871825666223953e-54)

In [15]:
pd.value_counts(data['class'])

3rd class    706
1st class    325
2nd class    285
Name: class, dtype: int64

-Test if in Titanic data 'class' of passanger and 'survival' depends on each other on not

In [11]:
dfct= pd.crosstab(data['class'],data['survived'])
dfct

survived,no,yes
class,Unnamed: 1_level_1,Unnamed: 2_level_1
1st class,122,203
2nd class,167,118
3rd class,528,178


In [12]:
chi2, p, dof, exp = stats.chi2_contingency(dfct)

In [13]:
print("Test Statistics=",chi2)
print("P-value=",p)
print("D.F.=", dof)
print("Expected Freq=",exp)

Test Statistics= 133.05203598610018
P-value= 1.282677517613224e-29
D.F.= 2
Expected Freq= [[201.76671733 123.23328267]
 [176.93389058 108.06610942]
 [438.2993921  267.7006079 ]]


In [22]:
dfct= pd.crosstab(data['age'],data['survived'])
dfct

survived,no,yes
age,Unnamed: 1_level_1,Unnamed: 2_level_1
adults,765,442
child,52,57


In [23]:
chi2, p, dof, exp = stats.chi2_contingency(dfct)
print("Test Statistics=",chi2)
print("P-value=",p)
print("D.F.=", dof)
print("Expected Freq=",exp)

Test Statistics= 9.778024759256368
P-value= 0.001766099397795589
D.F.= 1
Expected Freq= [[749.33054711 457.66945289]
 [ 67.66945289  41.33054711]]


## F-test
- For equality of variances

In [19]:
import numpy as np

#define F-test function
def f_test(x, y):
    x = np.array(x)
    y = np.array(y)
    f = np.var(x, ddof=1)/np.var(y, ddof=1) #calculate F test statistic 
    dfn = x.size-1 #define degrees of freedom numerator 
    dfd = y.size-1 #define degrees of freedom denominator 
    p = 1-scipy.stats.f.cdf(f, dfn, dfd) #find p-value of F test statistic 
    return f, p

In [20]:
x = [18, 19, 22, 25, 27, 28, 41, 45, 51, 55]
y = [14, 15, 15, 17, 18, 22, 25, 25, 27, 34]

In [21]:
#perform F-test
f_test(x, y)

(4.387122002085505, 0.01912653593238578)

In [18]:
import scipy