# One tailed t-test

H0 = "avg_speed(new machine) > avg_speed(old machine)" --> one-sided test & the two samples are independent

In [11]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

In [66]:
#data = pd.read_csv('machine.txt', sep = ' ', encoding = 'utf-8')
data = pd.read_csv('files_for_lab/machine.txt', encoding='UTF-16' , sep = '\t')

In [67]:
data

Unnamed: 0,New machine,Old machine
0,42.1,42.7
1,41.0,43.6
2,41.3,43.8
3,41.8,43.3
4,42.4,42.5
5,42.8,43.5
6,43.2,43.1
7,42.3,41.7
8,41.8,44.0
9,42.7,44.1


In [71]:
data.columns

Index(['New machine', '    Old machine'], dtype='object')

In [78]:
sample1 = data['    Old machine']
sample2 = data['New machine']

In [79]:
# computing mean for both sample :
# old machine :

print('the mean with the old machine :', sample1.mean())
print('the mean with the new machine :', sample2.mean())

# it seems the hypothesis is false

the mean with the old machine : 43.230000000000004
the mean with the new machine : 42.14


In [80]:
# same with standard dev :

print('the std with the old machine :', sample1.std(ddof=1))
print('the std with the new machine :', sample2.std(ddof=1))

the std with the old machine : 0.7498888806572157
the std with the new machine : 0.6834552736727638


It seems that the old machine is faster (higher speed in average) but less stable (higher variance)...
Let's compute the p-value :

In [81]:
sp = ( len(sample1) - 1 ) * (sample1.std(ddof=1)**2 ) +  ( len(sample2) - 1 ) * (sample2.std(ddof=1)**2 )
r = np.sqrt( (1/len(sample1)) + (1/len(sample2)) )
t = ( sample1.mean() - sample2.mean() )/ (sp * r)

print("The ratio of the sample variances is {:.2f} which is bigger than 0.5 and smaller than 2".format(sample1.std(ddof=1)/sample2.std(ddof=1)))
print("The t statistic is: {:.2f}".format(t))

The ratio of the sample variances is 1.10 which is bigger than 0.5 and smaller than 2
The t statistic is: 0.26


In [87]:
# Computing now the critical value to check if we reject or accept H0 :
df = len(sample1) + len(sample2) - 2
st.t.ppf(0.95, df)

1.7340636066175354

-1.73 < -0.26, we accept H0 !

In [83]:
# Validating with p-value :
st.t.cdf(t, df)

0.6022566611605088

0.95 > p-value > 0.025, so we accept H0

# Matched Pairs Test

In [29]:
pokemon = pd.read_csv('files_for_lab/pokemon.csv')

In [30]:
pokemon.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


H0 : defense score = attack score // H1 : defense score != attack score
--> two-sided test & dependent samples

In [31]:
attack_vs_def = pokemon[['Attack', 'Defense']]
attack_vs_def

Unnamed: 0,Attack,Defense
0,49,49
1,62,63
2,82,83
3,100,123
4,52,43
...,...,...
795,100,150
796,160,110
797,110,60
798,160,60


In [32]:
attack_vs_def['difference'] = attack_vs_def['Attack'] - attack_vs_def['Defense']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attack_vs_def['difference'] = attack_vs_def['Attack'] - attack_vs_def['Defense']


In [33]:
sample = attack_vs_def.sample(30)
sample.head()

Unnamed: 0,Attack,Defense,difference
758,52,67,-15
133,50,35,15
679,66,84,-18
641,44,50,-6
295,70,70,0


In [34]:
sample_diff_mean, sample_diff_std = sample['difference'].mean(), sample['difference'].std(ddof=1)
sample_diff_mean, sample_diff_std

(11.866666666666667, 35.396018998180296)

In [35]:
t = sample_diff_mean / ( sample_diff_std / np.sqrt(sample.shape[0]) )
print("The mean of our samples differences is: {:.2f}".format(sample_diff_mean))
print("The standard deviation of our samples differences is: {:.2f}".format(sample_diff_std))
print("Our t statistics is: {:.2f}".format(t))

The mean of our samples differences is: 11.87
The standard deviation of our samples differences is: 35.40
Our t statistics is: 1.84


In [37]:
tc = st.t.ppf(1-(0.05/2),df= sample.shape[0] - 1)
tc

2.045229642132703

The negative of our critical value is -2.05, and our t statistic is -1.84 --> -2.05 < -1.84 --> we accept H0.

Let's try with p-value now

In [88]:
st.t.cdf(t,df= sample.shape[0] - 1)

0.6028204268276361

p-value > 0.025, so we accept H0

# ANOVA

STEPS : Null hypothesis - Alternate hypothesis - Level of significance - Test statistic - P-value - F table

In [90]:
data = pd.read_excel('files_for_lab/anova_lab_data.xlsx')

In [91]:
data.head()

Unnamed: 0,Power,Etching Rate
0,160 W,5.43
1,180 W,6.24
2,200 W,8.79
3,160 W,5.71
4,180 W,6.71


In [92]:
data.columns

Index(['Power ', 'Etching Rate'], dtype='object')

In [93]:
data.shape

(15, 2)

The null hypothesis H0 is : 'mean etching rate for power of 160W = mean etching rate for power of 180W = mean etching rate for power of 200W'
The alternative hypothesis H1 : they are different

The significance level alpha = 0.95

In [94]:
# What are the degrees of freedom of model, error terms, and total DoF
degree_freedom = len(data) - 1
degree_freedom

14

In [95]:
st.f_oneway(data[data['Power '] == '160 W']['Etching Rate'],
            data[data['Power '] == '180 W']['Etching Rate'],
            data[data['Power '] == '200 W']['Etching Rate'])

F_onewayResult(statistic=36.87895470100505, pvalue=7.506584272358903e-06)

p-value < 0.05, we reject H0

In [96]:
# F Table
S2t = 0
for power in data['Power '].unique():
    ng = len(data[data['Power '] == power])  
    S2t  += ( ( data[data['Power '] == power]['Etching Rate'].mean() - data['Etching Rate'].mean() ) ** 2) * ng
S2t /= ( data['Power '].nunique() - 1 )  
print("The value of S2t is {:.2f}".format(S2t)) 

The value of S2t is 9.09


In [97]:
S2E = 0
for power in data['Power '].unique():
    for rate in data[data['Power '] == power]['Etching Rate']:
        S2E += ( rate - data[data['Power '] == power]['Etching Rate'].mean() ) ** 2
S2E /= ( len(data) - data['Power '].nunique() ) 

print()
print("The value of S2E is {:.2f}".format(S2E))


The value of S2E is 0.25


In [98]:
# Finally we can compute F
F = S2t / S2E
print("The value of F is {:.2f}".format(F))

The value of F is 36.88


It matches with the first formula giving us the statistic. Let's compute the critical value to cross-validate the p-value-based decision

In [101]:
st.t.ppf(0.95,df= data.shape[0] - 1)

1.7613101357748562

F statistic > critical value --> we indeed reject the hypothesis H0