In [45]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import ttest_1samp, ttest_ind, ttest_rel
from scipy.stats import norm, t

# Improve IQ

Population IQ average = 100

One researcher claims his pill will improve

In [2]:
# The pill is given to a few people and their IQ is tested with following results:
iq_scores = [110, 105, 98, 102, 99, 104, 115, 95]

In [3]:
np.mean(iq_scores)

103.5

H0: mu = 100

Ha: mu > 100

In [4]:
alpha = 0.01

In [5]:
t_stat, p_value = ttest_1samp(iq_scores, 100)

In [6]:
p_value

0.1754994493585011

In [7]:
if p_value < alpha:
    print("Reject H0")
else:
    print("Fail to reject H0")

Fail to reject H0


# Two drugs recovery time

In [8]:
d1 = pd.read_csv("drug_1_recovery.csv")
d1

Unnamed: 0,drug_1
0,8.824208
1,7.477745
2,7.557121
3,7.981314
4,6.827716
...,...
95,6.890506
96,7.725759
97,6.848016
98,7.969997


In [9]:
d1.mean()

drug_1    7.104917
dtype: float64

In [10]:
d2 = pd.read_csv("drug_2_recovery.csv")

In [11]:
d2.mean()

drug_2    8.073423
dtype: float64

In [12]:
len(d1)

100

In [13]:
len(d2)

120

In [14]:
# H0: mu1 = mu2 (both drugs are similar)
# Ha: mu1 != mu2
t_stat, p_value = ttest_ind(d1, d2)
print(p_value)
if p_value < 0.05:
    print("Reject H0")
else:
    print("Fail to reject H0")

[2.55427147e-07]
Reject H0


In [15]:
# Left tailed
# H0: mu1 = mu2
# Ha: mu1 < mu2
t_stat, p_value = ttest_ind(d1, d2, alternative="less")
print(p_value)
if p_value < 0.05:
    print("Reject H0")
else:
    print("Fail to reject H0")

[1.27713574e-07]
Reject H0


In [16]:
# Right tailed
# H0: mu1 = mu2
# Ha: mu1 > mu2
t_stat, p_value = ttest_ind(d1, d2, alternative="greater")
print(p_value)
if p_value < 0.05:
    print("Reject H0")
else:
    print("Fail to reject H0")

[0.99999987]
Fail to reject H0


# Sachin data

In [17]:
df = pd.read_csv("Sachin_ODI.csv")

In [18]:
df.head()

Unnamed: 0,runs,NotOut,mins,bf,fours,sixes,sr,Inns,Opp,Ground,Date,Winner,Won,century
0,13,0,30,15,3,0,86.66,1,New Zealand,Napier,1995-02-16,New Zealand,False,False
1,37,0,75,51,3,1,72.54,2,South Africa,Hamilton,1995-02-18,South Africa,False,False
2,47,0,65,40,7,0,117.5,2,Australia,Dunedin,1995-02-22,India,True,False
3,48,0,37,30,9,1,160.0,2,Bangladesh,Sharjah,1995-04-05,India,True,False
4,4,0,13,9,1,0,44.44,2,Pakistan,Sharjah,1995-04-07,Pakistan,False,False


In [19]:
df_first_innings = df[df["Inns"]==1]
df_second_innings = df[df["Inns"]==2]

In [20]:
df_first_innings["runs"].mean()

46.67058823529412

In [21]:
df_second_innings["runs"].mean()

40.17368421052632

In [22]:
# H0: first innings = second innings (mu1 = mu2)
# Ha: mu1 > mu2
alpha = 0.05
test_stat, p_value = ttest_ind(df_first_innings["runs"], df_second_innings["runs"], alternative="greater")
print(test_stat)
print(p_value)
if p_value < alpha:
    print("Reject H0")
else:
    print("Fail to reject")

1.4612016295532178
0.07241862097379981
Fail to reject


In [23]:
df_won = df[df["Won"]==True]
df_lost = df[df["Won"]==False]

In [24]:
df_won["runs"].mean()

51.0

In [25]:
df_lost["runs"].mean()

35.13068181818182

In [26]:
# H0: mu1 = mu2
# Ha: mu1 > mu2
alpha = 0.05
test_stat, p_value = ttest_ind(df_won["runs"], df_lost["runs"], alternative="greater")
print(test_stat)
print(p_value)
if p_value < alpha:
    print("Reject H0")
else:
    print("Fail to reject")

3.628068563969343
0.00016353077486826558
Reject H0


In [27]:
# H0: mu1 = mu2
# Ha: mu1 < mu2
alpha = 0.05
test_stat, p_value = ttest_ind(df_won["runs"], df_lost["runs"], alternative="less")
print(test_stat)
print(p_value)
if p_value < alpha:
    print("Reject H0")
else:
    print("Fail to reject")

3.628068563969343
0.9998364692251318
Fail to reject


# Problems

The average number of customers entering a store every day is 500,
with a standard deviation of 125.

A marketing company claims to increase this number.
After 70 days, it is seen that the average is 530.
Test the claim of the marketing company at a 95% confidence (or 5% significance)

In [28]:
# H0: mu = 500 (Despite marketing, avg custormers will be 500. "Marketing has no effect")
# Ha: mu > 500

# Test statistic: z-score of the sample mean
# Right tailed

z_stat = (530 - 500) / (125/np.sqrt(70))
p_value = 1 - norm.cdf(z_stat)
alpha = 0.05
print(p_value)
if p_value < alpha:
    print("Reject H0")
else:
    print("Fail to reject H0")

0.022322492581293485
Reject H0


A local football game sees 3.5 goals per match on average, with standard deviation of 0.7. 
A sample of 45 matches was taken. 
What should be the maximum average goal of these 45 matches such that we can continue
to believe the statement that the population average is 3.5 goals, at a 10% significance (90% confidence)

In [29]:
# H0: mu = 3.5
# Ha: mu > 3.5

# Test stat: z-score
# Right tailed

alpha = 0.1
z_critical = norm.ppf(0.9)
x = 3.5 + z_critical * 0.7/np.sqrt(45)
x

3.633729699470687

The number of hours a student spends solving assessment problems across two batches are known.
Test if one batch takes lesser time, significance of 5%

In [30]:
batch_1 = pd.Series([25,30,45,49,47,35,32,42])
batch_2 = pd.Series([45,47,25,22,29,32,27,28,40,49,50,33])

In [31]:
batch_1.mean()

38.125

In [32]:
batch_2.mean()

35.583333333333336

In [33]:
# H0: mu1 = mu2
# Ha: mu1 > mu2
t_stat, p_value = ttest_ind(batch_1, batch_2, alternative="greater")
print("Test statistic = ", t_stat)
print("P-value= ", p_value)
if p_value < 0.05:
    print("Reject H0")
else: 
    print("Fail to reject H0")

Test statistic =  0.5795450171026676
P-value=  0.2847023809445894
Fail to reject H0


# From scratch

In [34]:
def ttest_ind_from_data(d1, d2, alternative="two-sided"):
    """
    d1: pandas Series
    d2: pandas Series
    alternative: {‘two-sided’, ‘less’, ‘greater’}, optional
    """
    n1 = len(d1)
    n2 = len(d2)
    
    m1 = d1.mean()
    m2 = d2.mean()
    
    s1 = d1.std()
    s2 = d2.std()
    
    df = n1 + n2 - 2
    
    s = np.sqrt((((n1-1)*(s1**2)) + ((n2-1)*(s2**2))) / (n1 + n2 - 2))
    
    t_stat = (m1 - m2) / (s*np.sqrt(1/n1+ 1/n2))
    
    if alternative == "two-sided":
        p_value = 2*(1 - t.cdf(t_stat, df=df))
    if alternative == "less":
        p_value = t.cdf(t_stat, df=df)
    if alternative == "greater":
        p_value = 1 - t.cdf(t_stat, df=df)
    print("T-stat = ", t_stat)
    print("P-value = ", p_value)

In [35]:
ttest_ind_from_data(batch_1, batch_2)

T-stat =  0.5795450171026676
P-value =  0.5694047618891789


In [36]:
ttest_ind(batch_1, batch_2)

Ttest_indResult(statistic=0.5795450171026676, pvalue=0.5694047618891788)

In [37]:
norm.cdf(-1.74)

0.040929508978807365

In [38]:
1 - norm.cdf(4.71)

1.2385839573969548e-06

In [39]:
data = [193, 321, 222, 158, 176, 149, 154, 223, 233, 177, 280, 244, 138, 210, 167, 129, 254, 167, 194, 191, 128, 191, 144, 184, 330, 216, 212, 142, 216, 197, 231, 133, 205, 192, 195, 243, 224, 137, 234, 171, 176, 249, 222, 234, 191]

In [40]:
ttest_1samp(data, 190, alternative = "greater")

Ttest_1sampResult(statistic=1.3689029903414232, pvalue=0.08898891556150607)

In [41]:
female_scores=[25,30,45,49,47,35,32,42]

male_scores=[45,47,25,22,29,32,27,28,40,49,50,33]

In [42]:
ttest_ind(female_scores, male_scores, alternative="less")

Ttest_indResult(statistic=0.5795450171026676, pvalue=0.5694047618891788)

In [47]:
wt_before=[85, 74, 63.5, 69.4, 71.6, 65,90,78]

wt_after=[82, 71, 64, 65.2, 67.8, 64.7,95,77]

ttest_rel(wt_after, wt_before, alternative="less")

Ttest_relResult(statistic=-1.142185379355503, pvalue=0.14546808501326391)