# Predictive Analytics
# Module 3 - Inferential Statistics
## Demo 5 - Null Hypothesis, Z-Statistic & t-Tests

In [1]:
import pandas as pd
import numpy as np

### Loading sample dataset

In [2]:
from sklearn.datasets import load_boston
from sklearn.utils import shuffle

boston = load_boston()
X, y = shuffle(boston.data, boston.target, random_state=13)

In [3]:
df = pd.DataFrame(boston.data,columns= boston['feature_names'])
df['target'] = boston.target
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


### Check if the sample mean differs from the population mean?

In [4]:
# Population mean and population standard deviation
population_target = df.target
mu = population_target.mean()
sigma = population_target.std()
print("Population Mean:", mu,"\nPopulation std:", sigma)

Population Mean: 22.532806324110677 
Population std: 9.197104087379818


In [5]:
# Drawing sample of 100
sample_size = 100
sample = df.sample(n=sample_size, random_state=4).target
sample_mu = sample.mean()
sample_sigma = sample.std()
print("Sample Mean:", sample_mu,"\nSample std:", sample_sigma)

Sample Mean: 23.611 
Sample std: 9.668514922576312


### Perform Null Hypothesis using Z-statistic

In [6]:
"""
Null Hypothesis: population mean = sample mean
"""
import math
from scipy import stats

#margin_of_error = z_critical * (pop_stdev/math.sqrt(sample_size))
z_critical = stats.norm.ppf(q = 0.975) #1.96
N = 100
SE = sigma/np.sqrt(N)
z_stat = (sample_mu - mu)/SE
print("Z-Statistic: ", z_stat)

print("Z-Critical: ", z_critical)


Z-Statistic:  1.1723186620980088
Z-Critical:  1.959963984540054


### One sample t-Test

#### t-Test at 95% confidence level?

In [7]:
stats.ttest_1samp(a= sample,               # Sample data
                 popmean= population_target.mean())  # Pop mean

Ttest_1sampResult(statistic=1.1151595508961825, pvalue=0.2674817704191442)

##### Check if a 95% confidence interval will capture the population mean of 22.53? 

In [8]:
sigma = sample.std()/math.sqrt(100)  # Sample stdev/sample size

stats.t.interval(0.95,                        # Confidence level
                 df = 49,                     # Degrees of freedom
                 loc = sample.mean(), # Sample mean
                 scale= sigma)           

(21.66803918573013, 25.55396081426987)

## Two sample t-Test
### Check if the mean of two independent data samples differ from one another?

In [9]:
# Drawing sample of 100
sample2 = df.sample(n=100, random_state=88).target
sample_mu2 = sample2.mean()
sample_sigma2 = sample2.std()
print("Sample Mean2:", sample_mu2,"\nSample std2:", sample_sigma2)

Sample Mean2: 22.099000000000004 
Sample std2: 8.587005628073483


In [10]:

print(stats.ttest_ind(a= sample,
                b= sample2,
                equal_var=False))  

"""
There is a 24% chance we'd see sample data this far apart 
if the two groups tested are actually identical
"""

Ttest_indResult(statistic=1.169261841237422, pvalue=0.24372341056286792)


"\nThere is a 24% chance we'd see sample data this far apart \nif the two groups tested are actually identical\n"