<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Frequentist Hypothesis Testing Lab


---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Enlarge seaborn's default font scaling for every plot drawn in this notebook.
sns.set(font_scale=1.5)

# IPython display settings: request the 'retina' figure format for inline
# figures and render matplotlib output inside the notebook.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
import scipy.stats as stats

## Load the Titanic data

In [3]:
# Location of the cleaned Titanic passenger table, relative to this notebook.
titanic_csv_path = '../../../../resource-datasets/titanic/titanic_clean.csv'
df = pd.read_csv(titanic_csv_path)

In [4]:
# Preview the first five passengers to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


## Use a t-test to assess whether the survival rate was the same among male and female passengers

In [5]:
# Per-sex summary of the Survived indicator: its mean (the survival rate),
# standard deviation and the number of passengers in each group.
survived_grouped_by_sex = df.groupby('Sex')[['Survived']]
survived_grouped_by_sex.agg(['mean', 'std', 'count'])

Unnamed: 0_level_0,Survived,Survived,Survived
Unnamed: 0_level_1,mean,std,count
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.752896,0.432163,259
male,0.205298,0.404366,453


In [6]:
# Two-sample t-test (default pooled-variance form) comparing the Survived
# indicator of male passengers against that of female passengers.
# The male sample is passed first, so a negative statistic means the male
# mean is the lower of the two.
male_survived = df.loc[df['Sex'] == 'male', 'Survived']
female_survived = df.loc[df['Sex'] == 'female', 'Survived']
stats.ttest_ind(male_survived, female_survived)

Ttest_indResult(statistic=-16.95138209331607, pvalue=2.2428516419834136e-54)

In [7]:
# Same male-vs-female comparison, but with equal_var=False so the two groups
# are not assumed to share a common variance (Welch's t-test).
survived_for_sex = {
    sex: df.loc[df['Sex'] == sex, 'Survived'] for sex in ('male', 'female')
}
stats.ttest_ind(survived_for_sex['male'], survived_for_sex['female'], equal_var=False)

Ttest_indResult(statistic=-16.64705581733569, pvalue=5.545890959911458e-50)

## Use a t-test to assess whether the mean age (and, likewise, the mean fare) was the same among survivors and non-survivors

In [8]:
# Welch's t-test on Age: passengers with Survived == 0 (first sample)
# versus passengers with Survived == 1 (second sample).
age_not_survived = df.loc[df['Survived'] == 0, 'Age']
age_survived = df.loc[df['Survived'] == 1, 'Age']
stats.ttest_ind(age_not_survived, age_survived, equal_var=False)

Ttest_indResult(statistic=2.1844862149398256, pvalue=0.02931497754601072)

In [9]:
# Welch's t-test on the Fare column, split the same way:
# Survived == 0 is the first sample, Survived == 1 the second.
fare_by_outcome = {
    outcome: df.loc[df['Survived'] == outcome, 'Fare'] for outcome in (0, 1)
}
stats.ttest_ind(fare_by_outcome[0], fare_by_outcome[1], equal_var=False)

Ttest_indResult(statistic=-6.4669402027739755, pvalue=3.214223132961829e-10)

## Test for equality of survival rate in different passenger classes or ports of embarkation

In [10]:
# One-way ANOVA on the Survived indicator with one sample per passenger class.
# The samples are built in the order the class labels first appear in the data.
class_labels = df['Pclass'].unique()
survival_by_class = [
    df.loc[df['Pclass'] == label, 'Survived'] for label in class_labels
]
stats.f_oneway(*survival_by_class)

F_onewayResult(statistic=52.000520262229195, pvalue=8.448797786668182e-22)

In [11]:
# One-way ANOVA on the Survived indicator with one sample per Embarked value.
port_labels = df['Embarked'].unique()
survival_by_port = [
    df.loc[df['Embarked'] == label, 'Survived'] for label in port_labels
]
stats.f_oneway(*survival_by_port)

F_onewayResult(statistic=14.467835666097676, pvalue=6.942734772300874e-07)

## Use a chi-squared test to check whether the proportions of male/female passengers are the same among survivors and non-survivors

Find other groups whose proportions you could compare.

In [12]:
# Contingency table of counts: rows are Survived (0/1), columns are Sex.
pd.crosstab(index=df['Survived'], columns=df['Sex'])

Sex,female,male
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1
0,64,360
1,195,93


In [13]:
# Chi-squared test of independence between Survived and Sex.
# The result holds the test statistic, the p-value, the degrees of freedom
# and the expected counts under independence.
# NOTE: this is a 2x2 table (one degree of freedom), for which SciPy applies
# Yates' continuity correction by default.
survived_by_sex_counts = pd.crosstab(index=df['Survived'], columns=df['Sex'])
stats.chi2_contingency(survived_by_sex_counts)

(202.86944877617123,
 4.939416685451492e-46,
 1,
 array([[154.23595506, 269.76404494],
        [104.76404494, 183.23595506]]))

In [14]:
# Contingency table of counts: rows are Survived (0/1), columns are Pclass.
pd.crosstab(index=df['Survived'], columns=df['Pclass'])

Pclass,1,2,3
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,64,90,270
1,120,83,85


In [15]:
# Chi-squared test of independence between Survived and Pclass.
# The result holds the test statistic, the p-value, the degrees of freedom
# and the expected counts under independence.
survived_by_class_counts = pd.crosstab(index=df['Survived'], columns=df['Pclass'])
stats.chi2_contingency(survived_by_class_counts)

(91.08074548791019,
 1.6675060315554636e-20,
 2,
 array([[109.57303371, 103.02247191, 211.40449438],
        [ 74.42696629,  69.97752809, 143.59550562]]))

In [16]:
# Contingency table of counts: rows are Embarked, columns are Pclass.
pd.crosstab(index=df['Embarked'], columns=df['Pclass'])

Pclass,1,2,3
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C,74,15,41
Q,2,2,24
S,108,156,290


In [17]:
# Chi-squared test of independence between Embarked and Pclass.
# The result holds the test statistic, the p-value, the degrees of freedom
# and the expected counts under independence.
class_by_port_counts = pd.crosstab(index=df['Embarked'], columns=df['Pclass'])
stats.chi2_contingency(class_by_port_counts)

(93.18034874220332,
 2.7775485300373503e-19,
 4,
 array([[ 33.59550562,  31.58707865,  64.81741573],
        [  7.23595506,   6.80337079,  13.96067416],
        [143.16853933, 134.60955056, 276.22191011]]))

In [18]:
# Contingency table of counts: rows are Embarked, columns are Sex.
pd.crosstab(index=df['Embarked'], columns=df['Sex'])

Sex,female,male
Embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,61,69
Q,12,16
S,186,368


In [19]:
# Chi-squared test of independence between Embarked and Sex.
# The result holds the test statistic, the p-value, the degrees of freedom
# and the expected counts under independence.
sex_by_port_counts = pd.crosstab(index=df['Embarked'], columns=df['Sex'])
stats.chi2_contingency(sex_by_port_counts)

(8.635939718372466,
 0.013326911604945089,
 2,
 array([[ 47.28932584,  82.71067416],
        [ 10.18539326,  17.81460674],
        [201.5252809 , 352.4747191 ]]))

In [20]:
# Contingency table of counts: rows are Pclass, columns are Sex.
pd.crosstab(index=df['Pclass'], columns=df['Sex'])

Sex,female,male
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,83,101
2,74,99
3,102,253


In [21]:
# Chi-squared test of independence between Pclass and Sex.
# The result holds the test statistic, the p-value, the degrees of freedom
# and the expected counts under independence.
sex_by_class_counts = pd.crosstab(index=df['Pclass'], columns=df['Sex'])
stats.chi2_contingency(sex_by_class_counts)

(18.08484036123574,
 0.00011828422058509222,
 2,
 array([[ 66.93258427, 117.06741573],
        [ 62.93117978, 110.06882022],
        [129.13623596, 225.86376404]]))