<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Frequentist Hypothesis Testing Lab


---

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(font_scale=1.5)

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
import scipy.stats as stats

## Load the titanic data

In [3]:
df = pd.read_csv('../../../../resource-datasets/titanic/titanic_clean.csv')

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


## Use a t-test to assess if the survival was the same among male and female passengers

In [5]:
stats.ttest_ind(df.Survived[df.Sex == 'male'], df.Survived[df.Sex == 'female'], equal_var=False)

Ttest_indResult(statistic=-16.64705581733569, pvalue=5.545890959911458e-50)

## Use a t-test to assess if the mean age was the same among survivors and non-survivors

In [9]:
stats.ttest_ind(df.Age[df.Survived == 0], df.Age[df.Survived == 1], equal_var=False)

Ttest_indResult(statistic=2.1844862149398256, pvalue=0.02931497754601072)

In [10]:
stats.ttest_ind(df.Age[df.Survived == 0], df.Age[df.Survived == 1], equal_var=False)[1]

0.02931497754601072

## Test for equality of survival rate in different passenger classes or ports of embarkment

In [7]:
stats.f_oneway(*[df.Survived[df.Pclass == i] for i in [1, 2, 3]])

F_onewayResult(statistic=52.000520262229195, pvalue=8.448797786668182e-22)

In [11]:
stats.f_oneway(*[df.Survived[df.Embarked == i] for i in list(df.Embarked.unique())])

F_onewayResult(statistic=14.467835666097676, pvalue=6.942734772300874e-07)

## Use a chi2 test to test if the proportions of male/female are the same in survived/not survived

Find other groups whose proportions you could compare.

In [27]:
# the observed breakdown:
cross_tab = pd.crosstab(df.Sex, df.Survived)
cross_tab

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,64,195
male,360,93


In [37]:
stats.chi2_contingency(cross_tab)

# this gives the 'expected' table as well, had the the two variables been independent of each other, ie.
# np.outer() multiplication (see cell below)

(202.86944877617123,
 4.939416685451492e-46,
 1,
 array([[154.23595506, 104.76404494],
        [269.76404494, 183.23595506]]))

In [36]:
# manually calculating the expected table:
sex_counts = df.Sex.value_counts(normalize=True, sort=False)
survived_counts = df.Survived.value_counts(normalize=True, sort=False)
contingency_table = np.outer(sex_counts,survived_counts)*len(df)
contingency_table

array([[269.76404494, 183.23595506],
       [154.23595506, 104.76404494]])