In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Chi squared

In [None]:
# Load the tab-delimited Titanic passenger file from the working directory.
titanic = pd.read_csv('titanic.txt', delimiter='\t')

The chi-square test of independence compares the observed co-occurrence counts of two categorical variables with the counts we would expect if the variables were independent.

Going back to the Titanic data, we can see the observed co-occurrence counts of `Sex` and `Survived` in a *contingency table* (what `pandas` calls a `crosstab`):

In [None]:
# Observed counts of Sex x Survived; margins=True appends the "All" totals row and column.
pd.crosstab(index=titanic['Sex'], columns=titanic['Survived'], margins=True)

We can pass that table to the `contingency.expected_freq` function from `scipy.stats` to see what numbers we'd expect if the two variables were independent:

In [None]:
# Expected counts under independence, computed from the same
# Sex x Survived table (totals included) that was displayed above.
stats.contingency.expected_freq(
    pd.crosstab(index=titanic['Sex'], columns=titanic['Survived'], margins=True)
)


So it certainly looks like there's something going on. We can pass that crosstab into `chi2_contingency` to carry out the hypothesis test with:

$H_0$: The variables `Sex` and `Survived` are independent

$H_1$: There is an association between `Sex` and `Survived`

In [None]:
# Run the test on the raw observed counts only. The crosstab is built WITHOUT
# margins=True: the "All" totals row/column would otherwise be treated as an
# extra category, and because the degrees of freedom are derived from the
# table shape ((rows - 1) * (cols - 1)) they would be inflated, distorting the
# p-value. With the totals excluded, SciPy's default continuity correction is
# also applied whenever the table has one degree of freedom.
titanic_chi2 = stats.chi2_contingency(pd.crosstab(titanic.Sex, titanic.Survived))
titanic_chi2

The result contains four items: the first is the chi-square statistic, the second is the p-value, the third is the number of degrees of freedom, and the fourth is the expected contingency table if the null hypothesis were true.

The high chi square statistic and very low p value strongly suggest that these two variables are *not* independent.

# t tests

In [None]:
from ipywidgets import interact, IntSlider, FloatSlider

In [None]:
def update(n=30, meanA=50, stdA=1, meanB=50, stdB=1, alpha=0.05):
    """Draw two random normal samples, plot them, and print a t-test verdict.

    Parameters
    ----------
    n : int
        Number of observations drawn for each group.
    meanA, stdA : float
        Mean and standard deviation passed to ``np.random.normal`` for group A.
    meanB, stdB : float
        Mean and standard deviation passed to ``np.random.normal`` for group B.
    alpha : float
        Significance level; H0 is rejected when the p-value is <= alpha.

    Prints the two sample means (rounded to 2 decimals), the t statistic, the
    p-value compared against ``alpha``, and the resulting decision, then shows
    the plot. Returns None. The random draws are not seeded, so every call
    produces different data.
    """
    # generate two sets of normally distributed data
    groupA = np.random.normal(meanA, stdA, n)
    groupB = np.random.normal(meanB, stdB, n)

    # Plot both samples on one explicit Axes. histplot(kde=True, stat='density')
    # is the replacement for the deprecated sns.distplot; explicit colours and
    # labels make the two overlaid groups distinguishable.
    _, ax = plt.subplots()
    for sample, label, color in ((groupA, 'group A', 'C0'), (groupB, 'group B', 'C1')):
        sns.histplot(sample, kde=True, stat='density', color=color, label=label, ax=ax)
    ax.set(xlabel='value', ylabel='density')
    ax.legend()

    # apply an independent t-test; equal_var=False does not assume the two
    # groups share a variance
    ttest_result = stats.ttest_ind(groupA, groupB, equal_var=False)

    s = '''
    meanA = {}
    meanB = {}
    H0: meanA = meanB
    H1: meanA <> meanB
    t = {}
    '''
    if ttest_result.pvalue <= alpha:
        s += '''
        p = {} <= {}
        Reject H0 at the {} significance level
        '''
    else:
        s += '''
        p = {} > {}
        Fail to reject H0 at the {} significance level
        '''
    print(s.format(groupA.mean().round(2), groupB.mean().round(2),
                   ttest_result.statistic, ttest_result.pvalue, alpha, alpha))
    plt.show()
# Bind every parameter of `update` to a slider. The five integer-valued
# sliders share the same step and update policy, so they are generated from a
# (initial value, min, max) table; alpha gets its own float slider.
# continuous_update=False is set on each slider (per the ipywidgets docs this
# defers the callback until the slider is released).
interact(
    update,
    **{
        param: IntSlider(value=start, min=low, max=high, step=1, continuous_update=False)
        for param, (start, low, high) in {
            'n': (30, 3, 100),
            'meanA': (50, 10, 100),
            'stdA': (1, 1, 10),
            'meanB': (50, 10, 100),
            'stdB': (1, 1, 10),
        }.items()
    },
    alpha=FloatSlider(value=0.05, min=0.01, max=0.1, step=0.01, continuous_update=False),
);