In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from itertools import combinations

In [69]:
data = pd.read_csv('churn_analysis.csv', index_col=0)
pd.set_option('display.max_columns', None)
data.shape

(3333, 22)

In [70]:
data.head()

Unnamed: 0,state,account_length,area_code,intl_plan,vmail_plan,vmail_message,day_mins,day_calls,day_charge,eve_mins,eve_calls,eve_charge,night_mins,night_calls,night_charge,intl_mins,intl_calls,intl_charge,custserv_calls,treatment,mes_estim,churn
0,KS,128,415,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,1,0.65,False.
1,OH,107,415,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0,0.55,False.
2,NJ,137,415,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0,0.72,False.
3,OH,84,408,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,1,0.28,False.
4,OK,75,415,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,2,0.45,False.


In [71]:
data.churn.loc[data.churn == "True."] = True
data.churn.loc[data.churn == "False."] = False

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [78]:
pt = data[data.treatment == 1].pivot_table(values='treatment', columns='churn', index='state', aggfunc=len, fill_value=0)
pt.head()

churn,False,True
state,Unnamed: 1_level_1,Unnamed: 2_level_1
AK,19,1
AL,25,5
AR,11,5
AZ,17,2
CA,10,5


In [80]:
chi2_val = pd.DataFrame(columns=['state_1', 'state_2', 'chi2_stat', 'p_value'])
for i, (st1, st2) in enumerate(combinations(pt.index, 2)):
    table = pt.loc[[st1, st2], :]
    chi2_val.loc[i, 'state_1'] = st1
    chi2_val.loc[i, 'state_2'] = st2
    stat = stats.chi2_contingency(table.values, correction=False)
    chi2_val.loc[i, 'chi2_stat'] = stat[0]
    chi2_val.loc[i, 'p_value'] = stat[1]

In [81]:
len(chi2_val[chi2_val.p_value < 0.05]) #1

34

In [88]:
chi2_val.head()

Unnamed: 0,state_1,state_2,chi2_stat,p_value
0,AK,AL,1.54672,0.213621
1,AK,AR,4.41,0.0357288
2,AK,AZ,0.419079,0.517397
3,AK,CA,4.84435,0.0277369
4,AK,CO,2.68864,0.101066


In [91]:
chi2_val_corr = pd.DataFrame(columns=['state_1', 'state_2', 'chi2_stat', 'p_value'])
for i, (st1, st2) in enumerate(combinations(pt.index, 2)):
    table = pt.loc[[st1, st2], :]
    chi2_val_corr.loc[i, 'state_1'] = st1
    chi2_val_corr.loc[i, 'state_2'] = st2
    stat = stats.chi2_contingency(table.values, correction=True)
    chi2_val_corr.loc[i, 'chi2_stat'] = stat[0]
    chi2_val_corr.loc[i, 'p_value'] = stat[1]

In [86]:
chi2_val_corr.head()

Unnamed: 0,state_1,state_2,chi2_stat,p_value
0,AK,AL,0.639205,0.423999
1,AK,AR,2.7225,0.0989429
2,AK,AZ,0.00213816,0.963119
3,AK,CA,3.05496,0.0804911
4,AK,CO,1.4358,0.230821


In [87]:
len(chi2_val_corr[chi2_val_corr.p_value < 0.05]) #3

0

In [92]:
fisher_val = pd.DataFrame(columns=['state_1', 'state_2', 'chi2_stat', 'p_value'])
for i, (st1, st2) in enumerate(combinations(pt.index, 2)):
    table = pt.loc[[st1, st2], :]
    fisher_val.loc[i, 'state_1'] = st1
    fisher_val.loc[i, 'state_2'] = st2
    stat = stats.fisher_exact(table)
    fisher_val.loc[i, 'chi2_stat'] = stat[0]
    fisher_val.loc[i, 'p_value'] = stat[1]

In [94]:
fisher_val.p_value.mean()

0.6483383060020687

In [95]:
chi2_val_corr.p_value.mean()

0.6640566382051045

In [96]:
chi2_val.p_value.mean()

0.5018273798739152

In [98]:
data[['day_calls', 'mes_estim']].corr()

Unnamed: 0,day_calls,mes_estim
day_calls,1.0,-0.051794
mes_estim,-0.051794,1.0


In [99]:
stats.pearsonr(data.day_calls, data.mes_estim)

(-0.051794350587572625, 0.0027798836869756707)

In [101]:
data[['day_calls', 'mes_estim']].corr(method='spearman')

Unnamed: 0,day_calls,mes_estim
day_calls,1.0,0.04335
mes_estim,0.04335,1.0


In [100]:
stats.spearmanr(data.day_calls, data.mes_estim)

SpearmanrResult(correlation=0.043349880533927444, pvalue=0.012317367189170541)

In [102]:
def Cramers_stat(confusion_matrix):
    chi2 = stats.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    return np.sqrt(chi2 / (n*(min(confusion_matrix.shape)-1)))

In [105]:
Cramers_stat(pt.values)

0.2003932150203332