In [106]:
from __future__ import division
import pandas as pd
import numpy as np 
import matplotlib.pylab as plt
from code.organize import *
from code.roughEDA import *
from code.survey_processor_full import *
from code.model_vis import *
from code.fc import *
from collections import Counter,defaultdict,OrderedDict

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix,roc_curve
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA

from treeinterpreter import treeinterpreter as ti
import matplotlib.cm as cm
import cPickle as pickle
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Feature Contribution Analysis

In this session, how does each feature contribute to random forest decision making will be analysed. 
## 1. Load feature contribution matrix and group by cluster id

In [2]:
fc_df = pd.read_pickle('data/fc_df.pkl')

In [3]:
summary = fc_df.groupby('cluster')['employed'].agg([np.mean,np.size])
summary.columns = ['employment_rate','sample_size']
summary

Unnamed: 0_level_0,employment_rate,sample_size
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.067839,398
1,0.05848,684
2,0.927992,2097
3,0.077778,360
4,0.89521,668
5,0.889313,262


**Cluster 0,1,3** are unemployed groups. **Cluster 2,4,5** are employed groups. 


## 2.  Load original survey data

In [20]:
sv = survey('data/survey2')
sv.num_processor()

In [5]:
df = pd.concat([sv.fin_data[sv.num_cols],sv.data[sv.sv_cols],sv.fin_data.employed],axis =1)

f31 = fc_df.columns[:31]
df31= df[f31]
l = list(df31.columns)
l.append('cluster')

new = np.column_stack((df31.values,fc_df.cluster))
df = pd.DataFrame(new,columns=l)

### Group by cluster. Take medians for numeric variables. Take mode for categorical variables.

In [23]:
df_num = df.groupby('cluster')[sv.num_cols].agg([np.median])

In [25]:
bs_dict = {k:v for (k,v) in zip(sv.bs_flag,sv.bs)}

In [61]:
df_num.iloc[0,0] = bs_dict[98]
df_num.iloc[3:,0] = bs_dict[98]
df_num.iloc[1,3] = bs_dict[8]

In [62]:
df_num

Unnamed: 0_level_0,sphrs1,age,agekdbrn,chldidel,coninc,educom
Unnamed: 0_level_1,median,median,median,median,median,median
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,Not applicable,34,22,4.5,6633.5,12.333333
1,51,35,25,As many as want,51705.0,12.666667
2,60,36,25,4,49882.5,12.333333
3,Not applicable,31,19,5,8313.75,12.0
4,Not applicable,33,22,4,9486.5625,12.333333
5,Not applicable,34,19,4,18745.0,11.666667


In [237]:
cat_cols = list(set(df.columns) - set(sv.num_cols) - set(['cluster']))

In [102]:
from scipy.stats.mstats import mode
df.groupby('cluster')[cat_cols]

<pandas.core.groupby.DataFrameGroupBy object at 0x10b1d3b90>

In [103]:
Counter(df.divorce)

Counter({u'No': 2113, u'No answer': 8, u'Not applicable': 1840, u'Yes': 508})

## Unemployed Clusters

### Cluster 0

In [6]:
c0 = fc_df[fc_df.cluster == 0]

In [7]:
sort_feature_means(c0,10)

[(-0.10317363862869754, 'coninc'),
 (-0.03392890421279629, 'age'),
 (-0.027697984375685748, 'class_'),
 (-0.023296867668049392, 'agekdbrn'),
 (-0.019737805836251204, 'polviews'),
 (-0.019515172650691602, 'educom'),
 (0.018331218255143, 'degree'),
 (-0.01765954711605959, 'finrela'),
 (-0.016752126734590513, 'partyid'),
 (-0.014696917162165184, 'dwelling')]

In [125]:
df_c0 = df[df.cluster ==0]

In [309]:
for c in cat_cols:
    print c
    print '-'*10
    print df_c0[c].value_counts()/df_c0.shape[0]
    print "*"*20

fechld
----------
Not applicable       0.371859
Agree                0.304020
Strongly agree       0.158291
Disagree             0.138191
Strongly disagree    0.025126
Don't know           0.002513
Name: fechld, dtype: float64
********************
natrace
----------
Not applicable    0.537688
Too little        0.193467
About right       0.158291
Don't know        0.050251
Too much          0.047739
No answer         0.012563
Name: natrace, dtype: float64
********************
socfrend
----------
Not applicable      0.391960
Sev times a week    0.143216
Once a month        0.110553
Sev times a year    0.092965
Sev times a mnth    0.092965
Never               0.085427
Once a year         0.065327
Almost daily        0.017588
Name: socfrend, dtype: float64
********************
reliten
----------
Not very strong    0.409548
Strong             0.319095
No religion        0.123116
Somewhat strong    0.118090
No answer          0.030151
Name: reliten, dtype: float64
********************
res16


In [313]:
df_num.loc[0,:]

sphrs1    median    Not applicable
age       median                34
agekdbrn  median                22
chldidel  median               4.5
coninc    median            6633.5
educom    median           12.3333
Name: 0, dtype: object