In [45]:
from __future__ import division
import pandas as pd
import numpy as np 
import matplotlib.pylab as plt
from code.organize import *
from code.roughEDA import *
from code.survey_processor_full import *
from code.model_vis import *
from code.fc import *
from collections import Counter,defaultdict,OrderedDict

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix,roc_curve
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA

from treeinterpreter import treeinterpreter as ti
import matplotlib.cm as cm
import cPickle as pickle
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Feature Contribution Analysis

In this section, we analyse how each feature contributes to the random forest's decision making.
## 1. Load feature contribution matrix and group by cluster id

In [2]:
# Load the stored feature-contribution frame. Later cells read its 'cluster'
# and 'employed' columns and treat its first 31 columns as the feature set.
# NOTE(review): unpickling can execute arbitrary code - only load a trusted file.
fc_df = pd.read_pickle('data/fc_df.pkl')

In [3]:
# Per-cluster mean of the 'employed' column and the number of rows per cluster.
summary = fc_df.groupby('cluster')['employed'].agg(['mean', 'size'])
summary.columns = ['employment_rate', 'sample_size']
summary

Unnamed: 0_level_0,employment_rate,sample_size
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.067839,398
1,0.05848,684
2,0.927992,2097
3,0.077778,360
4,0.89521,668
5,0.889313,262


**Clusters 0, 1 and 3** are unemployed groups; **clusters 2, 4 and 5** are employed groups.


## 2.  Load original survey data

In [4]:
# Build the project's survey object from 'data/survey2' and call its
# num_processor() step. Later cells read sv.fin_data, sv.data, sv.num_cols,
# sv.sv_cols, sv.bs and sv.bs_flag.
# NOTE(review): `survey` arrives through a wildcard import (presumably
# code.survey_processor_full) and what num_processor() populates is not
# visible in this notebook - confirm against that module.
sv = survey('data/survey2')
sv.num_processor()

In [5]:
# Put the processed numeric columns, the raw survey columns and the employment
# flag side by side in one frame.
df = pd.concat(
    [sv.fin_data[sv.num_cols], sv.data[sv.sv_cols], sv.fin_data.employed],
    axis=1,
)

# Keep only the columns named by the first 31 columns of fc_df.
f31 = fc_df.columns[:31]
df31 = df[f31]
l = list(df31.columns) + ['cluster']

# Attach the cluster ids positionally: this relies on fc_df and the survey frame
# having their rows in the same order, which is not checked here. The stacked
# array has a single dtype, so numeric columns are cast back to float later.
new = np.column_stack((df31.values, fc_df.cluster))
df = pd.DataFrame(new, columns=l)

### Group by cluster. Take medians for numeric variables. Take mode for categorical variables.

In [12]:
# Restore a float dtype on the numeric survey columns before aggregating them.
df[sv.num_cols] = df[sv.num_cols].astype(float)

In [17]:
# Median of every numeric column within each cluster.
df_num = df.groupby('cluster')[sv.num_cols].agg(['median'])

In [18]:
# Lookup table pairing each entry of sv.bs_flag with the matching entry of sv.bs.
bs_dict = dict(zip(sv.bs_flag, sv.bs))

In [19]:
# Overwrite selected medians with the label that bs_dict holds for codes 98 and 8.
# Positions are hard-coded: column 0 at row 0 and at rows 3 onward, and column 3
# at row 1. In the displayed table these are 'sphrs1' ("Not applicable") and
# 'chldidel' ("As many as want").
# NOTE(review): presumably the medians at these positions were the sentinel
# codes themselves - verify, and re-check the positions if the clustering or
# the column order changes. Writing strings here leaves those columns mixed-type.
df_num.iloc[0,0] = bs_dict[98]
df_num.iloc[3:,0] = bs_dict[98]
df_num.iloc[1,3] = bs_dict[8]

In [20]:
# Show the per-cluster median table.
df_num

Unnamed: 0_level_0,sphrs1,age,agekdbrn,chldidel,coninc,educom
Unnamed: 0_level_1,median,median,median,median,median,median
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
0,Not applicable,34,22,4.5,6633.5,12.333333
1,51,35,25,As many as want,51705.0,12.666667
2,60,36,25,4,49882.5,12.333333
3,Not applicable,31,19,5,8313.75,12.0
4,Not applicable,33,22,4,9486.5625,12.333333
5,Not applicable,34,19,4,18745.0,11.666667


In [21]:
# Categorical columns: every column of df that is neither one of the numeric
# survey columns nor the cluster id.
# Built by filtering df.columns instead of taking a set difference, so the
# result follows the column order of df and is the same on every run. Set
# iteration order is arbitrary, which made the row order of the per-cluster
# tables derived from cat_cols unpredictable.
_non_categorical = set(sv.num_cols) | {'cluster'}
cat_cols = [col for col in df.columns if col not in _non_categorical]

## Unemployed Clusters

### Cluster 0

In [22]:
# Feature-contribution rows that belong to cluster 0.
c0 = fc_df.loc[fc_df['cluster'] == 0]

In [23]:
# Call the project helper on the cluster-0 contributions with an argument of 10.
# The displayed result is a list of ten (mean contribution, feature name) pairs.
# NOTE(review): the ordering looks like descending absolute mean - confirm in
# the helper's source, which is not visible in this notebook.
sort_feature_means(c0,10)

[(-0.10317363862869754, 'coninc'),
 (-0.03392890421279629, 'age'),
 (-0.027697984375685748, 'class_'),
 (-0.023296867668049392, 'agekdbrn'),
 (-0.019737805836251204, 'polviews'),
 (-0.019515172650691602, 'educom'),
 (0.018331218255143, 'degree'),
 (-0.01765954711605959, 'finrela'),
 (-0.016752126734590513, 'partyid'),
 (-0.014696917162165184, 'dwelling')]

In [24]:
# Survey rows that belong to cluster 0.
df_c0 = df.loc[df['cluster'] == 0]

In [77]:
# Apply the project helper rank_1st to every categorical column of cluster 0
# (judging by the displayed table: the most common answer and its share).
a = df_c0[cat_cols].apply(rank_1st)

In [78]:
# Apply the project helper rank_2nd to every categorical column of cluster 0
# (judging by the displayed table: the runner-up answer and its share).
b = df_c0[cat_cols].apply(rank_2nd)

In [95]:
# Place the two per-column results side by side; concat already returns a
# DataFrame, so no extra wrapping is needed.
d = pd.concat([a, b], axis=1)

In [97]:
# Label the two result columns.
# NOTE(review): the trailing '/' in 'second/' looks like a typo. Left unchanged
# because the displayed output, and possibly later cells, use this exact label.
d.columns = ['first','second/']

In [98]:
# Show the two result columns for each categorical variable in cluster 0.
d

Unnamed: 0,first,second/
fechld,"(Not applicable, 0.37)","(Agree, 0.3)"
natrace,"(Not applicable, 0.54)","(Too little, 0.19)"
socfrend,"(Not applicable, 0.39)","(Sev times a week, 0.14)"
reliten,"(Not very strong, 0.41)","(Strong, 0.32)"
res16,"(Town lt 50000, 0.32)","(50000 to 250000, 0.18)"
divorce,"(Not applicable, 0.62)","(No, 0.31)"
polviews,"(Moderate, 0.36)","(Slghtly conservative, 0.12)"
fefam,"(Not applicable, 0.37)","(Disagree, 0.29)"
socbar,"(Not applicable, 0.39)","(Never, 0.33)"
partyid,"(Independent, 0.29)","(Not str democrat, 0.21)"
