In [1]:
from __future__ import division
import pandas as pd
import numpy as np 
import matplotlib.pylab as plt
import seaborn as sns

from code.organize import *
from code.roughEDA import *
from code.survey_processor_full import *
from code.model_vis import *
from code.fc import *
from code.fc_cluster import *
from collections import Counter,defaultdict,OrderedDict

from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix,roc_curve
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

from treeinterpreter import treeinterpreter as ti
import matplotlib.cm as cm
import cPickle as pickle
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Feature Contribution Analysis

In this notebook, the decision paths behind the random forest's predictions are analysed. 
## 1. Load feature contribution matrix and group by cluster id

In [2]:
# Load the pickled feature-contribution matrix and run the project's cluster processing on it.
# NOTE(review): the meaning of the arguments (3, 6) is not visible here; the summary
# below shows 6 clusters, so 6 is presumably the cluster count — confirm.
fc = FeatureContribution('data/fc_df2.pkl')
fc_df = fc.cluster_processor(3,6)

In [3]:
# Per-cluster mean of `employed` (the employment rate) and number of samples.
summary = (
    fc_df.groupby('cluster')['employed']
    .agg([np.mean, np.size])
)
summary.columns = ['employment_rate', 'sample_size']
summary

Unnamed: 0_level_0,employment_rate,sample_size
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.924484,2132
1,0.082192,365
2,0.040498,642
3,0.900459,653
4,0.895753,259
5,0.059809,418


**Clusters 1, 2 and 5** are unemployed groups. **Clusters 0, 3 and 4** are employed groups. 


## 2.  Load original survey data

In [4]:
# Load the original survey data and run its numerical processing step.
# NOTE(review): num_processor() presumably populates sv.fin_data / sv.num_cols,
# which the cells below read — confirm against the survey class.
sv = survey('data/survey2')
sv.num_processor()

In [5]:
# Load survey data, combining the numerical answers, the categorical answers
# and the `employed` column side by side.
df = pd.concat(
    [sv.fin_data[sv.num_cols], sv.data[sv.sv_cols], sv.fin_data.employed],
    axis=1)

# Column labels: the 40 important features followed by the cluster id.
cols = list(fc.features) + ['cluster']

# Subset to the 40 important features.
df = df[fc.features]

# Append the cluster id as the last column.
# NOTE(review): np.column_stack goes through a plain array, so the original
# index is not carried over — confirm downstream code does not rely on it.
df = pd.DataFrame(np.column_stack((df, fc.cluster_id)), columns=cols)

About half of the respondents did not answer the questions on spouse working hours or ideal number of kids. Taking a mean or median without first removing the no-answer flag would not be accurate. Therefore, I'll remove the respondents who didn't answer these two questions and then take the median values. 

Age, agekdbrn and coninc are also summarised by their median values. Educom uses the mean because it separates the clusters further. 

In [6]:
# Per-cluster summary of the numerical features (displayed in the next cell).
# NOTE(review): df_num is a project helper; per the notes above it presumably drops
# unanswered sphrs1/chldidel before taking medians and uses the mean for educom — confirm.
dfn = df_num(df)

In [7]:
# Display the per-cluster numerical summary.
dfn

Unnamed: 0_level_0,sphrs1,chldidel,age,agekdbrn,coninc,educom
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,42,2.0,36,25,49882.5,12.62
1,40,3.0,31,19,8617.5,10.8
2,50,2.5,35,25,52811.0,12.94
3,40,2.0,33,21,9033.75,12.28
4,40,3.0,34,19,18745.0,10.44
5,40,3.0,34,22,7124.0,12.08


**Categorical variables**

Many of the mode answers are "Not applicable". When "Not applicable" shows up, the second most common answer is used instead.

In [8]:
# Categorical columns: every column of df that is neither a numerical survey
# column nor the cluster id.
# Built by walking df.columns so the result follows df's own column order;
# a set difference would return the names in an arbitrary, hash-dependent order,
# which then leaks into the column order of every summary built from cat_cols.
non_categorical = set(sv.num_cols) | set(['cluster'])
cat_cols = [col for col in df.columns if col not in non_categorical]

In [9]:
# Group by cluster and aggregate each categorical column with mode_answer_cat
# to get a meaningful mode answer per cluster.
# NOTE(review): mode_answer_cat is a project helper; per the note above it presumably
# falls back to the secondary mode when the mode is "Not applicable" — confirm.
df_cat = df.groupby('cluster')[cat_cols].agg([mode_answer_cat])
# agg() was called with a list, so overwrite the resulting column labels with
# the plain feature names.
df_cat.columns = cat_cols

In [10]:
# Per-cluster summary: numerical summaries and categorical modes side by side.
df_summary = pd.concat([dfn,df_cat],axis =1)

In [None]:
# Show the summary with clusters ordered by `employed`, transposed so that
# features are rows and clusters are columns.
# NOTE(review): DataFrame.sort is the legacy pandas API (newer releases use
# sort_values) — confirm the pinned pandas version before upgrading.
df_summary.sort('employed').T

### The summarised data frame is generated. Pickle the survey data frame with cluster ids and this summary data frame for future study

In [13]:
# Persist the survey frame (important features + cluster id) for future study.
# Pickle streams are binary data, so the file is opened in 'wb' rather than text mode.
with open('data/df_id.pkl', 'wb') as f:
    pickle.dump(df, f)

In [15]:
# Persist the per-cluster summary frame for future study.
# Pickle streams are binary data, so the file is opened in 'wb' rather than text mode.
with open('data/df_summary.pkl', 'wb') as f:
    pickle.dump(df_summary, f)

### Generate a heatmap

In [None]:
# Same column groups as the survey frame above, except that the categorical
# columns are taken from sv.fin_data rather than sv.data.
heat_df = pd.concat(
    [sv.fin_data[sv.num_cols], sv.fin_data[sv.sv_cols], sv.fin_data.employed],
    axis=1)

# NOTE(review): this takes the first 41 columns of fc_df although the name says
# 40 — presumably the 40 features plus `employed`; confirm.
f40 = fc_df.columns[:41]
heat_df40 = heat_df[f40]
l = list(heat_df40.columns) + ['cluster']

# Append the cluster id as the last column and rebuild the frame.
new = np.column_stack((heat_df40.values, fc_df.cluster))
heat_df = pd.DataFrame(new, columns=l)

In [None]:
# Per-cluster summary of the heatmap frame: numerical part first ...
heat_df_n = df_num(heat_df)
# ... then the categorical part.
# NOTE(review): this uses mode_answer whereas the earlier categorical summary used
# mode_answer_cat, and cat_cols was derived from df's columns rather than heat_df's —
# confirm both are intended.
heat_df_cat = heat_df.groupby('cluster')[cat_cols].agg([mode_answer])
# agg() was called with a list, so overwrite the column labels with the plain names.
heat_df_cat.columns = cat_cols
# Numerical and categorical summaries side by side.
heat_df_sum = pd.concat([heat_df_n,heat_df_cat],axis =1)

In [None]:
from sklearn.preprocessing import scale

# Cast the per-cluster categorical summaries to float, column by column.
heat_df_sum[cat_cols] = heat_df_sum[cat_cols].apply(
    lambda column: column.astype('float'))
# Rescale the numerical summaries with sklearn's scale().
heat_df_sum[sv.num_cols] = scale(heat_df_sum[sv.num_cols])
# Order the clusters (rows) by their `employed` value, in place.
heat_df_sum.sort('employed', inplace=True)

In [None]:
# Annotated heatmap of the full per-cluster summary, transposed so that
# features are rows and clusters are columns.
plt.figure(figsize=(10,10))
sns.heatmap(heat_df_sum.T,annot=True)

To distinguish clusters, we need to drop the features that have the same values across clusters. 

In [None]:
# Drop the non-discriminating features (see drop_feature), then transpose so
# that features are rows and clusters are columns.
redu_heat_df = drop_feature(heat_df_sum).T
# Fix the row order explicitly: numerical features first, `employed` last.
redu_heat_df = redu_heat_df.reindex([
    'sphrs1', 'chldidel', 'age', 'agekdbrn', 'coninc', 'educom',
    'affrmact', 'fechld', 'mobile16', 'natrace', 'natspac', 'reliten',
    'natfare', 'divlaw', 'partyid', 'finrela', 'helpoth', 'degree',
    'socfrend', 'fund', 'workhard', 'marital', 'kidssol', 'class_',
    'parsol', 'employed',
])

In [None]:
# Annotated heatmap of the reduced summary (features as rows, clusters as columns).
plt.figure(figsize=(10,10))
sns.heatmap(redu_heat_df,annot=True)

Now it is easier to check. 
## To do: check cosine similarities. 

## 3. Unemployed Clusters

In [None]:
# First three cluster columns of the reduced heatmap frame; the clusters were
# ordered by `employed`, so these are shown as the unemployed clusters.
unemp_heat = redu_heat_df.iloc[:, :3]

In [None]:
# Annotated heatmap restricted to the unemployed clusters.
plt.figure(figsize=(8,8))
sns.heatmap(unemp_heat,annot=True)

### Cluster 1

In [None]:
# Feature-contribution rows belonging to cluster 1.
c1 = fc_df[fc_df['cluster'] == 1]

These features are the top 10 features driving the decision.

In [None]:
# Top 10 features driving the decision for cluster 1.
sort_feature_means(c1,10)

#### Details in their answers

In [None]:
# Survey answers of the respondents in cluster 1.
df_c1 = df[df['cluster'] == 1]

In [None]:
# Summary row for cluster 1, restricted to the features kept in the reduced heatmap.
# NOTE(review): .ix is the legacy pandas indexer — confirm the pinned pandas
# version before replacing it with .loc/.iloc.
df_summary[redu_heat_df.index].ix[1]

### population dwelling

In [None]:
# population dwelling
simple_pie(df['dwelling'], (4, 4))

### cluster 1 dwelling

In [None]:
# cluster 1 dwelling
simple_pie(df_c1['dwelling'], (4, 4))

### cluster 1 spend evening with friends

In [None]:
# cluster 1: evenings spent with friends.
# sv.bs values are replaced with NaN first — presumably the non-answer codes; confirm.
simple_pie(df_c1['socfrend'].replace(sv.bs, np.nan), (4, 4))

### population spend evenings with friends

In [None]:
# population: evenings spent with friends.
# sv.bs values are replaced with NaN first — presumably the non-answer codes; confirm.
simple_pie(df['socfrend'].replace(sv.bs, np.nan), (4, 4))

### population political views

In [None]:
# population political views
simple_pie(df['polviews'], (3, 3))

### cluster1 political views

In [None]:
# cluster 1 political views
simple_pie(df_c1['polviews'], (3, 3))

In [None]:
# population class_
simple_pie(df['class_'], (3, 3))

In [None]:
# cluster 1 class_
simple_pie(df_c1['class_'], (3, 3))

In [None]:
# cluster 1 degree
simple_pie(df_c1['degree'], (3, 3))

In [None]:
# population family income at age 16 (sv.bs values replaced with NaN first)
simple_pie(df['incom16'].replace(sv.bs, np.nan), (3, 3))

In [None]:
# cluster 1 family income at age 16 (sv.bs values replaced with NaN first)
simple_pie(df_c1['incom16'].replace(sv.bs, np.nan), (4, 4))

## Women in cluster 1 have the following characteristics:
  * They don't have a high degree; all of them have less than a high school degree. **Main decision driver**
  * Their median age at the birth of their first child is 19, much lower than that of the entire population (23).
  * Their median family income is below the population median and they consider their financial situation to be below average. More of them consider themselves lower class than in the population. 
  * They grew up in poor families. More of them responded that their family income when they were 16 was below or far below average.
  * When asked about their political views, the proportion of "don't know" is higher than in the general public.
  * Most of them answer that they never spend an evening with friends.
  
  ### Summary: They have children early, are less educated, come from less educated families and live in poor conditions.


### Cluster 2

In [None]:
# Feature-contribution rows belonging to cluster 2.
c2 = fc_df[fc_df['cluster'] == 2]

Top 10 driving features

In [None]:
# Top 10 features driving the decision for cluster 2.
sort_feature_means(c2,10)

In [None]:
# Survey answers of the respondents in cluster 2.
df_c2 = df[df['cluster'] == 2]

In [None]:
# Summary row for cluster 2, restricted to the features kept in the reduced heatmap.
# NOTE(review): .ix is the legacy pandas indexer — confirm the pinned pandas version.
df_summary[redu_heat_df.index].ix[2]

In [None]:
# Summarise the categorical answers of cluster 2, the cluster analysed in this
# section (the call previously received the cluster 1 frame, a copy-paste slip).
cluster_summary(df_c2, cat_cols)

In [None]:
# population party id
simple_pie(df['partyid'], (3, 3))

In [None]:
# cluster 2 party id
simple_pie(df_c2['partyid'], (3, 3))

In [None]:
# population divorce
simple_pie(df['divorce'], (3, 3))

In [None]:
# cluster 2 divorce
simple_pie(df_c2['divorce'], (3, 3))

In [None]:
# population family income at age 16 (sv.bs values replaced with NaN first)
simple_pie(df['incom16'].replace(sv.bs, np.nan), (3, 3))

In [None]:
# cluster 2 family income at age 16 (sv.bs values replaced with NaN first)
simple_pie(df_c2['incom16'].replace(sv.bs, np.nan), (3, 3))

In [None]:
# population reliten
simple_pie(df['reliten'], (3, 3))

In [None]:
# cluster 2 reliten
simple_pie(df_c2['reliten'], (3, 3))

### Women in cluster 2 have the following characteristics:
  * Their husbands work longer hours than the population. Their median family incomes are higher than the population's.
  * They have high school or college degrees, and their family members' years of education are also higher than the public's.
  * Their most common partyid is "not strong republican"; they have more conservative political views and a strong affiliation to religion.
  * The majority of them have never divorced.
  * More of them grew up in a family whose income was above average.
  
  ## Summary: Women in this cluster grew up in rich families and currently live in rich families. They are well educated and conservative. 

### Cluster 5

In [None]:
c4 = fc_df[fc_df.cluster ==4]

In [None]:
sort_feature_means(c4,10)

In [None]:
# Survey answers of the respondents in cluster 5.
df_c5 = df[df['cluster'] == 5]

In [None]:
# Summary row for cluster 5, restricted to the features kept in the reduced heatmap.
# NOTE(review): .ix is the legacy pandas indexer — confirm the pinned pandas version.
df_summary[redu_heat_df.index].ix[5]

In [None]:
# population class_
simple_pie(df['class_'], (3, 3))

In [None]:
# cluster 5 class_
simple_pie(df_c5['class_'], (3, 3))

In [None]:
# population degree
simple_pie(df['degree'], (3, 3))

In [None]:
# cluster 5 degree
simple_pie(df_c5['degree'], (3, 3))

In [None]:
# population dwelling
simple_pie(df['dwelling'], (3, 3))

In [None]:
# cluster 5 dwelling
simple_pie(df_c5['dwelling'], (3, 3))

### Women in cluster 5 have the following characteristics:
  * They are from poor families. Their median family income is below the population median.
  * They consider themselves working class or lower class. They think their family income is below average. 
  * They and their families don't have higher education; about a high school degree. 
  * More of them live in apartment houses.
  * They are not religious.
  
  ### Summary: sort of in between clusters 1 and 2

## 4. Employed Class

In [None]:
# Annotated heatmap of the cluster columns from position 3 onward, i.e. the ones
# not shown in the unemployed view above (shown here as the employed clusters).
plt.figure(figsize=(8,8))
sns.heatmap(redu_heat_df[redu_heat_df.columns[3:]],annot=True)

### Cluster 0

In [None]:
# Feature-contribution rows belonging to cluster 0.
c0 = fc_df[fc_df['cluster'] == 0]

In [None]:
# Top 10 features driving the decision for cluster 0.
sort_feature_means(c0,10)

In [None]:
# Survey answers of the respondents in cluster 0.
df_c0 = df[df['cluster'] == 0]

In [None]:
# Summary row for cluster 0, restricted to the features kept in the reduced heatmap.
# NOTE(review): .ix is the legacy pandas indexer — confirm the pinned pandas version.
df_summary[redu_heat_df.index].ix[0]

In [None]:
# population degree
simple_pie(df['degree'], (3, 3))

In [None]:
# cluster 0 degree
simple_pie(df_c0['degree'], (3, 3))

## Women in cluster 0 have the following characteristics:

  * Median family incomes are about the population median
  * They have high degrees (at least high school) **main driver**
  * Husbands also work full time, 42 hrs/wk
  * They don't have strong affiliation to religions.
  * Their median age of having first children is later than the population. 
  * They consider themselves working or middle class
  
  ## summary: having higher education degree, having kids late

### Cluster 3

In [None]:
# Feature-contribution rows belonging to cluster 3.
c3 = fc_df[fc_df['cluster'] == 3]

In [None]:
# Top 10 features driving the decision for cluster 3.
sort_feature_means(c3,10)

In [None]:
# Survey answers of the respondents in cluster 3.
df_c3 = df[df['cluster'] == 3]

In [None]:
# Summary row for cluster 3, restricted to the features kept in the reduced heatmap.
# NOTE(review): .ix is the legacy pandas indexer — confirm the pinned pandas version.
df_summary[redu_heat_df.index].ix[3]

In [None]:
# population degree
simple_pie(df['degree'], (3, 3))

In [None]:
# cluster 3 degree
simple_pie(df_c3['degree'], (3, 3))

In [None]:
# population class_
simple_pie(df['class_'], (3, 3))

In [None]:
# cluster 3 class_
simple_pie(df_c3['class_'], (3, 3))

In [None]:
# population marital
simple_pie(df['marital'], (3, 3))

In [None]:
# cluster 3 marital
simple_pie(df_c3['marital'], (3, 3))

In [None]:
# population dwelling
simple_pie(df['dwelling'], (3, 3))

In [None]:
# cluster 3 dwelling
simple_pie(df_c3['dwelling'], (3, 3))

### Women in cluster 3 have the following characteristics:
  * Family income is way below the population's. They consider their financial situation below average.
  * They have at least a high school degree, and most of them have a high school degree.
  * More of them consider themselves working class.
  * Most of them are never married or divorced. **They are single moms — the main difference.**
  * Moderate to conservative political views and strong affiliation to religion.
  
 ## Summary: well educated, single moms, working class
  

### Cluster 4

In [None]:
# Feature-contribution rows belonging to cluster 4.
c4 = fc_df[fc_df['cluster'] == 4]

In [None]:
# Top 10 features driving the decision for cluster 4.
sort_feature_means(c4,10)

In [None]:
# Survey answers of the respondents in cluster 4.
df_c4 = df[df['cluster'] == 4]

In [None]:
# Summary row for cluster 4, restricted to the features kept in the reduced heatmap.
# NOTE(review): .ix is the legacy pandas indexer — confirm the pinned pandas version.
df_summary[redu_heat_df.index].ix[4]

In [None]:
# population political views (compared with cluster 4 in the next cell)
simple_pie(df.polviews,(4,4))

In [None]:
# cluster 4 political views
simple_pie(df_c4['polviews'], (4, 4))

In [None]:
# Annotated heatmap of the columns labelled 1 and 4 only, i.e. clusters 1 and 4
# side by side (the write-up below compares these two clusters).
plt.figure(figsize=(8,8))
sns.heatmap(redu_heat_df[[1,4]],annot=True)

In [None]:
# cluster 4 finrela
simple_pie(df_c4['finrela'], (4, 4))

In [None]:
# population finrela
simple_pie(df['finrela'], (4, 4))

## Women in cluster 4 have the following characteristics:
  * Have first kid early
  * Family has less education than the general population
  * Don't have strong affiliation to religion
  * Moderate or no interest in political views.
  * Less than high school degree
  * Low family income, and they consider their family income below average
  ## Summary: Women in this cluster share a lot in common with women in cluster 1, but they work. And maybe because they work, they sometimes spend evenings with friends.
  ## They strongly oppose preferential hiring of blacks to address discrimination. And they think the government spends too much on the space exploration program. So they are more practical. 


## To do:

1. Write a function to compare the cosine similarities among clusters. Compare similar clusters, one employed and one unemployed, and see what makes them different. 
2. Use data visualization tools to visualize the main decision drivers for each cluster.
3. Restructure some of the code.