In [42]:
# Decision Tree Regression

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import Imputer

In [43]:
# Importing the dataset
# dataset_1 is read from 'cleaned_hm.csv' and dataset_2 from 'demographic.csv'.
dataset_1, dataset_2 = (
    pd.read_csv(csv_path) for csv_path in ('cleaned_hm.csv', 'demographic.csv')
)

In [44]:
# Preview the first rows of the table read from 'demographic.csv'.
dataset_2.head()

Unnamed: 0,wid,age,country,gender,marital,parenthood
0,1,37.0,USA,m,married,y
1,2,29.0,IND,m,married,y
2,3,25.0,IND,m,single,n
3,4,32.0,USA,m,married,y
4,5,29.0,USA,m,married,y


In [45]:
# Number of distinct countries.
# nunique() skips missing values, whereas len(unique()) would count NaN as
# one extra "country" (the country column is not fully populated).
dataset_2.country.nunique()

100

In [46]:
# Distinct values of the marital column; unique() also reports NaN when present.
dataset_2.marital.unique()

array(['married', 'single', 'divorced', 'separated', 'widowed', nan], dtype=object)

In [47]:
# Count the distinct marital values; unique() keeps NaN, so the
# missing-value marker is included in this count.
len(dataset_2.marital.unique())
# The NaN entries can be imputed with the column's mode.

6

In [48]:
# Distinct values of the parenthood column (NaN is listed when present).
dataset_2.parenthood.unique()
# The NaN entries can be imputed with the column's mode.

array(['y', 'n', nan], dtype=object)

In [49]:
# Distinct values of the gender column (NaN is listed when present).
dataset_2.gender.unique()
# The NaN entries can be imputed with the column's mode.

array(['m', 'f', 'o', nan], dtype=object)

In [50]:
# Non-null entries per column; a column with fewer entries than `wid`
# contains missing values.
dataset_2.count()

wid           10844
age           10809
country       10771
gender        10812
marital       10787
parenthood    10813
dtype: int64

In [51]:
# The demographics table contains NaN values, so we need to impute them.

In [52]:
# Inner-join the two tables on `wid`, the one column they have in common.
hm_new = dataset_1.merge(dataset_2, how='inner', on='wid')

In [53]:
# Preview the first rows of the merged table.
hm_new.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category,age,country,gender,marital,parenthood
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection,35,USA,m,single,n
1,27873,2053,24h,I played a new game that was fun and got to en...,I played a new game that was fun and got to en...,True,1,,leisure,35,USA,m,single,n
2,28073,2053,24h,I listened to some music and heard an entire a...,I listened to some music and heard an entire a...,True,1,,leisure,35,USA,m,single,n
3,33522,2053,24h,Went to see a movie with my friend,Went to see a movie with my friend,True,1,,bonding,35,USA,m,single,n
4,34522,2053,24h,"Played guitar, learning a song on it","Played guitar, learning a song on it",True,1,,leisure,35,USA,m,single,n


In [54]:
# Discard the text, bookkeeping and identifier columns so that only
# predicted_category and the demographic columns remain.
hm_new = hm_new.drop(
    [
        'original_hm', 'cleaned_hm', 'modified', 'num_sentence',
        'ground_truth_category', 'reflection_period', 'hmid', 'wid',
    ],
    axis=1,
)

In [55]:
# Check which columns remain after the drop.
hm_new.head()

Unnamed: 0,predicted_category,age,country,gender,marital,parenthood
0,affection,35,USA,m,single,n
1,leisure,35,USA,m,single,n
2,leisure,35,USA,m,single,n
3,bonding,35,USA,m,single,n
4,leisure,35,USA,m,single,n


In [56]:
# Non-null entries per remaining column, to see how many values are missing.
hm_new.count()

predicted_category    100535
age                   100442
country               100332
gender                100456
marital               100378
parenthood            100457
dtype: int64

In [57]:
# Convert the frame to a NumPy array. `DataFrame.as_matrix()` was removed in
# pandas 1.0; `.values` returns the same array and exists in every version.
hm_dataset = hm_new.values
# Move the first column (predicted_category) to the end so the features come
# first and the target is the last column.
hm_dataset = np.append(arr = hm_dataset[:, 1:], values = hm_dataset[:, [0]], axis = 1)

In [58]:
# Display the array: feature columns first, predicted_category last.
hm_dataset

array([['35', 'USA', 'm', 'single', 'n', 'affection'],
       ['35', 'USA', 'm', 'single', 'n', 'leisure'],
       ['35', 'USA', 'm', 'single', 'n', 'leisure'],
       ..., 
       ['65', 'IND', 'm', 'married', 'y', 'achievement'],
       ['65', 'IND', 'm', 'married', 'y', 'bonding'],
       ['65', 'IND', 'm', 'married', 'y', 'enjoy_the_moment']], dtype=object)

In [61]:
# Encoding categorical data
from sklearn.preprocessing import LabelEncoder

# Column 2 (gender) mixes strings with NaN floats. LabelEncoder has to order
# the labels it sees, and comparing str with float raises TypeError, so the
# missing entries are filled with the most frequent value before encoding.
gender = pd.Series(hm_dataset[:, 2])
gender = gender.fillna(gender.mode().iloc[0])

labelencoder = LabelEncoder()
hm_dataset[:, 2] = labelencoder.fit_transform(gender)

In [None]:
# Features are the five leading columns; the target is the sixth column.
X, y = hm_dataset[:, :5], hm_dataset[:, 5]

In [None]:
# Feature matrix: the first five columns of hm_dataset.
X

In [None]:
# Target vector: the sixth column of hm_dataset.
y

In [None]:
# Splitting the dataset into the Training set and Test set
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same
# function is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [None]:
def plot_histograms(df, variables, n_rows, n_cols):
    """Draw a 10-bin histogram for every column named in `variables`.

    The histograms are laid out on an `n_rows` x `n_cols` grid inside a
    single 16x12 figure. Each subplot is titled with the column's skewness
    rounded to an integer, and its tick labels are hidden.

    Parameters:
        df: DataFrame holding the columns to plot.
        variables: iterable of column names in `df`.
        n_rows, n_cols: shape of the subplot grid.
    """
    figure = plt.figure(figsize=(16, 12))
    for position, column in enumerate(variables, start=1):
        axes = figure.add_subplot(n_rows, n_cols, position)
        series = df[column]
        series.hist(bins=10, ax=axes)
        # round() without a digits argument yields an integer.
        axes.set_title('Skew: {}'.format(round(float(series.skew()))))
        axes.set_xticklabels([], visible=False)
        axes.set_yticklabels([], visible=False)
    figure.tight_layout()  # Improves appearance a bit.
    plt.show()

def plot_distribution(df, var, target, **kwargs):
    """Plot shaded kernel-density curves of `var`, coloured by `target`.

    Assumes seaborn is available in the notebook namespace as `sns`
    (NOTE(review): no such import is visible in this notebook - confirm).

    Parameters:
        df: DataFrame containing `var` and `target`.
        var: name of the column whose density is drawn.
        target: name of the column used as the hue.
        **kwargs: optional `row` and `col` column names forwarded to
            `sns.FacetGrid` to split the plot into facets.
    """
    grid = sns.FacetGrid(
        df,
        hue=target,
        aspect=4,
        row=kwargs.get('row'),
        col=kwargs.get('col'),
    )
    grid.map(sns.kdeplot, var, shade=True)
    # The x-axis runs from 0 up to the largest observed value of `var`.
    grid.set(xlim=(0, df[var].max()))
    grid.add_legend()

def plot_categories(df, cat, target, **kwargs):
    """Draw a seaborn bar plot of `target` against the category column `cat`.

    Assumes seaborn is available in the notebook namespace as `sns`
    (NOTE(review): no such import is visible in this notebook - confirm).

    Parameters:
        df: DataFrame containing `cat` and `target`.
        cat: name of the column passed as the first barplot variable.
        target: name of the column passed as the second barplot variable.
        **kwargs: optional `row` and `col` column names forwarded to
            `sns.FacetGrid` to split the plot into facets.
    """
    grid = sns.FacetGrid(df, row=kwargs.get('row'), col=kwargs.get('col'))
    grid.map(sns.barplot, cat, target)
    grid.add_legend()

def plot_correlation_map(df):
    """Draw an annotated heatmap of the pairwise correlations in `df`.

    The correlation matrix is computed from the `df` argument itself; the
    previous version read an unrelated global named `titanic`, which raised
    NameError (or silently plotted the wrong data if such a global existed).

    Assumes seaborn is available in the notebook namespace as `sns`
    (NOTE(review): no such import is visible in this notebook - confirm).

    Parameters:
        df: DataFrame whose `corr()` matrix is plotted.
    """
    corr = df.corr()
    _, ax = plt.subplots(figsize=(12, 10))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    sns.heatmap(
        corr,
        cmap=cmap,
        square=True,
        cbar_kws={'shrink': .9},
        ax=ax,
        annot=True,
        annot_kws={'fontsize': 12},
    )

def describe_more(df):
    """Summarise every column of `df` by distinct-value count and dtype.

    Parameters:
        df: DataFrame to describe.

    Returns:
        A DataFrame with one row per column of `df` and the columns
        'Variable' (column name), 'Levels' (number of distinct non-null
        values) and 'Datatype' (the column's dtype), sorted by 'Levels'
        in ascending order.
    """
    columns = list(df)
    levels = pd.DataFrame({
        'Variable': columns,
        # Series.nunique() matches len(value_counts()) (both ignore NaN) and
        # avoids the deprecated top-level pd.value_counts helper.
        'Levels': [df[name].nunique() for name in columns],
        'Datatype': [df[name].dtypes for name in columns],
    })
    return levels.sort_values(by='Levels')

def plot_variable_importance(X, y):
    """Fit a decision tree on (X, y) and plot its feature importances.

    Parameters:
        X: feature table; it is passed on to `plot_model_var_imp`, which
            reads `X.columns`.
        y: target values aligned with the rows of `X`.
    """
    # Imported locally because the notebook never imports the classifier at
    # top level, which made this function raise NameError when called.
    from sklearn.tree import DecisionTreeClassifier

    tree = DecisionTreeClassifier(random_state=99)
    tree.fit(X, y)
    plot_model_var_imp(tree, X, y)
    
def plot_model_var_imp(model, X, y):
    """Bar-plot a fitted model's feature importances and print its score.

    The importances are sorted in ascending order and the first ten rows of
    that ordering are drawn as a horizontal bar chart, so with more than ten
    features the chart shows the ten lowest importances.

    Parameters:
        model: fitted estimator exposing `feature_importances_` and `score`.
        X: feature table whose `columns` label the importances.
        y: target values passed to `model.score` together with `X`.
    """
    importances = pd.DataFrame(
        model.feature_importances_,
        columns=['Importance'],
        index=X.columns,
    ).sort_values(['Importance'], ascending=True)
    importances[:10].plot(kind='barh')
    print(model.score(X, y))