In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from matplotlib import pyplot as plt

## Step 1. Data Collection

In [4]:
# Reading dataset into dta
dta = pd.read_csv('original_training_set/train.csv')



## Step 2. Splitting data into training set and testing set

In [5]:
dta.isnull().values.sum()

0

## Data of both training set and testing set

In [6]:
dta.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,10000000,1,0,0,08:35,IN,10001,2,0,2,...,1,2,1,0,2,2,1,2,2,633
1,10000000,2,0,0,08:38,IN,10001,2,0,2,...,1,2,1,0,2,2,1,2,1,630
2,10000000,3,0,0,08:38,IN,10001,2,0,2,...,1,2,1,0,2,2,1,2,1,630
3,10000000,4,0,0,08:39,IN,10001,2,0,2,...,1,2,1,0,2,2,1,2,1,630
4,10000000,5,0,0,11:55,IN,10001,2,0,2,...,1,2,1,0,2,2,1,2,1,630


In [7]:
test_df1=pd.read_csv('original_testing_set/test_v2.csv')
test_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,10000001,1,0,1,12:35,OK,10002.0,1,0,9,...,3.0,9.0,0,0,1,1,0,0,4,543
1,10000001,2,0,1,12:36,OK,10002.0,1,0,9,...,3.0,9.0,2,1,1,3,1,3,2,611
2,10000002,1,0,4,12:19,PA,10003.0,1,1,7,...,2.0,15.0,2,0,2,3,1,2,2,691
3,10000002,2,0,4,12:21,PA,10003.0,1,1,7,...,2.0,15.0,2,0,2,3,1,2,2,695
4,10000003,1,0,3,17:12,AR,10004.0,1,0,4,...,3.0,1.0,1,0,1,1,0,2,2,628


## Pre-processing data

### Removing the Unnamed column from the new training set

In [7]:
train_df1.shape



(532199, 26)

In [8]:
train_df1.drop(train_df1.columns[0],axis=1, inplace=True)


In [9]:
train_df1.shape

(532199, 25)

In [10]:
train_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,10020245,2,0,4,12:47,PA,11431,1,0,10,...,3,7,2,0,2,3,0,1,4,598
1,10035049,6,0,4,16:23,PA,13742,1,1,14,...,1,4,0,1,3,3,0,0,3,571
2,10026464,3,0,4,16:05,OH,15468,1,0,9,...,3,15,1,0,3,2,0,1,3,573
3,10094135,8,1,3,10:36,CT,10772,1,0,9,...,1,1,0,0,1,3,0,0,1,616
4,10128898,5,0,0,08:41,GA,14906,2,0,12,...,1,1,0,1,2,2,0,0,2,589


In [11]:
test_df1.shape

(133050, 26)

In [12]:
test_df1.drop(test_df1.columns[0],axis=1, inplace=True)

In [13]:
test_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,10044343,6,0,2,09:42,OH,10627,1,1,5,...,3,7,1,0,3,3,1,2,3,616
1,10030877,1,0,2,15:44,NY,12572,1,1,17,...,4,11,2,1,4,3,1,0,4,683
2,10135888,4,0,3,11:59,NM,14897,2,0,1,...,1,15,1,1,1,1,0,2,4,663
3,10055334,10,1,2,21:08,NV,12680,2,1,11,...,2,10,0,0,2,2,0,0,2,567
4,10034715,2,0,0,10:02,OK,14647,1,0,20,...,1,2,0,0,1,1,0,0,1,618


### Creating response vector and feature set

In [14]:
test_df1.isnull().values.sum()

0

In [15]:
train_df1.columns

Index([u'customer_ID', u'shopping_pt', u'record_type', u'day', u'time',
       u'state', u'location', u'group_size', u'homeowner', u'car_age',
       u'car_value', u'risk_factor', u'age_oldest', u'age_youngest',
       u'married_couple', u'C_previous', u'duration_previous', u'A', u'B',
       u'C', u'D', u'E', u'F', u'G', u'cost'],
      dtype='object')

In [16]:
# creating intermediate dataframe which includes feature set of customer_ID and shopping_pt
train_features_df1=pd.DataFrame(np.array(train_df1[['customer_ID','shopping_pt']]) , columns=['customer_ID','shopping_pt'] )
train_features_df1.head()


Unnamed: 0,customer_ID,shopping_pt
0,10020245,2
1,10035049,6
2,10026464,3
3,10094135,8
4,10128898,5


In [17]:
# creating intermediate dataframe which includes feature set of other required columns

train_features_df2=train_df1.ix[:,7:]
# casting object type to categorical type to convert it later into numeric type
train_features_df2['car_value']=train_features_df2['car_value'].astype('category')


In [18]:
# converting categorical values to numeric values which will be used for prediction
train_features_df2['car_value']=train_features_df2['car_value'].cat.codes

In [19]:
train_features_df2['car_value'].head()

0    4
1    4
2    5
3    4
4    5
Name: car_value, dtype: int8

In [20]:
# creating response vector to train our model
response_vector=pd.DataFrame(np.array(train_df1['record_type']),columns=['record_type'])
#response_vector.loc[response_vector['record_type']==1]
response_vector.head(30)

Unnamed: 0,record_type
0,0
1,0
2,0
3,1
4,0
5,0
6,0
7,0
8,1
9,0


In [21]:
# creating final feature set dataframe
train_features_set=pd.concat([train_features_df1,train_features_df2],axis =1)
train_features_set.shape

(532199, 20)

In [22]:
train_features_set.head(10)

Unnamed: 0,customer_ID,shopping_pt,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,10020245,2,1,0,10,4,4,26,26,0,3,7,2,0,2,3,0,1,4,598
1,10035049,6,1,1,14,4,2,34,34,0,1,4,0,1,3,3,0,0,3,571
2,10026464,3,1,0,9,5,1,69,69,0,3,15,1,0,3,2,0,1,3,573
3,10094135,8,1,0,9,4,3,24,24,0,1,1,0,0,1,3,0,0,1,616
4,10128898,5,2,0,12,5,3,55,53,1,1,1,0,1,2,2,0,0,2,589
5,10103325,5,1,1,11,6,1,59,59,0,3,15,1,1,3,3,1,1,3,642
6,10111225,6,1,1,15,4,4,57,53,1,1,2,0,0,1,2,0,0,2,563
7,10080575,2,1,1,7,4,1,75,54,0,2,1,1,0,3,3,0,2,3,592
8,10057503,7,1,1,6,4,4,46,46,0,3,2,1,0,3,2,0,2,2,633
9,10151526,3,2,1,12,6,1,53,50,1,2,4,1,1,2,2,1,1,2,679


In [23]:
#Creating intermediate test features
test_features_df1=pd.DataFrame(np.array(test_df1[['customer_ID','shopping_pt']]) , columns=['customer_ID','shopping_pt'] )
test_features_df1.head()

Unnamed: 0,customer_ID,shopping_pt
0,10044343,6
1,10030877,1
2,10135888,4
3,10055334,10
4,10034715,2


In [24]:
test_df1.columns

Index([u'customer_ID', u'shopping_pt', u'record_type', u'day', u'time',
       u'state', u'location', u'group_size', u'homeowner', u'car_age',
       u'car_value', u'risk_factor', u'age_oldest', u'age_youngest',
       u'married_couple', u'C_previous', u'duration_previous', u'A', u'B',
       u'C', u'D', u'E', u'F', u'G', u'cost'],
      dtype='object')

In [25]:
# creating intermediate dataframe which includes feature set of other required columns

test_features_df2=test_df1.ix[:,7:]
# casting object type to categorical type to convert it later into numeric type
test_features_df2['car_value']=test_features_df2['car_value'].astype('category')

In [26]:
#converting car value from categorical to numeric
test_features_df2['car_value']=test_features_df2['car_value'].cat.codes

In [27]:
test_features_set=pd.concat([test_features_df1,test_features_df2],axis=1)
test_features_set.shape

(133050, 20)

In [28]:
test_features_set.head(15)

Unnamed: 0,customer_ID,shopping_pt,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,10044343,6,1,1,5,4,1,45,45,0,3,7,1,0,3,3,1,2,3,616
1,10030877,1,1,1,17,5,2,75,74,1,4,11,2,1,4,3,1,0,4,683
2,10135888,4,2,0,1,6,4,31,28,1,1,15,1,1,1,1,0,2,4,663
3,10055334,10,2,1,11,4,1,52,45,1,2,10,0,0,2,2,0,0,2,567
4,10034715,2,1,0,20,4,1,33,33,0,1,2,0,0,1,1,0,0,1,618
5,10079365,6,1,1,2,5,3,73,64,1,3,1,2,0,3,2,0,3,4,600
6,10004362,3,1,1,11,4,1,68,68,0,3,1,0,0,1,2,0,0,2,662
7,10037612,2,1,0,7,6,1,41,41,0,4,2,0,0,4,3,0,2,4,627
8,10022393,4,1,0,5,4,1,21,21,0,1,1,2,0,1,1,0,2,2,769
9,10045730,2,1,1,12,4,1,57,56,1,3,3,0,0,1,3,0,0,1,576


## Modelling the data to predict the response using Random Forest

### Basic Steps to create a model is,
1. Data collection.
2. Data preprocessing:
    1) Data Cleaning.
    2) Data transformation (if required).
    3) Divide data into training and testing sets.
3. Build a model on training data.
4. Evaluate the model on the test data

### Step 1 and  Step 2 are done as shown above

### Step 3
### a) Initialize the classifier 

In [29]:
random_forest_classfifer=RandomForestClassifier()

### Step 3
### b) Fit the model

In [30]:
random_forest_classfifer.fit(train_features_set,response_vector.values.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [31]:
classfier_score=random_forest_classfifer.score(train_features_set,response_vector.values.ravel())
classfier_score

0.98576096535318558

In [32]:
#test_features_set.sort_values('customer_ID')

In [33]:
predict_purchase=random_forest_classfifer.predict(test_features_set)

In [34]:
customer_information = pd.DataFrame(columns=['customer_ID', 'record_type'])
policy_options=test_df1.ix[:,17:24]
customer_information['customer_ID'] = test_df1.customer_ID
customer_information['record_type'] = predict_purchase.astype(int)
predicted_output=pd.concat([customer_information,policy_options],axis=1)
predicted_output.to_csv('randomForest.csv',index=False)


In [35]:
test_customer_information=pd.DataFrame(columns=['customer_ID', 'record_type'])
test_customer_information['customer_ID'] = test_df1.customer_ID
test_customer_information['record_type']=test_df1.record_type
expected_output=pd.concat([test_customer_information,test_df1.ix[:,17:24]], axis=1)
expected_output.to_csv('expectedOutput.csv', index=False)

In [36]:
rand=pd.read_csv('randomForest.csv')
rand.head()

Unnamed: 0,customer_ID,record_type,A,B,C,D,E,F,G
0,10044343,0,1,0,3,3,1,2,3
1,10030877,0,2,1,4,3,1,0,4
2,10135888,0,1,1,1,1,0,2,4
3,10055334,1,0,0,2,2,0,0,2
4,10034715,0,0,0,1,1,0,0,1


In [44]:
exp=pd.read_csv('expectedOutput.csv')
exp.loc[exp['record_type']==1].shape

(19337, 9)

In [42]:
rand.loc[exp['record_type']==0].shape

(113713, 9)

In [39]:
expected=test_df1.record_type


In [40]:
print classification_report(expected, rand.record_type)


             precision    recall  f1-score   support

          0       0.87      0.97      0.92    113713
          1       0.51      0.17      0.25     19337

avg / total       0.82      0.86      0.82    133050

