In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
from sklearn import tree

## Step 1. Data Collection

In [2]:
# Load the source dataset from a path relative to the notebook's working
# directory; the train/test split below is drawn from `dta`.
dta = pd.read_csv('original_training_set/train_balanced.csv')

## Step 2. Splitting data into training set and testing set

In [3]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore every number reported further down) reproducible across
# kernel restarts.
train, test = train_test_split(dta, test_size=0.2, random_state=42)
# to_csv also writes the row index as an unnamed first column; the
# pre-processing cells below remove that column after re-loading the files.
train.to_csv('new_train_data.csv')
test.to_csv('new_test_data.csv')

In [4]:
# Total number of missing values over all cells of dta.
dta.isnull().values.sum()

0

In [5]:
# (rows, columns) of the full dataset.
dta.shape

(665349, 28)

## Data of both training set and testing set

In [6]:
# Re-load the training split from disk. The file was written with its row
# index, so the frame comes back with one extra leading column.
train_df1=pd.read_csv('new_train_data.csv')
train_df1.head()

Unnamed: 0.1,Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,402021,10092376,3,0,0,11:45,NY,13267,1,1,...,0,1,1,1,0,2,750,0,3,1
1,251616,10058111,1,0,3,12:14,NV,14901,3,0,...,0,1,3,0,1,4,650,0,3,2
2,319912,10073493,6,0,3,15:48,FL,14504,1,0,...,0,1,3,1,2,3,625,0,4,2
3,203,10000054,3,1,2,09:05,FL,10055,2,1,...,1,1,2,1,2,3,614,0,3,1
4,268457,10061871,5,0,3,09:04,OH,12511,1,0,...,0,1,3,0,2,3,591,0,3,1


In [7]:
# Re-load the held-out split from disk. The file was written with its row
# index, so the frame comes back with one extra leading column.
test_df1=pd.read_csv('new_test_data.csv')
test_df1.head()

Unnamed: 0.1,Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,407396,10093608,10,0,1,16:06,AL,13616,2,0,...,1,2,3,1,1,1,621,0,3,3
1,304039,10069974,2,0,0,09:49,GA,11718,2,1,...,0,4,2,0,2,2,614,0,3,1
2,181297,10041975,1,0,1,05:59,PA,15224,1,1,...,0,1,1,0,0,2,554,0,3,1
3,633313,10145530,2,0,4,12:15,CO,12508,1,1,...,1,3,3,1,2,4,606,0,3,2
4,564868,10129793,2,0,3,15:27,TN,16510,1,0,...,1,2,2,1,1,2,594,0,3,2


## Pre-processing data

### Removing the Unnamed column from the new training set

In [8]:
# Shape of the re-loaded training split, including the saved-index column.
train_df1.shape

(532279, 29)

In [9]:
# Remove the saved row-index column, which read_csv labels 'Unnamed: ...'.
# Selecting it by name (instead of dropping position 0 in place) keeps this
# cell idempotent: running it a second time no longer strips customer_ID too.
train_df1 = train_df1.drop(
    [col for col in train_df1.columns if str(col).startswith('Unnamed')],
    axis=1,
)

In [10]:
# Shape after the drop: one column fewer than before.
train_df1.shape

(532279, 28)

In [11]:
# Preview the cleaned training frame.
train_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10092376,3,0,0,11:45,NY,13267,1,1,10,...,0,1,1,1,0,2,750,0,3,1
1,10058111,1,0,3,12:14,NV,14901,3,0,9,...,0,1,3,0,1,4,650,0,3,2
2,10073493,6,0,3,15:48,FL,14504,1,0,14,...,0,1,3,1,2,3,625,0,4,2
3,10000054,3,1,2,09:05,FL,10055,2,1,7,...,1,1,2,1,2,3,614,0,3,1
4,10061871,5,0,3,09:04,OH,12511,1,0,11,...,0,1,3,0,2,3,591,0,3,1


In [12]:
# Shape of the re-loaded test split, including the saved-index column.
test_df1.shape

(133070, 29)

In [13]:
# Remove the saved row-index column, which read_csv labels 'Unnamed: ...'.
# Selecting it by name (instead of dropping position 0 in place) keeps this
# cell idempotent: running it a second time no longer strips customer_ID too.
test_df1 = test_df1.drop(
    [col for col in test_df1.columns if str(col).startswith('Unnamed')],
    axis=1,
)

In [14]:
# Preview the cleaned test frame.
test_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10093608,10,0,1,16:06,AL,13616,2,0,5,...,1,2,3,1,1,1,621,0,3,3
1,10069974,2,0,0,09:49,GA,11718,2,1,14,...,0,4,2,0,2,2,614,0,3,1
2,10041975,1,0,1,05:59,PA,15224,1,1,8,...,0,1,1,0,0,2,554,0,3,1
3,10145530,2,0,4,12:15,CO,12508,1,1,10,...,1,3,3,1,2,4,606,0,3,2
4,10129793,2,0,3,15:27,TN,16510,1,0,11,...,1,2,2,1,1,2,594,0,3,2


### Creating response vector and feature set

In [15]:
# Total number of missing values over all cells of the test split.
test_df1.isnull().values.sum()

0

In [16]:
# Column names in order; the positional column slices used in later cells
# depend on this ordering.
train_df1.columns

Index(['customer_ID', 'shopping_pt', 'record_type', 'day', 'time', 'state',
       'location', 'group_size', 'homeowner', 'car_age', 'car_value',
       'risk_factor', 'age_oldest', 'age_youngest', 'married_couple',
       'C_previous', 'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'cost', 'weekend_indicator', 'accident_risk', 'time_factor'],
      dtype='object')

In [17]:
# First piece of the training feature set: the customer_ID and shopping_pt
# columns, rebuilt from their raw values as a stand-alone two-column frame.
train_features_df1 = pd.DataFrame(
    train_df1[['customer_ID', 'shopping_pt']].values,
    columns=['customer_ID', 'shopping_pt'],
)
train_features_df1.head()

Unnamed: 0,customer_ID,shopping_pt
0,10092376,3
1,10058111,1
2,10073493,6
3,10000054,3
4,10061871,5


In [18]:
# Second piece of the training feature set: every column from position 7
# ('group_size') onward.
# .iloc replaces the deprecated .ix indexer (removed in pandas 1.0), and
# .copy() yields an independent frame so the column assignment below neither
# raises SettingWithCopyWarning nor risks touching train_df1.
train_features_df2 = train_df1.iloc[:, 7:].copy()
# Cast car_value to the category dtype so it can be turned into integer codes
# in the following cell.
train_features_df2['car_value'] = train_features_df2['car_value'].astype('category')

In [19]:
# Replace each car_value category with its integer code so the column can be
# fed to the classifier as a numeric feature.
train_features_df2['car_value']=train_features_df2['car_value'].cat.codes

In [20]:
# Spot-check the encoded car_value column.
train_features_df2['car_value'].head()

0    6
1    4
2    4
3    5
4    4
Name: car_value, dtype: int8

In [21]:
# Preview the second piece of the training feature set.
train_features_df2.head()

Unnamed: 0,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,C_previous,duration_previous,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,1,1,10,6,1.0,50,50,0,1,5,...,0,1,1,1,0,2,750,0,3,1
1,3,0,9,4,3.011655,48,20,1,1,13,...,0,1,3,0,1,4,650,0,3,2
2,1,0,14,4,1.0,75,75,0,1,6,...,0,1,3,1,2,3,625,0,4,2
3,2,1,7,5,1.0,75,75,1,1,3,...,1,1,2,1,2,3,614,0,3,1
4,1,0,11,4,4.0,35,35,0,1,12,...,0,1,3,0,2,3,591,0,3,1


In [22]:
# Training target: the record_type column, rebuilt from its raw values as a
# single-column frame.
response_vector = pd.DataFrame(
    train_df1['record_type'].values,
    columns=['record_type'],
)
response_vector.head(30)

Unnamed: 0,record_type
0,0
1,0
2,0
3,1
4,0
5,0
6,0
7,0
8,1
9,0


In [23]:
# Final training feature set: the two pieces placed side by side.
# concat(axis=1) aligns rows on the index, so both pieces must carry the same
# row index for the rows to line up.
train_features_set=pd.concat([train_features_df1,train_features_df2],axis =1)
train_features_set.shape

(532279, 23)

In [24]:
# Preview the assembled training feature set.
train_features_set.head(10)

Unnamed: 0,customer_ID,shopping_pt,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10092376,3,1,1,10,6,1.0,50,50,0,...,0,1,1,1,0,2,750,0,3,1
1,10058111,1,3,0,9,4,3.011655,48,20,1,...,0,1,3,0,1,4,650,0,3,2
2,10073493,6,1,0,14,4,1.0,75,75,0,...,0,1,3,1,2,3,625,0,4,2
3,10000054,3,2,1,7,5,1.0,75,75,1,...,1,1,2,1,2,3,614,0,3,1
4,10061871,5,1,0,11,4,4.0,35,35,0,...,0,1,3,0,2,3,591,0,3,1
5,10106648,6,1,1,3,4,1.0,49,49,0,...,0,3,2,1,1,3,630,0,3,2
6,10025251,2,1,1,9,5,3.098285,71,71,0,...,0,2,3,0,2,1,586,0,4,2
7,10047326,7,1,0,9,5,3.011655,24,24,0,...,1,4,3,1,1,3,678,0,4,2
8,10054722,9,2,1,6,3,3.1159,23,18,1,...,1,1,1,0,3,2,695,0,3,2
9,10133737,3,1,1,17,5,4.0,20,20,0,...,0,3,3,0,0,2,654,0,4,1


In [25]:
# First piece of the test feature set: the customer_ID and shopping_pt
# columns, rebuilt from their raw values as a stand-alone two-column frame.
test_features_df1 = pd.DataFrame(
    test_df1[['customer_ID', 'shopping_pt']].values,
    columns=['customer_ID', 'shopping_pt'],
)
test_features_df1.head()

Unnamed: 0,customer_ID,shopping_pt
0,10093608,10
1,10069974,2
2,10041975,1
3,10145530,2
4,10129793,2


In [26]:
# Column names in order; the positional column slices used in later cells
# depend on this ordering.
test_df1.columns

Index(['customer_ID', 'shopping_pt', 'record_type', 'day', 'time', 'state',
       'location', 'group_size', 'homeowner', 'car_age', 'car_value',
       'risk_factor', 'age_oldest', 'age_youngest', 'married_couple',
       'C_previous', 'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'cost', 'weekend_indicator', 'accident_risk', 'time_factor'],
      dtype='object')

In [27]:
# Second piece of the test feature set: every column from position 7
# ('group_size') onward.
# .iloc replaces the deprecated .ix indexer (removed in pandas 1.0), and
# .copy() yields an independent frame so the column assignment below neither
# raises SettingWithCopyWarning nor risks touching test_df1.
test_features_df2 = test_df1.iloc[:, 7:].copy()
# Encode car_value against the category levels observed in the training data,
# so the integer codes produced in the next cell mean the same thing for train
# and test even if the test split is missing a level. Values never seen in
# training end up with code -1.
car_value_levels = train_df1['car_value'].astype('category').cat.categories
test_features_df2['car_value'] = pd.Categorical(
    test_features_df2['car_value'], categories=car_value_levels
)

In [28]:
# Replace each car_value category with its integer code so the column can be
# fed to the classifier as a numeric feature.
test_features_df2['car_value']=test_features_df2['car_value'].cat.codes

In [29]:
# Final test feature set, assembled the same way as the training one.
# concat(axis=1) aligns rows on the index, so both pieces must carry the same
# row index for the rows to line up.
test_features_set=pd.concat([test_features_df1,test_features_df2],axis=1)
test_features_set.shape

(133070, 23)

In [30]:
# Preview the assembled test feature set.
test_features_set.head(15)

Unnamed: 0,customer_ID,shopping_pt,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10093608,10,2,0,5,4,4.0,25,24,1,...,1,2,3,1,1,1,621,0,3,3
1,10069974,2,2,1,14,4,2.0,75,74,1,...,0,4,2,0,2,2,614,0,3,1
2,10041975,1,1,1,8,5,3.07013,65,65,0,...,0,1,1,0,0,2,554,0,3,1
3,10145530,2,1,1,10,4,4.0,30,30,0,...,1,3,3,1,2,4,606,0,3,2
4,10129793,2,1,0,11,4,3.10191,65,65,0,...,1,2,2,1,1,2,594,0,3,2
5,10116618,2,1,0,9,6,2.0,49,49,0,...,0,3,3,1,0,3,620,0,3,2
6,10041390,4,1,0,1,4,2.0,25,25,0,...,1,1,2,1,1,3,622,0,3,2
7,10042708,1,1,0,11,4,4.0,49,49,0,...,1,2,2,1,1,2,677,0,3,1
8,10014972,5,1,0,12,6,3.0,21,21,0,...,1,1,3,0,0,2,671,0,4,1
9,10107918,3,1,1,10,4,3.099142,75,75,0,...,0,1,1,0,2,3,602,0,4,2


## Modelling the data to predict the response using a Decision Tree classifier

### The basic steps to create a model are:
1. Data collection.
2. Data preprocessing:
    1) Data Cleansing.
    2) Data transformation (if required).
    3) Divide data into training and testing sets.
3. Build a model on training data.
4. Evaluate the model on the test data

### Steps 1 and 2 are done as shown above

In [31]:
# Shallow decision tree. class_weight='balanced' re-weights the classes
# inversely to their frequency; max_depth and min_samples_leaf cap the tree's
# size. splitter='random' picks split thresholds at random, so random_state is
# pinned to make the fitted tree and every reported metric reproducible.
clf = tree.DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=3,
    min_samples_leaf=5,
    splitter='random',
    random_state=42,
)

### Step 3
### b) Fit the model

In [32]:
# Fit the tree on the training features. ravel() flattens the single-column
# response frame into the 1-D label array.
clf=clf.fit(train_features_set,response_vector.values.ravel())

In [33]:
# Mean accuracy on the same rows the model was fitted on (training accuracy,
# not an estimate of performance on unseen data).
classfier_score=clf.score(train_features_set,response_vector.values.ravel())
classfier_score

0.80026076550079939

In [34]:
#test_features_set.sort_values('customer_ID')

In [35]:
# Predict record_type for every held-out row, then count how many rows were
# predicted as 1. The count reuses predict_purchase rather than running
# predict() over the whole test set a second time.
predict_purchase = clf.predict(test_features_set)
ctr = predict_purchase.sum()
ctr

23490

In [36]:
# Write the prediction file: customer_ID, the predicted record_type and the
# seven option columns A-G (positions 17..23 of test_df1).
# .iloc replaces the deprecated .ix indexer, which was removed in pandas 1.0.
customer_information = pd.DataFrame(columns=['customer_ID', 'record_type'])
policy_options = test_df1.iloc[:, 17:24]
customer_information['customer_ID'] = test_df1.customer_ID
# predict_purchase is positional, in the same row order as test_df1.
customer_information['record_type'] = predict_purchase.astype(int)
predicted_output = pd.concat([customer_information, policy_options], axis=1)
predicted_output.to_csv('decisiontree.csv', index=False)


In [37]:
# Write the ground-truth file in the same layout as the prediction file:
# customer_ID, the actual record_type and the option columns A-G
# (positions 17..23 of test_df1).
# .iloc replaces the deprecated .ix indexer, which was removed in pandas 1.0.
test_customer_information = pd.DataFrame(columns=['customer_ID', 'record_type'])
test_customer_information['customer_ID'] = test_df1.customer_ID
test_customer_information['record_type'] = test_df1.record_type
expected_output = pd.concat(
    [test_customer_information, test_df1.iloc[:, 17:24]], axis=1
)
expected_output.to_csv('expectedOutput1.csv', index=False)

In [38]:
# Read the saved predictions back in. Despite the name `lr`, this frame holds
# the decision-tree output written to decisiontree.csv.
lr=pd.read_csv('decisiontree.csv')
lr.head()

Unnamed: 0,customer_ID,record_type,A,B,C,D,E,F,G
0,10093608,1,1,1,2,3,1,1,1
1,10069974,0,1,0,4,2,0,2,2
2,10041975,0,0,0,1,1,0,0,2
3,10145530,0,2,1,3,3,1,2,4
4,10129793,0,1,1,2,2,1,1,2


In [39]:
# Read the ground truth back in and show the shape of the subset whose actual
# record_type is 1 (first element = number of such rows).
exp=pd.read_csv('expectedOutput1.csv')
exp.loc[exp['record_type']==1].shape

(19414, 9)

In [40]:
# Shape of the rows whose ACTUAL record_type is 0: the boolean mask is built
# from `exp` (ground truth) but applied to `lr` (predictions).
# NOTE(review): mixing the two frames looks unintentional - confirm whether
# exp.loc[...] (actual negatives) or a mask on lr (predicted negatives) was meant.
lr.loc[exp['record_type']==0].shape

(113656, 9)

In [41]:
# Ground-truth labels for the held-out rows, used by the metric cells below.
expected=test_df1.record_type


In [42]:

# Per-class precision, recall and F1 of the saved predictions against the
# ground truth.
print (classification_report(expected, lr.record_type))


             precision    recall  f1-score   support

          0       0.90      0.87      0.88    113656
          1       0.35      0.42      0.38     19414

avg / total       0.82      0.80      0.81    133070



In [43]:
# Recall with recall_score's default settings; the value matches the class-1
# row of the classification report above.
print (recall_score(expected, lr.record_type))

0.421757494592


In [44]:
# Fraction of held-out rows whose predicted record_type equals the actual one.
print (accuracy_score(expected, lr.record_type))

0.800646276396
