In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.cross_validation import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from matplotlib import pyplot as plt
import scipy

## Step 1. Data Collection

In [2]:
# Reading dataset into dta
dta = pd.read_csv('original_training_set/train_manipulated.csv')
dta.shape

(665349, 28)

## Step 2. Splitting data into training set and testing set

In [3]:
train, test = train_test_split(dta, test_size = 0.2)
train.to_csv('new_train_data.csv')
test.to_csv('new_test_data.csv')

In [4]:
dta.isnull().values.sum()

0

## Data of both training set and testing set

In [5]:
train_df1=pd.read_csv('new_train_data.csv')
train_df1.head()

Unnamed: 0.1,Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,82103,10059678,4,1,0,14:40,MO,13555,1,0,...,1,2,2,1,2,3,609,0,4,2
1,470810,10081540,3,0,2,09:55,NY,10938,1,1,...,0,3,2,0,0,2,564,0,3,1
2,465733,10079689,2,0,0,13:45,MD,10346,1,0,...,1,2,2,1,2,2,648,0,3,2
3,256367,10003257,6,1,0,14:02,ID,12272,1,0,...,0,1,2,0,3,3,586,0,4,2
4,247371,10003791,2,0,1,10:21,WY,12502,1,0,...,0,2,2,0,2,2,658,0,4,1


In [6]:
test_df1=pd.read_csv('new_test_data.csv')
test_df1.head()

Unnamed: 0.1,Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,264506,10006374,4,0,3,13:34,FL,12627,1,1,...,0,3,3,0,0,3,584,0,3,2
1,396494,10054873,2,0,3,14:14,NY,10390,1,1,...,0,3,3,0,0,2,640,0,2,2
2,75491,10114914,1,0,2,13:45,MO,13555,1,0,...,1,3,3,1,2,2,624,0,3,2
3,444482,10072048,7,0,2,14:06,FL,13711,1,1,...,1,3,2,1,1,3,650,0,3,2
4,390457,10052772,5,0,1,08:21,FL,12136,1,0,...,1,1,2,1,2,3,659,0,3,1


## Pre-processing data

### Removing the Unnamed column from the new training set

In [7]:
train_df1.shape

(532279, 29)

In [8]:
train_df1.drop(train_df1.columns[0],axis=1, inplace=True)

In [9]:
train_df1.shape

(532279, 28)

In [10]:
train_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10059678,4,1,0,14:40,MO,13555,1,0,9,...,1,2,2,1,2,3,609,0,4,2
1,10081540,3,0,2,09:55,NY,10938,1,1,11,...,0,3,2,0,0,2,564,0,3,1
2,10079689,2,0,0,13:45,MD,10346,1,0,11,...,1,2,2,1,2,2,648,0,3,2
3,10003257,6,1,0,14:02,ID,12272,1,0,7,...,0,1,2,0,3,3,586,0,4,2
4,10003791,2,0,1,10:21,WY,12502,1,0,6,...,0,2,2,0,2,2,658,0,4,1


In [11]:
test_df1.shape

(133070, 29)

In [12]:
test_df1.drop(test_df1.columns[0],axis=1, inplace=True)

In [13]:
test_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10006374,4,0,3,13:34,FL,12627,1,1,17,...,0,3,3,0,0,3,584,0,3,2
1,10054873,2,0,3,14:14,NY,10390,1,1,9,...,0,3,3,0,0,2,640,0,2,2
2,10114914,1,0,2,13:45,MO,13555,1,0,13,...,1,3,3,1,2,2,624,0,3,2
3,10072048,7,0,2,14:06,FL,13711,1,1,3,...,1,3,2,1,1,3,650,0,3,2
4,10052772,5,0,1,08:21,FL,12136,1,0,9,...,1,1,2,1,2,3,659,0,3,1


### Creating response vector and feature set

In [14]:
test_df1.isnull().values.sum()

0

In [15]:
train_df1.columns

Index(['customer_ID', 'shopping_pt', 'record_type', 'day', 'time', 'state',
       'location', 'group_size', 'homeowner', 'car_age', 'car_value',
       'risk_factor', 'age_oldest', 'age_youngest', 'married_couple',
       'C_previous', 'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'cost', 'weekend_indicator', 'accident_risk', 'time_factor'],
      dtype='object')

In [16]:
# creating intermediate dataframe which includes feature set of customer_ID and shopping_pt
train_features_df1=pd.DataFrame(np.array(train_df1[['customer_ID','shopping_pt']]) , columns=['customer_ID','shopping_pt'] )
train_features_df1.head()

Unnamed: 0,customer_ID,shopping_pt
0,10059678,4
1,10081540,3
2,10079689,2
3,10003257,6
4,10003791,2


In [17]:
# creating intermediate dataframe which includes feature set of other required columns

train_features_df2=train_df1.ix[:,7:]
# casting object type to categorical type to convert it later into numeric type
train_features_df2['car_value']=train_features_df2['car_value'].astype('category')

In [18]:
# converting categorical values to numeric values which will be used for prediction
train_features_df2['car_value']=train_features_df2['car_value'].cat.codes

In [19]:
train_features_df2['car_value'].head()

0    3
1    3
2    3
3    4
4    4
Name: car_value, dtype: int8

In [20]:
train_features_df2.head()

Unnamed: 0,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,C_previous,duration_previous,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,1,0,9,3,3.098285,22,22,0,3,0,...,1,2,2,1,2,3,609,0,4,2
1,1,1,11,3,1.0,50,50,0,3,12,...,0,3,2,0,0,2,564,0,3,1
2,1,0,11,3,4.0,63,63,0,2,5,...,1,2,2,1,2,2,648,0,3,2
3,1,0,7,4,2.0,24,24,0,3,1,...,0,1,2,0,3,3,586,0,4,2
4,1,0,6,4,3.085515,20,20,0,2,1,...,0,2,2,0,2,2,658,0,4,1


In [21]:
# creating response vector to train our model
response_vector=pd.DataFrame(np.array(train_df1['record_type']),columns=['record_type'])
#response_vector.loc[response_vector['record_type']==1]
response_vector.head(30)

Unnamed: 0,record_type
0,1
1,0
2,0
3,1
4,0
5,0
6,0
7,0
8,1
9,0


In [22]:
# creating final feature set dataframe
train_features_set=pd.concat([train_features_df1,train_features_df2],axis =1)
train_features_set.shape

(532279, 23)

In [23]:
train_features_set.head(10)

Unnamed: 0,customer_ID,shopping_pt,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10059678,4,1,0,9,3,3.098285,22,22,0,...,1,2,2,1,2,3,609,0,4,2
1,10081540,3,1,1,11,3,1.0,50,50,0,...,0,3,2,0,0,2,564,0,3,1
2,10079689,2,1,0,11,3,4.0,63,63,0,...,1,2,2,1,2,2,648,0,3,2
3,10003257,6,1,0,7,4,2.0,24,24,0,...,0,1,2,0,3,3,586,0,4,2
4,10003791,2,1,0,6,4,3.085515,20,20,0,...,0,2,2,0,2,2,658,0,4,1
5,10049833,6,2,1,11,5,3.002057,63,60,1,...,0,3,3,0,2,1,564,0,2,1
6,10029327,5,1,0,24,3,3.061414,22,22,0,...,0,1,2,0,0,1,587,0,4,1
7,10100240,6,2,1,13,6,1.0,51,50,1,...,0,4,3,0,0,3,578,0,2,2
8,10040061,6,1,1,8,4,1.0,68,68,0,...,1,1,3,0,2,3,569,0,4,3
9,10118715,6,1,0,11,6,3.07013,22,22,0,...,0,3,2,0,1,1,622,0,4,2


In [24]:
#Creating intermediate test features
test_features_df1=pd.DataFrame(np.array(test_df1[['customer_ID','shopping_pt']]) , columns=['customer_ID','shopping_pt'] )
test_features_df1.head()

Unnamed: 0,customer_ID,shopping_pt
0,10006374,4
1,10054873,2
2,10114914,1
3,10072048,7
4,10052772,5


In [25]:
test_df1.columns

Index(['customer_ID', 'shopping_pt', 'record_type', 'day', 'time', 'state',
       'location', 'group_size', 'homeowner', 'car_age', 'car_value',
       'risk_factor', 'age_oldest', 'age_youngest', 'married_couple',
       'C_previous', 'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'cost', 'weekend_indicator', 'accident_risk', 'time_factor'],
      dtype='object')

In [26]:
# creating intermediate dataframe which includes feature set of other required columns
test_features_df2=test_df1.ix[:,7:]
# casting object type to categorical type to convert it later into numeric type
test_features_df2['car_value']=test_features_df2['car_value'].astype('category')

In [27]:
#converting car value from categorical to numeric
test_features_df2['car_value']=test_features_df2['car_value'].cat.codes

In [28]:
# doing the same procedure as above for the test set
test_features_set=pd.concat([test_features_df1,test_features_df2],axis=1)
test_features_set.shape

(133070, 23)

In [29]:
test_features_set.head(15)

Unnamed: 0,customer_ID,shopping_pt,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10006374,4,1,1,17,3,2.0,61,61,0,...,0,3,3,0,0,3,584,0,3,2
1,10054873,2,1,1,9,6,1.0,63,60,1,...,0,3,3,0,0,2,640,0,2,2
2,10114914,1,1,0,13,4,3.098285,26,26,0,...,1,3,3,1,2,2,624,0,3,2
3,10072048,7,1,1,3,5,2.0,63,63,0,...,1,3,2,1,1,3,650,0,3,2
4,10052772,5,1,0,9,3,4.0,31,31,0,...,1,1,2,1,2,3,659,0,3,1
5,10073877,4,1,1,8,5,3.0,33,33,0,...,1,1,3,1,2,3,657,0,3,2
6,10127467,7,1,1,18,4,3.0,72,72,0,...,1,1,3,0,2,3,588,0,4,1
7,10011154,1,1,1,10,5,1.0,32,32,0,...,0,1,2,1,2,3,611,0,3,2
8,10047724,5,1,0,9,4,3.07013,41,41,0,...,0,3,3,1,1,2,610,0,3,1
9,10092621,5,1,0,11,6,4.0,53,53,0,...,1,2,2,1,2,2,704,0,3,2


## Modelling the data to predict the response using Logistic Regression

### Basic Steps to create a model is,
1. Data collection.
2. Data preprocessing:
    1) Data Cleansing.
    2) Data transformation (if required).
    3) Divide data into training and testing sets.
3. Build a model on training data.
4. Evaluate the model on the test data

### Step 1 and  Step 2 are done as shown above

In [30]:
Logistic_Regression_classfifer=LogisticRegression(class_weight=balanced)

### Step 3
### b) Fit the model

In [31]:
Logistic_Regression_classfifer.fit(train_features_set,response_vector.values.ravel())

LogisticRegression(C=1.0, class_weight={0: 7, 1: 1}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [32]:
classfier_score=Logistic_Regression_classfifer.score(train_features_set,response_vector.values.ravel())
classfier_score

0.85423809693788411

In [33]:
#test_features_set.sort_values('customer_ID')

In [34]:
predict_purchase=Logistic_Regression_classfifer.predict(test_features_set)
ctr=Logistic_Regression_classfifer.predict(test_features_set).sum()
ctr

0

In [35]:
customer_information = pd.DataFrame(columns=['customer_ID', 'record_type'])
policy_options=test_df1.ix[:,17:24]
customer_information['customer_ID'] = test_df1.customer_ID
customer_information['record_type'] = predict_purchase.astype(int)
predicted_output=pd.concat([customer_information,policy_options],axis=1)
predicted_output.to_csv('logisticregression.csv',index=False)


In [36]:
test_customer_information=pd.DataFrame(columns=['customer_ID', 'record_type'])
test_customer_information['customer_ID'] = test_df1.customer_ID
test_customer_information['record_type']=test_df1.record_type
expected_output=pd.concat([test_customer_information,test_df1.ix[:,17:24]], axis=1)
expected_output.to_csv('expectedOutput1.csv', index=False)

In [37]:
lr=pd.read_csv('logisticregression.csv')
lr.head()

Unnamed: 0,customer_ID,record_type,A,B,C,D,E,F,G
0,10006374,0,0,0,3,3,0,0,3
1,10054873,0,0,0,3,3,0,0,2
2,10114914,0,1,1,3,3,1,2,2
3,10072048,0,1,1,3,2,1,1,3
4,10052772,0,1,1,1,2,1,2,3


In [38]:
exp=pd.read_csv('expectedOutput1.csv')
exp.loc[exp['record_type']==1].shape

(19423, 9)

In [39]:
lr.loc[exp['record_type']==0].shape

(113647, 9)

In [40]:
expected=test_df1.record_type


In [41]:
print(scipy.stats.pearsonr(dta.risk_factor,dta.record_type))

(-9.5470741647539611e-05, 0.937928031340581)


In [42]:
print(scipy.stats.pearsonr(dta.married_couple,dta.record_type))

(0.0019397044832233314, 0.11360513737685342)


In [43]:
print(scipy.stats.pearsonr(dta.homeowner,dta.record_type))

(0.0076978090904136912, 3.405350765789396e-10)


In [44]:
print(scipy.stats.pearsonr(dta.homeowner,dta.risk_factor))

(-0.17669409509101919, 0.0)


In [45]:
print(recall_score(expected,lr.record_type))

0.0


In [46]:

print (classification_report(expected, lr.record_type))


             precision    recall  f1-score   support

          0       0.85      1.00      0.92    113647
          1       0.00      0.00      0.00     19423

avg / total       0.73      0.85      0.79    133070



  'precision', 'predicted', average, warn_for)


In [47]:
print(accuracy_score(expected,lr.record_type))

0.854039227474


In [48]:
print (roc_curve(expected, lr.record_type,pos_label=1))

(array([ 0.,  1.]), array([ 0.,  1.]), array([1, 0], dtype=int64))
