In [1]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score

try:
    # scikit-learn >= 0.18 ships the splitter here; sklearn.cross_validation was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

## Step 1. Data Collection

In [2]:
# Read the raw training data into `dta`; the path is relative to the notebook's
# working directory.
dta = pd.read_csv('original_training_set/train.csv')

## Step 2. Splitting data into training set and testing set

In [3]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split,
# and therefore every number reported further down, reproducible on re-run.
# NOTE(review): rows are split independently of each other, so rows that share a
# customer_ID may end up in both sets - confirm this is acceptable for evaluation.
train, test = train_test_split(dta, test_size=0.2, random_state=42)
# to_csv() writes the row index as an extra leading column; the cells that reload
# these files drop that column again.
train.to_csv('new_train_data.csv')
test.to_csv('new_test_data.csv')

In [4]:
# Total number of missing cells in the raw data: per-column counts, then summed.
dta.isnull().sum().sum()

0

## Data of both training set and testing set

In [5]:
# Reload the training split from the CSV written above and preview the first rows.
# The saved row index comes back as an extra first column.
train_df1=pd.read_csv('new_train_data.csv')
train_df1.head()

Unnamed: 0.1,Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,...,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,175434,10039973,1,0,2,09:39,TN,14585,1,1,...,2,1,1,1,3,2,1,2,2,601
1,281914,10064325,5,0,5,10:46,OH,12026,1,0,...,3,1,1,1,3,3,0,2,3,682
2,157390,10035865,2,0,4,15:16,OH,12605,1,0,...,3,4,1,1,2,3,1,1,2,624
3,118603,10026422,1,0,4,10:32,NH,11453,1,1,...,3,12,1,0,3,3,0,1,1,608
4,148576,10033824,6,0,0,13:46,NY,10703,2,1,...,3,2,1,1,3,3,1,0,1,729


In [6]:
# Reload the testing split from the CSV written above and preview the first rows.
# The saved row index comes back as an extra first column.
test_df1=pd.read_csv('new_test_data.csv')
test_df1.head()

Unnamed: 0.1,Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,...,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,79512,10018317,1,0,3,09:28,NY,13605,1,1,...,3,4,1,0,3,3,0,0,3,614
1,86142,10017241,4,0,1,14:14,CO,11039,1,0,...,4,2,1,1,3,3,0,1,2,594
2,90733,10019880,10,1,2,12:48,GA,12225,1,0,...,1,3,1,1,2,3,1,2,4,701
3,248394,10054702,4,0,2,15:08,KY,13404,1,1,...,2,6,2,1,2,2,1,2,1,680
4,335689,10076449,4,0,4,10:36,OH,14989,1,1,...,3,2,1,0,3,3,0,1,3,616


## Pre-processing data

### Removing the Unnamed column from the new training and testing sets

In [7]:
# (rows, columns) of the reloaded training frame, including the extra index column.
train_df1.shape

(532199, 26)

In [8]:
# Remove the row-index column that to_csv() wrote and read_csv() loaded back as
# 'Unnamed: 0'. Dropping by name and rebinding (rather than dropping "column 0"
# in place) keeps this cell idempotent: re-running it can no longer remove
# customer_ID by accident.
train_df1 = train_df1.drop('Unnamed: 0', axis=1, errors='ignore')

In [9]:
# Shape check after the drop: the column count should be one lower than before.
train_df1.shape

(532199, 25)

In [10]:
# Preview the training frame; customer_ID should now be the first column.
train_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,10039973,1,0,2,09:39,TN,14585,1,1,14,...,2,1,1,1,3,2,1,2,2,601
1,10064325,5,0,5,10:46,OH,12026,1,0,1,...,3,1,1,1,3,3,0,2,3,682
2,10035865,2,0,4,15:16,OH,12605,1,0,7,...,3,4,1,1,2,3,1,1,2,624
3,10026422,1,0,4,10:32,NH,11453,1,1,6,...,3,12,1,0,3,3,0,1,1,608
4,10033824,6,0,0,13:46,NY,10703,2,1,12,...,3,2,1,1,3,3,1,0,1,729


In [11]:
# (rows, columns) of the reloaded testing frame, including the extra index column.
test_df1.shape

(133050, 26)

In [12]:
# Remove the row-index column that to_csv() wrote and read_csv() loaded back as
# 'Unnamed: 0'. Dropping by name and rebinding (rather than dropping "column 0"
# in place) keeps this cell idempotent: re-running it can no longer remove
# customer_ID by accident.
test_df1 = test_df1.drop('Unnamed: 0', axis=1, errors='ignore')

In [13]:
# Preview the testing frame; customer_ID should now be the first column.
test_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,10018317,1,0,3,09:28,NY,13605,1,1,6,...,3,4,1,0,3,3,0,0,3,614
1,10017241,4,0,1,14:14,CO,11039,1,0,12,...,4,2,1,1,3,3,0,1,2,594
2,10019880,10,1,2,12:48,GA,12225,1,0,6,...,1,3,1,1,2,3,1,2,4,701
3,10054702,4,0,2,15:08,KY,13404,1,1,14,...,2,6,2,1,2,2,1,2,1,680
4,10076449,4,0,4,10:36,OH,14989,1,1,1,...,3,2,1,0,3,3,0,1,3,616


### Creating response vector and feature set

In [14]:
# Total number of missing cells in the testing frame: per-column counts, then summed.
test_df1.isnull().sum().sum()

0

In [15]:
# List the column labels; the positional slices used further down depend on this order.
train_df1.columns

Index(['customer_ID', 'shopping_pt', 'record_type', 'day', 'time', 'state',
       'location', 'group_size', 'homeowner', 'car_age', 'car_value',
       'risk_factor', 'age_oldest', 'age_youngest', 'married_couple',
       'C_previous', 'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'cost'],
      dtype='object')

In [16]:
# First part of the feature set: customer_ID and shopping_pt, rebuilt from the raw
# values so the frame gets a fresh default index.
id_columns = ['customer_ID', 'shopping_pt']
train_features_df1 = pd.DataFrame(train_df1[id_columns].values, columns=id_columns)
train_features_df1.head()

Unnamed: 0,customer_ID,shopping_pt
0,10039973,1
1,10064325,5
2,10035865,2
3,10026422,1
4,10033824,6


In [17]:
# creating intermediate dataframe which includes feature set of other required columns
# (every column from position 7, i.e. 'group_size', onwards).
# .ix was deprecated in pandas 0.20 and removed in 1.0; .iloc is the positional
# equivalent. .copy() makes the slice an independent frame, so the assignment below
# does not raise SettingWithCopyWarning and cannot write through to train_df1.
train_features_df2 = train_df1.iloc[:, 7:].copy()
# casting object type to categorical type to convert it later into numeric type
train_features_df2['car_value'] = train_features_df2['car_value'].astype('category')

In [18]:
# converting categorical values to numeric values which will be used for prediction
# (each category is replaced by its integer code).
# NOTE(review): after this assignment the column is no longer categorical, so
# re-running this cell on its own is expected to fail on `.cat` - re-run the
# previous cell first.
train_features_df2['car_value']=train_features_df2['car_value'].cat.codes

In [19]:
# Check the encoded car_value column: it should now hold integer codes.
train_features_df2['car_value'].head()

0    5
1    5
2    4
3    6
4    4
Name: car_value, dtype: int8

In [20]:
# Preview the second part of the training feature set.
train_features_df2.head()

Unnamed: 0,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,1,1,14,5,3,46,46,0,2,1,1,1,3,2,1,2,2,601
1,1,0,1,5,1,21,21,0,3,1,1,1,3,3,0,2,3,682
2,1,0,7,4,1,29,29,0,3,4,1,1,2,3,1,1,2,624
3,1,1,6,6,1,28,28,0,3,12,1,0,3,3,0,1,1,608
4,2,1,12,4,3,75,65,1,3,2,1,1,3,3,1,0,1,729


In [21]:
# Target used to train the model: record_type as a one-column frame with a fresh
# default index.
response_vector = pd.DataFrame({'record_type': train_df1['record_type'].values})
response_vector.head(30)

Unnamed: 0,record_type
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [22]:
# creating final feature set dataframe by placing the two intermediate frames
# side by side (axis=1). concat aligns rows on the index, so both parts must
# carry the same row index for the rows to line up.
train_features_set=pd.concat([train_features_df1,train_features_df2],axis =1)
train_features_set.shape

(532199, 20)

In [23]:
# Preview the first ten rows of the assembled training feature set.
train_features_set.head(10)

Unnamed: 0,customer_ID,shopping_pt,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,10039973,1,1,1,14,5,3,46,46,0,2,1,1,1,3,2,1,2,2,601
1,10064325,5,1,0,1,5,1,21,21,0,3,1,1,1,3,3,0,2,3,682
2,10035865,2,1,0,7,4,1,29,29,0,3,4,1,1,2,3,1,1,2,624
3,10026422,1,1,1,6,6,1,28,28,0,3,12,1,0,3,3,0,1,1,608
4,10033824,6,2,1,12,4,3,75,65,1,3,2,1,1,3,3,1,0,1,729
5,10073184,5,1,1,4,4,1,75,75,0,3,5,1,1,3,3,1,0,2,638
6,10017559,3,1,1,2,4,2,45,45,0,1,15,1,0,1,3,1,2,3,630
7,10137073,4,1,0,8,7,1,40,40,0,1,0,1,1,1,2,1,2,1,723
8,10013720,2,1,0,6,7,2,49,49,0,1,12,1,1,1,2,1,2,1,653
9,10138340,4,1,1,3,3,2,24,24,0,4,9,1,1,3,3,1,2,1,647


In [24]:
# First part of the test feature set: customer_ID and shopping_pt, rebuilt from the
# raw values so the frame gets a fresh default index.
id_columns = ['customer_ID', 'shopping_pt']
test_features_df1 = pd.DataFrame(test_df1[id_columns].values, columns=id_columns)
test_features_df1.head()

Unnamed: 0,customer_ID,shopping_pt
0,10018317,1
1,10017241,4
2,10019880,10
3,10054702,4
4,10076449,4


In [25]:
# List the test column labels; they should match the training frame's columns and order.
test_df1.columns

Index(['customer_ID', 'shopping_pt', 'record_type', 'day', 'time', 'state',
       'location', 'group_size', 'homeowner', 'car_age', 'car_value',
       'risk_factor', 'age_oldest', 'age_youngest', 'married_couple',
       'C_previous', 'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'cost'],
      dtype='object')

In [26]:
# creating intermediate dataframe which includes feature set of other required columns
# (every column from position 7, i.e. 'group_size', onwards).
# .ix was removed in pandas 1.0; .iloc is the positional equivalent, and .copy()
# avoids SettingWithCopyWarning on the assignment below.
test_features_df2 = test_df1.iloc[:, 7:].copy()
# casting object type to categorical type to convert it later into numeric type.
# The category levels are taken from the training data so that a given car_value
# maps to the same integer code in both sets; inferring them from the test rows
# alone would shift the codes whenever a level is absent from the test split.
# A value never seen in training becomes missing (code -1 after .cat.codes).
car_value_levels = sorted(train_df1['car_value'].dropna().unique())
test_features_df2['car_value'] = pd.Categorical(
    test_features_df2['car_value'], categories=car_value_levels
)

In [27]:
# converting car value from categorical to numeric (each category becomes its
# integer code).
# NOTE(review): the codes are only comparable with the training codes if both
# columns share the same category levels - confirm.
test_features_df2['car_value']=test_features_df2['car_value'].cat.codes

In [28]:
# doing the same procedure as above for the test set: place the two intermediate
# frames side by side (axis=1); concat aligns rows on the index.
test_features_set=pd.concat([test_features_df1,test_features_df2],axis=1)
test_features_set.shape

(133050, 20)

In [29]:
# Preview the first fifteen rows of the assembled test feature set.
test_features_set.head(15)

Unnamed: 0,customer_ID,shopping_pt,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,C_previous,duration_previous,A,B,C,D,E,F,G,cost
0,10018317,1,1,1,6,4,2,65,65,0,3,4,1,0,3,3,0,0,3,614
1,10017241,4,1,0,12,5,4,33,33,0,4,2,1,1,3,3,0,1,2,594
2,10019880,10,1,0,6,5,4,28,28,0,1,3,1,1,2,3,1,2,4,701
3,10054702,4,1,1,14,4,1,50,50,0,2,6,2,1,2,2,1,2,1,680
4,10076449,4,1,1,1,4,4,43,43,0,3,2,1,0,3,3,0,1,3,616
5,10087584,4,1,1,0,5,3,58,58,0,1,1,1,1,1,2,1,2,2,706
6,10042931,3,2,1,16,5,1,68,66,1,3,15,1,1,3,3,0,2,2,594
7,10129609,2,2,1,10,5,1,50,48,1,3,9,1,0,3,3,1,1,3,596
8,10107787,1,1,1,1,4,1,58,58,0,1,4,1,1,2,3,1,2,3,600
9,10140600,8,1,0,14,5,1,38,38,0,4,1,0,1,3,3,0,0,1,559


## Modelling the data to predict the response using Logistic Regression

### The basic steps to create a model are:
1. Data collection.
2. Data preprocessing:
    1) Data Cleansing.
    2) Data transformation (if required).
    3) Divide data into training and testing sets.
3. Build a model on training data.
4. Evaluate the model on the test data

### Steps 1 and 2 are done as shown above

In [30]:
# Build the (unfitted) logistic-regression classifier.
# C=1 is the inverse regularisation strength; class_weight='balanced' reweights
# the classes inversely to their frequency in the training labels.
Logistic_Regression_classfifer=LogisticRegression(C=1,class_weight='balanced')

### Step 3
### b) Fit the model

In [31]:
# Fit the classifier on the training features; .values.ravel() flattens the
# one-column response frame into the 1-D label array that fit() expects.
# NOTE(review): train_features_set includes customer_ID, an identifier - confirm
# it is really meant to be used as a predictor.
Logistic_Regression_classfifer.fit(train_features_set,response_vector.values.ravel())

LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [32]:
# Mean accuracy of the fitted model on the same data it was trained on. This is a
# training score, not an estimate of performance on unseen data.
classfier_score=Logistic_Regression_classfifer.score(train_features_set,response_vector.values.ravel())
classfier_score

0.76296460534499311

In [33]:
#test_features_set.sort_values('customer_ID')

In [34]:
# Predict record_type for every row of the held-out test feature set.
predict_purchase=Logistic_Regression_classfifer.predict(test_features_set)

In [35]:
# Assemble and save the prediction file: customer id, predicted record_type and the
# policy options A..G of each test row.
# The option columns are selected by name: the original positional slice
# (.ix[:, 17:24]) relied on the .ix indexer, which was removed in pandas 1.0, and
# on the exact column order of test_df1.
policy_options = test_df1[['A', 'B', 'C', 'D', 'E', 'F', 'G']]
customer_information = pd.DataFrame({'customer_ID': test_df1['customer_ID']})
# predict_purchase is positional, one entry per row of test_df1.
customer_information['record_type'] = predict_purchase.astype(int)
predicted_output = pd.concat([customer_information, policy_options], axis=1)
predicted_output.to_csv('logisticregression.csv', index=False)


In [36]:
# Assemble and save the ground-truth file in the same layout as the prediction
# file: customer id, actual record_type and the policy options A..G.
# Columns are selected by name instead of the removed .ix positional slice.
test_customer_information = test_df1[['customer_ID', 'record_type']].copy()
expected_output = pd.concat(
    [test_customer_information, test_df1[['A', 'B', 'C', 'D', 'E', 'F', 'G']]],
    axis=1,
)
expected_output.to_csv('expectedOutput1.csv', index=False)

In [37]:
# Reload the saved predictions and preview the first rows.
lr=pd.read_csv('logisticregression.csv')
lr.head()

Unnamed: 0,customer_ID,record_type,A,B,C,D,E,F,G
0,10018317,0,1,0,3,3,0,0,3
1,10017241,0,1,1,3,3,0,1,2
2,10019880,1,1,1,2,3,1,2,4
3,10054702,0,2,1,2,2,1,2,1
4,10076449,0,1,0,3,3,0,1,3


In [38]:
# Reload the saved ground truth; the row count of the shape below is the number of
# test rows whose actual record_type is 1.
exp=pd.read_csv('expectedOutput1.csv')
exp.loc[exp['record_type']==1].shape

(19229, 9)

In [39]:
# Number of test rows whose actual record_type is 0 (counterpart of the count of
# actual 1s). The filter is applied to `exp` itself; the original applied exp's
# mask to the prediction frame `lr`, which only gave the same count because both
# frames have identical row order and width.
exp.loc[exp['record_type']==0].shape

(113821, 9)

In [40]:
# Actual record_type labels of the test rows, used as ground truth for the report.
expected=test_df1.record_type


In [41]:

# Per-class precision / recall / F1 of the reloaded predictions against the actual
# test labels.
report = classification_report(y_true=expected, y_pred=lr['record_type'])
print(report)


             precision    recall  f1-score   support

          0       0.95      0.76      0.84    113821
          1       0.35      0.76      0.48     19229

avg / total       0.86      0.76      0.79    133050

