In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from matplotlib import pyplot as plt
import scipy

## Step 1. Data Collection

In [2]:
# Load the balanced training data; every later step derives from `dta`.
dta = pd.read_csv(
    'original_training_set/train_balanced.csv'
)

## Step 2. Splitting data into training set and testing set

In [3]:
# Hold out 20% of the rows for evaluation. The fixed seed keeps the split (and
# therefore every metric computed further down) reproducible across kernel restarts.
train, test = train_test_split(dta, test_size=0.2, random_state=42)

# Persist both splits. The DataFrame index is written out as an unnamed first
# column, which is removed again after the files are reloaded.
train.to_csv('new_train_data.csv')
test.to_csv('new_test_data.csv')

In [4]:
# Total number of missing cells in the whole dataset (per-column counts, then summed).
dta.isnull().sum().sum()

0

## Data of both training set and testing set

In [5]:
# Reload the persisted training split from disk and preview its first five rows.
train_df1 = pd.read_csv('new_train_data.csv')
train_df1.head(5)

Unnamed: 0.1,Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,270618,10062352,1,0,4,12:59,FL,13859,1,0,...,0,3,1,0,3,3,706,0,4,2
1,101752,10023584,2,0,0,10:01,FL,11824,1,1,...,0,2,3,1,1,3,611,0,3,1
2,541234,10124398,5,0,3,11:04,NY,12476,1,1,...,1,3,3,1,0,2,710,0,4,1
3,509016,10116964,1,0,0,14:06,MS,14593,1,0,...,0,1,1,0,0,4,639,0,4,2
4,355860,10081823,1,0,1,11:32,NY,14715,1,0,...,1,3,3,1,0,2,694,0,3,1


In [6]:
# Reload the persisted test split from disk and preview its first five rows.
test_df1 = pd.read_csv('new_test_data.csv')
test_df1.head(5)

Unnamed: 0.1,Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,82659,10019166,2,0,0,09:37,OH,10450,1,0,...,1,1,1,0,0,3,559,0,4,1
1,608490,10139813,5,0,3,16:07,MO,12103,2,1,...,1,3,3,0,2,4,607,0,2,3
2,72046,10016685,4,0,2,11:30,FL,12632,1,1,...,0,2,3,1,1,3,650,0,4,1
3,623083,10143222,3,0,1,15:52,MD,14848,1,1,...,0,3,2,0,0,2,628,0,3,2
4,227298,10052574,5,0,2,15:03,FL,13929,1,1,...,1,3,3,1,2,3,643,0,3,2


## Pre-processing data

### Removing the Unnamed column from the new training and testing sets

In [7]:
# Shape of the reloaded training frame before the leading index column is dropped.
train_df1.shape

(532279, 29)

In [8]:
# Remove the index column that to_csv wrote out (read back as 'Unnamed: 0').
# Dropping by name with errors='ignore' keeps this cell idempotent: re-running it
# is a no-op instead of silently removing the next column (customer_ID).
train_df1.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')

In [9]:
# Shape after the drop; expected to have one column fewer than before.
train_df1.shape

(532279, 28)

In [10]:
# Preview the training frame now that the leading index column is gone.
train_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10062352,1,0,4,12:59,FL,13859,1,0,1,...,0,3,1,0,3,3,706,0,4,2
1,10023584,2,0,0,10:01,FL,11824,1,1,6,...,0,2,3,1,1,3,611,0,3,1
2,10124398,5,0,3,11:04,NY,12476,1,1,8,...,1,3,3,1,0,2,710,0,4,1
3,10116964,1,0,0,14:06,MS,14593,1,0,1,...,0,1,1,0,0,4,639,0,4,2
4,10081823,1,0,1,11:32,NY,14715,1,0,14,...,1,3,3,1,0,2,694,0,3,1


In [11]:
# Shape of the reloaded test frame before the leading index column is dropped.
test_df1.shape

(133070, 29)

In [12]:
# Remove the index column that to_csv wrote out (read back as 'Unnamed: 0').
# Dropping by name with errors='ignore' keeps this cell idempotent: re-running it
# is a no-op instead of silently removing the next column (customer_ID).
test_df1.drop('Unnamed: 0', axis=1, inplace=True, errors='ignore')

In [13]:
# Preview the test frame now that the leading index column is gone.
test_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10019166,2,0,0,09:37,OH,10450,1,0,18,...,1,1,1,0,0,3,559,0,4,1
1,10139813,5,0,3,16:07,MO,12103,2,1,5,...,1,3,3,0,2,4,607,0,2,3
2,10016685,4,0,2,11:30,FL,12632,1,1,11,...,0,2,3,1,1,3,650,0,4,1
3,10143222,3,0,1,15:52,MD,14848,1,1,16,...,0,3,2,0,0,2,628,0,3,2
4,10052574,5,0,2,15:03,FL,13929,1,1,7,...,1,3,3,1,2,3,643,0,3,2


### Creating response vector and feature set

In [14]:
# Total number of missing cells in the test frame (per-column counts, then summed).
test_df1.isnull().sum().sum()

0

In [15]:
# List the column names; the positional column slices used further down depend on this order.
train_df1.columns

Index(['customer_ID', 'shopping_pt', 'record_type', 'day', 'time', 'state',
       'location', 'group_size', 'homeowner', 'car_age', 'car_value',
       'risk_factor', 'age_oldest', 'age_youngest', 'married_couple',
       'C_previous', 'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'cost', 'weekend_indicator', 'accident_risk', 'time_factor'],
      dtype='object')

In [16]:
# Identifier features (customer_ID and shopping_pt), rebuilt from the raw values
# so the resulting frame carries a fresh default index.
train_features_df1 = pd.DataFrame(
    data=train_df1[['customer_ID', 'shopping_pt']].values,
    columns=['customer_ID', 'shopping_pt'],
)
train_features_df1.head()

Unnamed: 0,customer_ID,shopping_pt
0,10062352,1
1,10023584,2
2,10124398,5
3,10116964,1
4,10081823,1


In [17]:
# creating intermediate dataframe which includes feature set of other required columns
# (every column from position 7, 'group_size', onwards).
# `.ix` has been removed from pandas; `.iloc` is the positional equivalent. The explicit
# copy makes the column assignment below act on an independent frame instead of a
# slice of train_df1, which avoids SettingWithCopyWarning.
train_features_df2 = train_df1.iloc[:, 7:].copy()
# casting object type to categorical type to convert it later into numeric type
train_features_df2['car_value'] = train_features_df2['car_value'].astype('category')

In [18]:
# converting categorical values to numeric values which will be used for prediction
# NOTE: this cell is not idempotent. Once it has run, car_value holds integer codes
# rather than a categorical, so running it a second time fails on the `.cat` accessor.
train_features_df2['car_value']=train_features_df2['car_value'].cat.codes

In [19]:
# Check the encoded car_value column: it should now contain small integer codes.
train_features_df2['car_value'].head()

0    4
1    4
2    5
3    5
4    5
Name: car_value, dtype: int8

In [20]:
# Preview the remaining feature columns after encoding car_value.
train_features_df2.head()

Unnamed: 0,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,C_previous,duration_previous,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,1,0,1,4,3.0,23,23,0,3,1,...,0,3,1,0,3,3,706,0,4,2
1,1,1,6,4,1.0,55,55,0,2,4,...,0,2,3,1,1,3,611,0,3,1
2,1,1,8,5,1.0,66,66,0,3,4,...,1,3,3,1,0,2,710,0,4,1
3,1,0,1,5,3.098648,75,75,0,1,0,...,0,1,1,0,0,4,639,0,4,2
4,1,0,14,5,4.0,65,65,0,2,1,...,1,3,3,1,0,2,694,0,3,1


In [21]:
# Response vector used to train the model: record_type as a one-column frame
# rebuilt from the raw values (fresh default index).
response_vector = pd.DataFrame(
    data=train_df1['record_type'].values,
    columns=['record_type'],
)
#response_vector.loc[response_vector['record_type']==1]
response_vector.head(30)

Unnamed: 0,record_type
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [22]:
# Final training feature set: the identifier columns placed side by side with
# the remaining feature columns.
train_features_set = pd.concat(
    [train_features_df1, train_features_df2],
    axis=1,
)
train_features_set.shape

(532279, 23)

In [23]:
# Preview the first ten rows of the assembled training feature set.
train_features_set.head(10)

Unnamed: 0,customer_ID,shopping_pt,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10062352,1,1,0,1,4,3.0,23,23,0,...,0,3,1,0,3,3,706,0,4,2
1,10023584,2,1,1,6,4,1.0,55,55,0,...,0,2,3,1,1,3,611,0,3,1
2,10124398,5,1,1,8,5,1.0,66,66,0,...,1,3,3,1,0,2,710,0,4,1
3,10116964,1,1,0,1,5,3.098648,75,75,0,...,0,1,1,0,0,4,639,0,4,2
4,10081823,1,1,0,14,5,4.0,65,65,0,...,1,3,3,1,0,2,694,0,3,1
5,10061912,1,1,0,2,5,1.0,73,73,0,...,0,3,2,1,2,1,624,0,4,1
6,10068528,3,1,0,7,3,1.0,43,43,0,...,1,2,3,1,1,3,619,0,3,3
7,10053432,8,1,1,11,4,4.0,42,42,0,...,0,1,2,0,0,2,568,0,3,3
8,10098220,5,1,0,7,4,4.0,60,60,0,...,1,3,2,0,1,2,618,0,3,3
9,10067109,3,1,1,14,6,3.07013,36,36,0,...,0,1,3,1,2,1,692,0,3,1


In [24]:
# Identifier features for the test split (customer_ID and shopping_pt), rebuilt
# from the raw values so the resulting frame carries a fresh default index.
test_features_df1 = pd.DataFrame(
    data=test_df1[['customer_ID', 'shopping_pt']].values,
    columns=['customer_ID', 'shopping_pt'],
)
test_features_df1.head()

Unnamed: 0,customer_ID,shopping_pt
0,10019166,2
1,10139813,5
2,10016685,4
3,10143222,3
4,10052574,5


In [25]:
# List the test column names; the positional column slices used further down depend on this order.
test_df1.columns

Index(['customer_ID', 'shopping_pt', 'record_type', 'day', 'time', 'state',
       'location', 'group_size', 'homeowner', 'car_age', 'car_value',
       'risk_factor', 'age_oldest', 'age_youngest', 'married_couple',
       'C_previous', 'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'cost', 'weekend_indicator', 'accident_risk', 'time_factor'],
      dtype='object')

In [26]:
# creating intermediate dataframe which includes feature set of other required columns
# (every column from position 7, 'group_size', onwards).
# `.ix` has been removed from pandas; `.iloc` is the positional equivalent, and the
# explicit copy avoids SettingWithCopyWarning on the assignment below.
test_features_df2 = test_df1.iloc[:, 7:].copy()
# casting object type to categorical type to convert it later into numeric type.
# The category set is taken from the training data so that the integer codes derived
# from it mean the same thing in both splits; a car_value never seen in training
# would be encoded as -1 instead of shifting the other codes.
test_features_df2['car_value'] = pd.Categorical(
    test_features_df2['car_value'],
    categories=train_df1['car_value'].astype('category').cat.categories,
)

In [27]:
#converting car value from categorical to numeric
# NOTE: this cell is not idempotent. Once it has run, car_value holds integer codes
# rather than a categorical, so running it a second time fails on the `.cat` accessor.
test_features_df2['car_value']=test_features_df2['car_value'].cat.codes

In [28]:
# doing the same procedure as above for the test set
test_features_set=pd.concat([test_features_df1,test_features_df2],axis=1)
test_features_set.shape

(133070, 23)

In [29]:
# Preview the first fifteen rows of the assembled test feature set.
test_features_set.head(15)

Unnamed: 0,customer_ID,shopping_pt,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10019166,2,1,0,18,3,3.099142,69,69,0,...,1,1,1,0,0,3,559,0,4,1
1,10139813,5,2,1,5,7,3.0,37,35,1,...,1,3,3,0,2,4,607,0,2,3
2,10016685,4,1,1,11,4,2.0,71,71,0,...,0,2,3,1,1,3,650,0,4,1
3,10143222,3,1,1,16,5,2.0,27,27,0,...,0,3,2,0,0,2,628,0,3,2
4,10052574,5,1,1,7,5,1.0,48,48,0,...,1,3,3,1,2,3,643,0,3,2
5,10091113,5,1,0,3,6,1.0,55,55,0,...,0,2,3,1,0,3,666,0,3,2
6,10053447,4,1,0,5,3,1.0,30,30,0,...,1,2,3,0,2,4,645,0,3,1
7,10014135,9,1,1,11,5,4.0,34,34,0,...,1,1,2,1,0,1,720,0,3,2
8,10089316,2,1,0,10,5,3.111921,32,32,0,...,1,1,3,0,0,2,648,0,3,3
9,10083536,6,2,1,2,6,3.0,54,50,1,...,0,4,3,1,1,4,611,0,2,1


## Modelling the data to predict the response using a linear classifier (RidgeClassifier)

### The basic steps to create a model are:
1. Data collection.
2. Data preprocessing:
    1) Data Cleansing.
    2) Data transformation (if required).
    3) Divide data into training and testing sets.
3. Build a model on training data.
4. Evaluate the model on the test data

### Step 1 and Step 2 are done as shown above

In [30]:
# Ground-truth labels for the hold-out split. They are defined here because this is
# the first cell that needs them; without this the report below raises NameError
# when the notebook is run top to bottom on a fresh kernel.
expected = test_df1.record_type

# Default-solver classifier. It is kept in `clf` for the cells that follow, so the
# comparison loop below uses its own variable and does not overwrite it.
# `normalize` takes a boolean; the string 'True' only worked because it is truthy.
clf = RidgeClassifier(class_weight='balanced', normalize=True)

# Fit one classifier per solver and print its hold-out classification report.
for solver_name in ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag']:
    solver_clf = RidgeClassifier(class_weight='balanced', normalize=True, solver=solver_name)
    solver_clf.fit(train_features_set, response_vector.values.ravel())
    solver_predictions = solver_clf.predict(test_features_set)
    print(classification_report(expected, solver_predictions))
    

### Step 3
### b) Fit the model

In [31]:
# Fit the classifier on the training features against the flattened 1-D label array.
clf.fit(X=train_features_set, y=response_vector.values.ravel())

RidgeClassifier(alpha=1.0, class_weight='balanced', copy_X=True,
        fit_intercept=True, max_iter=None, normalize='True',
        random_state=None, solver='auto', tol=0.001)

In [32]:
# Mean accuracy of the fitted classifier on the data it was trained on.
classfier_score = clf.score(
    X=train_features_set,
    y=response_vector.values.ravel(),
)
classfier_score

0.75830532483904123

In [33]:
#test_features_set.sort_values('customer_ID')

In [34]:
# Predict the purchase flag for every hold-out row.
predict_purchase = clf.predict(test_features_set)
# Number of rows predicted as purchases (label 1). The predictions above are reused
# rather than running a second, identical predict pass over the test set.
ctr = predict_purchase.sum()
ctr

42317

In [35]:
# Policy option columns A..G sit at positions 17-23 of test_df1.
# `.ix` has been removed from pandas; `.iloc` is the positional equivalent.
policy_options = test_df1.iloc[:, 17:24]

# Customer id paired with the predicted record_type, built in one step instead of
# filling an empty frame column by column.
customer_information = pd.DataFrame(
    {
        'customer_ID': test_df1.customer_ID,
        'record_type': predict_purchase.astype(int),
    },
    columns=['customer_ID', 'record_type'],
)

# Write predictions together with the policy options, without the index column.
predicted_output = pd.concat([customer_information, policy_options], axis=1)
predicted_output.to_csv('logisticregression.csv', index=False)


In [36]:
# Customer id paired with the true record_type, built in one step instead of
# filling an empty frame column by column.
test_customer_information = pd.DataFrame(
    {
        'customer_ID': test_df1.customer_ID,
        'record_type': test_df1.record_type,
    },
    columns=['customer_ID', 'record_type'],
)

# Append the policy option columns A..G (positions 17-23) and write the result
# without the index column. `.ix` has been removed from pandas; `.iloc` is the
# positional equivalent.
expected_output = pd.concat([test_customer_information, test_df1.iloc[:, 17:24]], axis=1)
expected_output.to_csv('expectedOutput1.csv', index=False)

In [37]:
# Read the saved predictions back from disk and preview the first five rows.
lr = pd.read_csv('logisticregression.csv')
lr.head(5)

Unnamed: 0,customer_ID,record_type,A,B,C,D,E,F,G
0,10019166,0,0,1,1,1,0,0,3
1,10139813,1,1,1,3,3,0,2,4
2,10016685,0,1,0,2,3,1,1,3
3,10143222,0,0,0,3,2,0,0,2
4,10052574,0,1,1,3,3,1,2,3


In [38]:
# Read the saved ground truth back from disk and report the shape of the rows
# whose true record_type is 1.
exp = pd.read_csv('expectedOutput1.csv')
exp[exp['record_type'] == 1].shape

(19548, 9)

In [39]:
# Shape of the prediction rows selected by a mask built from the ground-truth frame
# (rows whose true record_type is 0).
# NOTE(review): the mask comes from `exp` but is applied to `lr`. Confirm whether the
# intent was to count true negatives-by-label (exp) or predicted zeros (lr).
lr.loc[exp['record_type']==0].shape

(113522, 9)

In [40]:
# True labels of the hold-out split, used as the reference for the metrics.
expected = test_df1['record_type']


In [41]:
# Pearson correlation coefficient and p-value between risk_factor and record_type.
print(scipy.stats.pearsonr(dta['risk_factor'], dta['record_type']))

(-9.5470741647561065e-05, 0.937928031340581)


In [42]:
# Pearson correlation coefficient and p-value between married_couple and record_type.
print(scipy.stats.pearsonr(dta['married_couple'], dta['record_type']))

(0.0019397044832233301, 0.11360513737685342)


In [43]:
# Pearson correlation coefficient and p-value between homeowner and record_type.
print(scipy.stats.pearsonr(dta['homeowner'], dta['record_type']))

(0.0076978090904136964, 3.405350765789396e-10)


In [44]:
# Pearson correlation coefficient and p-value between homeowner and risk_factor.
print(scipy.stats.pearsonr(dta['homeowner'], dta['risk_factor']))

(-0.17669409509101933, 0.0)


In [45]:
# Recall of the saved predictions against the true hold-out labels.
print(recall_score(y_true=expected, y_pred=lr['record_type']))

0.762942500512


In [46]:

# Per-class precision, recall and F1 of the saved predictions against the true labels.
print(classification_report(y_true=expected, y_pred=lr['record_type']))


             precision    recall  f1-score   support

          0       0.95      0.76      0.84    113522
          1       0.35      0.76      0.48     19548

avg / total       0.86      0.76      0.79    133070

