In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.cross_validation import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from matplotlib import pyplot as plt
import scipy

## Step 1. Data Collection

In [2]:
# Reading dataset into dta
dta = pd.read_csv('original_training_set/train_manipulated.csv')
dta.shape

(665349, 28)

## Step 2. Splitting data into training set and testing set

In [3]:
train, test = train_test_split(dta, test_size = 0.2)
train.to_csv('new_train_data.csv')
test.to_csv('new_test_data.csv')

In [4]:
dta.isnull().values.sum()

0

## Data of both training set and testing set

In [5]:
train_df1=pd.read_csv('new_train_data.csv')
train_df1.head()

Unnamed: 0.1,Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,389021,10052231,6,1,1,12:56,OH,11175,1,1,...,1,2,2,1,2,2,640,0,3,2
1,390739,10052878,5,0,0,10:37,NY,14397,1,0,...,0,3,1,0,0,1,668,0,4,1
2,118161,10109185,5,0,2,10:59,OH,13634,1,1,...,0,3,3,0,1,3,615,0,3,1
3,547961,10110103,1,0,3,13:23,NM,12219,1,0,...,0,1,1,0,0,4,560,0,3,2
4,410793,10060011,6,0,0,11:39,NY,11797,3,0,...,1,2,3,0,0,3,688,0,4,1


In [6]:
test_df1=pd.read_csv('new_test_data.csv')
test_df1.head()

Unnamed: 0.1,Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,213915,10007330,1,0,1,14:07,TN,11820,1,0,...,1,1,1,1,3,4,666,0,3,2
1,657953,10150119,10,1,1,12:03,PA,13018,1,0,...,0,4,3,1,1,4,673,0,3,2
2,591560,10126079,6,0,0,12:24,OH,14567,1,1,...,1,2,3,1,1,2,595,0,3,2
3,228110,10137889,8,1,3,12:55,WA,10770,2,0,...,1,1,1,0,2,2,659,0,2,2
4,159698,10122864,2,0,2,10:10,PA,10278,1,0,...,1,3,3,1,2,3,656,0,3,1


## Pre-processing data

### Removing the Unnamed column from the new training set

In [7]:
train_df1.shape

(532279, 29)

In [8]:
train_df1.drop(train_df1.columns[0],axis=1, inplace=True)

In [9]:
train_df1.shape

(532279, 28)

In [10]:
train_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10052231,6,1,1,12:56,OH,11175,1,1,1,...,1,2,2,1,2,2,640,0,3,2
1,10052878,5,0,0,10:37,NY,14397,1,0,6,...,0,3,1,0,0,1,668,0,4,1
2,10109185,5,0,2,10:59,OH,13634,1,1,6,...,0,3,3,0,1,3,615,0,3,1
3,10110103,1,0,3,13:23,NM,12219,1,0,0,...,0,1,1,0,0,4,560,0,3,2
4,10060011,6,0,0,11:39,NY,11797,3,0,9,...,1,2,3,0,0,3,688,0,4,1


In [11]:
test_df1.shape

(133070, 29)

In [12]:
test_df1.drop(test_df1.columns[0],axis=1, inplace=True)

In [13]:
test_df1.head()

Unnamed: 0,customer_ID,shopping_pt,record_type,day,time,state,location,group_size,homeowner,car_age,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10007330,1,0,1,14:07,TN,11820,1,0,2,...,1,1,1,1,3,4,666,0,3,2
1,10150119,10,1,1,12:03,PA,13018,1,0,2,...,0,4,3,1,1,4,673,0,3,2
2,10126079,6,0,0,12:24,OH,14567,1,1,9,...,1,2,3,1,1,2,595,0,3,2
3,10137889,8,1,3,12:55,WA,10770,2,0,4,...,1,1,1,0,2,2,659,0,2,2
4,10122864,2,0,2,10:10,PA,10278,1,0,6,...,1,3,3,1,2,3,656,0,3,1


### Creating response vector and feature set

In [14]:
test_df1.isnull().values.sum()

0

In [15]:
train_df1.columns

Index(['customer_ID', 'shopping_pt', 'record_type', 'day', 'time', 'state',
       'location', 'group_size', 'homeowner', 'car_age', 'car_value',
       'risk_factor', 'age_oldest', 'age_youngest', 'married_couple',
       'C_previous', 'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'cost', 'weekend_indicator', 'accident_risk', 'time_factor'],
      dtype='object')

In [16]:
# creating intermediate dataframe which includes feature set of customer_ID and shopping_pt
train_features_df1=pd.DataFrame(np.array(train_df1[['customer_ID','shopping_pt']]) , columns=['customer_ID','shopping_pt'] )
train_features_df1.head()

Unnamed: 0,customer_ID,shopping_pt
0,10052231,6
1,10052878,5
2,10109185,5
3,10110103,1
4,10060011,6


In [17]:
# creating intermediate dataframe which includes feature set of other required columns

train_features_df2=train_df1.ix[:,7:]
# casting object type to categorical type to convert it later into numeric type
train_features_df2['car_value']=train_features_df2['car_value'].astype('category')

In [18]:
# converting categorical values to numeric values which will be used for prediction
train_features_df2['car_value']=train_features_df2['car_value'].cat.codes

In [19]:
train_features_df2['car_value'].head()

0    4
1    3
2    4
3    5
4    5
Name: car_value, dtype: int8

In [20]:
train_features_df2.head()

Unnamed: 0,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,C_previous,duration_previous,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,1,1,1,4,4.0,55,55,0,2,5,...,1,2,2,1,2,2,640,0,3,2
1,1,0,6,3,2.0,75,75,0,3,2,...,0,3,1,0,0,1,668,0,4,1
2,1,1,6,4,3.099142,26,26,0,3,2,...,0,3,3,0,1,3,615,0,3,1
3,1,0,0,5,4.0,32,32,0,3,1,...,0,1,1,0,0,4,560,0,3,2
4,3,0,9,5,2.0,69,30,0,3,15,...,1,2,3,0,0,3,688,0,4,1


In [21]:
# creating response vector to train our model
response_vector=pd.DataFrame(np.array(train_df1['record_type']),columns=['record_type'])
#response_vector.loc[response_vector['record_type']==1]
response_vector.head(30)

Unnamed: 0,record_type
0,1
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,1


In [22]:
# creating final feature set dataframe
train_features_set=pd.concat([train_features_df1,train_features_df2],axis =1)
train_features_set.shape

(532279, 23)

In [23]:
train_features_set.head(10)

Unnamed: 0,customer_ID,shopping_pt,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10052231,6,1,1,1,4,4.0,55,55,0,...,1,2,2,1,2,2,640,0,3,2
1,10052878,5,1,0,6,3,2.0,75,75,0,...,0,3,1,0,0,1,668,0,4,1
2,10109185,5,1,1,6,4,3.099142,26,26,0,...,0,3,3,0,1,3,615,0,3,1
3,10110103,1,1,0,0,5,4.0,32,32,0,...,0,1,1,0,0,4,560,0,3,2
4,10060011,6,3,0,9,5,2.0,69,30,0,...,1,2,3,0,0,3,688,0,4,1
5,10136319,4,2,0,1,4,3.088438,49,46,0,...,1,2,2,1,2,2,673,0,3,3
6,10056494,4,2,1,9,7,3.153409,30,27,1,...,1,3,2,1,3,2,604,0,2,3
7,10109486,1,2,1,2,5,3.020041,43,43,1,...,0,1,1,0,3,1,633,0,2,2
8,10007129,6,1,1,6,4,4.0,52,52,0,...,0,2,2,0,2,1,632,0,3,1
9,10101540,7,1,1,10,6,3.0,44,44,0,...,0,1,2,0,2,3,576,0,3,2


In [24]:
#Creating intermediate test features
test_features_df1=pd.DataFrame(np.array(test_df1[['customer_ID','shopping_pt']]) , columns=['customer_ID','shopping_pt'] )
test_features_df1.head()

Unnamed: 0,customer_ID,shopping_pt
0,10007330,1
1,10150119,10
2,10126079,6
3,10137889,8
4,10122864,2


In [25]:
test_df1.columns

Index(['customer_ID', 'shopping_pt', 'record_type', 'day', 'time', 'state',
       'location', 'group_size', 'homeowner', 'car_age', 'car_value',
       'risk_factor', 'age_oldest', 'age_youngest', 'married_couple',
       'C_previous', 'duration_previous', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
       'cost', 'weekend_indicator', 'accident_risk', 'time_factor'],
      dtype='object')

In [26]:
# creating intermediate dataframe which includes feature set of other required columns
test_features_df2=test_df1.ix[:,7:]
# casting object type to categorical type to convert it later into numeric type
test_features_df2['car_value']=test_features_df2['car_value'].astype('category')

In [27]:
#converting car value from categorical to numeric
test_features_df2['car_value']=test_features_df2['car_value'].cat.codes

In [28]:
# doing the same procedure as above for the test set
test_features_set=pd.concat([test_features_df1,test_features_df2],axis=1)
test_features_set.shape

(133070, 23)

In [29]:
test_features_set.head(15)

Unnamed: 0,customer_ID,shopping_pt,group_size,homeowner,car_age,car_value,risk_factor,age_oldest,age_youngest,married_couple,...,B,C,D,E,F,G,cost,weekend_indicator,accident_risk,time_factor
0,10007330,1,1,0,2,5,3.10191,36,36,0,...,1,1,1,1,3,4,666,0,3,2
1,10150119,10,1,0,2,4,3.0,55,55,0,...,0,4,3,1,1,4,673,0,3,2
2,10126079,6,1,1,9,4,4.0,65,65,0,...,1,2,3,1,1,2,595,0,3,2
3,10137889,8,2,0,4,4,3.020041,40,33,1,...,1,1,1,0,2,2,659,0,2,2
4,10122864,2,1,0,6,5,3.07013,26,26,0,...,1,3,3,1,2,3,656,0,3,1
5,10016416,3,1,0,1,6,2.0,44,44,0,...,0,4,3,1,0,2,674,0,3,1
6,10130386,7,2,1,2,4,1.0,75,70,1,...,1,3,3,1,2,4,623,0,3,1
7,10039082,7,1,1,4,5,1.0,34,34,0,...,1,3,3,1,2,1,598,0,3,3
8,10044504,4,1,1,17,3,3.0,22,22,0,...,0,1,2,0,2,3,644,0,4,1
9,10042097,5,1,0,1,4,3.088438,22,22,0,...,0,2,2,0,3,4,670,0,4,2


In [32]:
b=a.fit(train_features_set,response_vector.values.ravel())

TypeError: Cannot clone object '        customer_ID  shopping_pt  record_type  day   time state  location  \
0          10052231            6            1    1  12:56    OH     11175   
1          10052878            5            0    0  10:37    NY     14397   
2          10109185            5            0    2  10:59    OH     13634   
3          10110103            1            0    3  13:23    NM     12219   
4          10060011            6            0    0  11:39    NY     11797   
5          10136319            4            0    4  17:14    OK     15252   
6          10056494            4            0    2  16:56    CO     13136   
7          10109486            1            0    2  15:13    WA     15692   
8          10007129            6            0    3  11:03    IN     11707   
9          10101540            7            1    0  12:50    NM     12663   
10         10019992            5            0    0  11:43    OH     10963   
11         10140243            3            0    0  13:10    MS     13274   
12         10151294            5            0    4  10:35    IN     12459   
13         10066655            7            1    2  14:51    NY     10016   
14         10148186            3            0    1  13:15    MD     11469   
15         10007766            3            0    2  08:49    MD     11324   
16         10068437            8            0    1  14:29    OH     13009   
17         10084137            3            0    2  14:32    IN     11070   
18         10071297            5            0    3  14:18    RI     13105   
19         10107997            4            0    4  16:05    NV     15006   
20         10045813            2            0    4  17:44    CO     12538   
21         10023270            1            0    3  10:38    NY     14050   
22         10096214            5            1    6  16:50    PA     10668   
23         10091235            2            0    2  14:02    OH     10041   
24         10122189            6            0    2  09:30    OH     10450   
25         10067333            1            0    1  13:48    OH     12518   
26         10111122            2            0    2  15:12    TN     11966   
27         10023088            5            0    2  14:43    AR     11817   
28         10055819            5            0    2  16:28    MO     10186   
29         10075712            2            0    0  09:03    NY     10751   
...             ...          ...          ...  ...    ...   ...       ...   
532249     10097450            3            0    2  13:28    CO     11703   
532250     10082238            2            0    4  14:34    PA     11799   
532251     10103554            3            0    3  10:49    KY     10146   
532252     10084836            6            1    4  12:27    NY     13786   
532253     10147884            1            0    4  16:37    CO     12071   
532254     10122238            4            0    3  08:33    AL     10719   
532255     10114898            3            0    2  10:59    OK     11376   
532256     10042019            3            0    1  15:47    KS     11134   
532257     10057907            6            0    1  11:59    FL     10799   
532258     10060382            8            0    2  08:36    OH     11298   
532259     10066265            1            0    4  11:10    OK     15844   
532260     10115869            8            1    1  14:48    UT     10808   
532261     10076650            1            0    2  16:35    NM     13245   
532262     10028614            4            0    1  11:38    FL     11683   
532263     10115599            3            0    2  09:36    AL     10867   
532264     10069224            6            0    2  13:02    PA     15491   
532265     10086811            1            0    2  13:34    NY     11329   
532266     10097502            2            0    4  12:30    MS     10075   
532267     10029887            5            0    1  11:55    PA     12568   
532268     10047932            1            0    2  15:29    CO     13224   
532269     10056835            4            0    0  14:42    IN     11030   
532270     10103123            1            0    4  14:29    NY     10676   
532271     10131230            5            0    3  09:54    MO     13281   
532272     10043448            3            0    2  11:07    MD     11723   
532273     10136737            3            0    0  08:58    FL     12606   
532274     10049021            1            0    0  10:10    NH     12044   
532275     10127603            5            0    3  13:09    FL     11586   
532276     10019375            1            0    0  09:42    PA     11346   
532277     10030164            1            0    2  12:06    UT     13419   
532278     10070069            6            0    1  10:14    NY     12127   

        group_size  homeowner  car_age     ...       B  C  D  E  F  G  cost  \
0                1          1        1     ...       1  2  2  1  2  2   640   
1                1          0        6     ...       0  3  1  0  0  1   668   
2                1          1        6     ...       0  3  3  0  1  3   615   
3                1          0        0     ...       0  1  1  0  0  4   560   
4                3          0        9     ...       1  2  3  0  0  3   688   
5                2          0        1     ...       1  2  2  1  2  2   673   
6                2          1        9     ...       1  3  2  1  3  2   604   
7                2          1        2     ...       0  1  1  0  3  1   633   
8                1          1        6     ...       0  2  2  0  2  1   632   
9                1          1       10     ...       0  1  2  0  2  3   576   
10               1          0        6     ...       0  3  3  0  2  3   594   
11               1          0        1     ...       1  1  1  1  2  2   657   
12               1          0       14     ...       0  1  2  0  0  2   556   
13               1          0        1     ...       1  3  3  1  0  3   737   
14               2          1        7     ...       1  2  3  0  2  3   607   
15               1          1       14     ...       1  2  2  1  1  2   722   
16               2          1        3     ...       1  3  3  1  1  3   625   
17               1          1        1     ...       0  3  3  0  1  1   567   
18               1          0       19     ...       0  1  1  0  0  2   626   
19               1          0       13     ...       0  4  3  1  1  3   661   
20               1          1       12     ...       0  2  2  0  1  2   570   
21               1          0        9     ...       0  2  3  0  0  2   703   
22               1          0        6     ...       1  3  3  1  1  3   611   
23               1          0        3     ...       1  3  3  1  2  3   660   
24               2          1        1     ...       1  3  3  1  1  3   630   
25               1          1       16     ...       0  3  3  0  3  2   589   
26               1          0       21     ...       1  3  3  0  0  2   608   
27               2          1        3     ...       0  3  2  0  2  1   641   
28               1          1        7     ...       1  3  3  0  1  2   655   
29               1          1        2     ...       0  4  3  0  0  2   580   
...            ...        ...      ...     ...      .. .. .. .. .. ..   ...   
532249           1          0       13     ...       1  1  3  0  0  1   573   
532250           1          0       12     ...       0  1  3  0  1  2   698   
532251           1          1       15     ...       1  1  1  1  2  1   576   
532252           1          1        3     ...       0  1  1  0  0  2   729   
532253           1          0       14     ...       1  3  3  0  2  4   635   
532254           1          1        6     ...       1  1  1  0  2  2   677   
532255           2          1        3     ...       0  2  2  0  1  2   679   
532256           1          0        1     ...       0  3  3  0  2  2   676   
532257           1          1        1     ...       0  3  3  1  2  3   635   
532258           1          1       10     ...       0  3  3  1  1  3   621   
532259           1          1       13     ...       0  1  1  0  0  4   604   
532260           1          0       12     ...       0  1  3  0  2  1   625   
532261           1          0        7     ...       1  3  3  1  1  1   659   
532262           2          1        1     ...       1  4  3  1  2  3   625   
532263           1          0       15     ...       0  2  3  0  0  1   580   
532264           1          0       14     ...       0  1  1  0  0  2   632   
532265           1          0       10     ...       0  3  3  0  0  2   652   
532266           1          1        6     ...       0  3  3  0  1  2   655   
532267           1          1       11     ...       0  3  3  0  1  3   596   
532268           1          0       15     ...       1  3  3  0  2  1   627   
532269           2          1       16     ...       0  2  2  0  0  2   581   
532270           1          0        4     ...       0  3  3  0  0  2   661   
532271           2          0        7     ...       1  2  2  1  2  2   647   
532272           2          1       16     ...       0  2  3  0  0  2   600   
532273           1          0       15     ...       1  1  3  1  2  3   690   
532274           1          0       11     ...       1  3  3  1  1  1   666   
532275           1          1        5     ...       1  4  3  1  3  3   609   
532276           1          1       17     ...       0  1  1  1  2  2   634   
532277           1          0        2     ...       0  2  2  0  0  3   533   
532278           3          1        8     ...       0  2  3  0  0  2   706   

        weekend_indicator  accident_risk  time_factor  
0                       0              3            2  
1                       0              4            1  
2                       0              3            1  
3                       0              3            2  
4                       0              4            1  
5                       0              3            3  
6                       0              2            3  
7                       0              2            2  
8                       0              3            1  
9                       0              3            2  
10                      0              3            1  
11                      0              3            2  
12                      0              3            1  
13                      0              3            2  
14                      0              2            2  
15                      0              3            1  
16                      0              2            2  
17                      0              3            2  
18                      0              3            2  
19                      0              3            3  
20                      0              3            3  
21                      0              2            1  
22                      1              3            3  
23                      0              3            2  
24                      0              2            1  
25                      0              4            2  
26                      0              3            2  
27                      0              2            2  
28                      0              3            3  
29                      0              3            1  
...                   ...            ...          ...  
532249                  0              3            2  
532250                  0              3            2  
532251                  0              3            1  
532252                  0              3            2  
532253                  0              3            3  
532254                  0              3            1  
532255                  0              3            1  
532256                  0              3            2  
532257                  0              4            1  
532258                  0              4            1  
532259                  0              3            1  
532260                  0              4            2  
532261                  0              3            3  
532262                  0              3            1  
532263                  0              4            1  
532264                  0              3            2  
532265                  0              3            2  
532266                  0              3            2  
532267                  0              3            1  
532268                  0              3            2  
532269                  0              2            2  
532270                  0              4            2  
532271                  0              3            1  
532272                  0              3            1  
532273                  0              3            1  
532274                  0              3            1  
532275                  0              4            2  
532276                  0              3            1  
532277                  0              4            2  
532278                  0              3            1  

[532279 rows x 28 columns]' (type <class 'pandas.core.frame.DataFrame'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' methods.

## Modelling the data to predict the response using Logistic Regression

### Basic Steps to create a model is,
1. Data collection.
2. Data preprocessing:
    1) Data Cleansing.
    2) Data transformation (if required).
    3) Divide data into training and testing sets.
3. Build a model on training data.
4. Evaluate the model on the test data

### Step 1 and  Step 2 are done as shown above

In [33]:
Logistic_Regression_classfifer=LogisticRegression(class_weight='balanced')

In [35]:
x=Logistic_Regression_classfifer.fit(train_features_set,response_vector.values.ravel())

In [38]:
a=SelectFromModel(x, prefit=True)

In [40]:
b=a.transform(train_features_set)
b.shape

(532279, 5)

### Step 3
### b) Fit the model

In [None]:
classfier_score=Logistic_Regression_classfifer.score(train_features_set,response_vector.values.ravel())
classfier_score

In [None]:
#test_features_set.sort_values('customer_ID')

In [None]:
predict_purchase=Logistic_Regression_classfifer.predict(test_features_set)
ctr=Logistic_Regression_classfifer.predict(test_features_set).sum()
ctr

In [None]:
customer_information = pd.DataFrame(columns=['customer_ID', 'record_type'])
policy_options=test_df1.ix[:,17:24]
customer_information['customer_ID'] = test_df1.customer_ID
customer_information['record_type'] = predict_purchase.astype(int)
predicted_output=pd.concat([customer_information,policy_options],axis=1)
predicted_output.to_csv('logisticregression.csv',index=False)


In [None]:
test_customer_information=pd.DataFrame(columns=['customer_ID', 'record_type'])
test_customer_information['customer_ID'] = test_df1.customer_ID
test_customer_information['record_type']=test_df1.record_type
expected_output=pd.concat([test_customer_information,test_df1.ix[:,17:24]], axis=1)
expected_output.to_csv('expectedOutput1.csv', index=False)

In [None]:
lr=pd.read_csv('logisticregression.csv')
lr.head()

In [None]:
exp=pd.read_csv('expectedOutput1.csv')
exp.loc[exp['record_type']==1].shape

In [None]:
lr.loc[exp['record_type']==0].shape

In [None]:
expected=test_df1.record_type


In [None]:
print(scipy.stats.pearsonr(dta.risk_factor,dta.record_type))

In [None]:
print(scipy.stats.pearsonr(dta.married_couple,dta.record_type))

In [None]:
print(scipy.stats.pearsonr(dta.homeowner,dta.record_type))

In [None]:
print(scipy.stats.pearsonr(dta.homeowner,dta.risk_factor))

In [None]:
print(recall_score(expected,lr.record_type))

In [None]:

print (classification_report(expected, lr.record_type))


In [None]:
print(accuracy_score(expected,lr.record_type))

In [None]:
print (roc_curve(expected, lr.record_type,pos_label=1))