In [98]:
import pandas as pd
import numpy as np
import pickle
import seaborn as sb

In [99]:
# Read test.csv (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv('test.csv')

In [100]:
# Display the loaded frame; pandas abbreviates the rendering to the first and last rows.
df

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,381110,Male,25,1,11.0,1,< 1 Year,No,35786.0,152.0,53
1,381111,Male,40,1,28.0,0,1-2 Year,Yes,33762.0,7.0,111
2,381112,Male,47,1,28.0,0,1-2 Year,Yes,40050.0,124.0,199
3,381113,Male,24,1,27.0,1,< 1 Year,Yes,37356.0,152.0,187
4,381114,Male,27,1,28.0,1,< 1 Year,No,59097.0,152.0,297
...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,Female,26,1,37.0,1,< 1 Year,No,30867.0,152.0,56
127033,508143,Female,38,1,28.0,0,1-2 Year,Yes,28700.0,122.0,165
127034,508144,Male,21,1,46.0,1,< 1 Year,No,29802.0,152.0,74
127035,508145,Male,71,1,28.0,1,1-2 Year,No,62875.0,26.0,265


In [101]:
# Discard the identifier and the Driving_License column.
# NOTE(review): once `id` is removed, rows can only be tied back to test.csv by
# position, and the merges further down appear to change row order — confirm
# nothing downstream needs the id.
df = df.drop(columns=['id', 'Driving_License'])

In [102]:
# Cast both code columns to object dtype (generic Python objects, not the pandas
# 'category' dtype) ahead of the label merges below.
df = df.astype({'Region_Code': 'object', 'Policy_Sales_Channel': 'object'})

In [103]:
# Load the region / sales-channel label tables that were created in the region
# and channel binning notebook; they are merged onto df in the following cells.
# NOTE(review): unpickling runs whatever code the file encodes — only load
# artefacts produced by this project.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


channel_labels = _read_pickle('sales_channels_labels')
region_labels = _read_pickle('region_labels')

In [105]:
# Attach the sales-channel bin label to each row.
# With no `on=`, pandas joins on the columns both frames share (presumably just
# Policy_Sales_Channel — TODO confirm against the binning notebook).
# This is an inner join, so rows whose key has no entry in `channel_labels` are
# discarded and row order is not guaranteed to match the input. Report any
# change in row count rather than letting it pass silently.
# NOTE(review): for a scoring set, consider how='left' plus an explicit fallback
# bin for unseen codes so that every input row receives a prediction.
_rows_before_channel_merge = len(df)
df = pd.merge(df, channel_labels, how='inner')
if len(df) != _rows_before_channel_merge:
    print(f'Channel-label merge changed row count: {_rows_before_channel_merge} -> {len(df)}')

In [107]:
# Attach the region bin label to each row.
# With no `on=`, pandas joins on the columns both frames share (presumably just
# Region_Code — TODO confirm against the binning notebook).
# This is an inner join, so rows whose key has no entry in `region_labels` are
# discarded and row order is not guaranteed to match the input. Report any
# change in row count rather than letting it pass silently.
# NOTE(review): for a scoring set, consider how='left' plus an explicit fallback
# bin for unseen codes so that every input row receives a prediction.
_rows_before_region_merge = len(df)
df = pd.merge(df, region_labels, how='inner')
if len(df) != _rows_before_region_merge:
    print(f'Region-label merge changed row count: {_rows_before_region_merge} -> {len(df)}')

In [108]:
# Inspect column dtypes after both merges; the two label columns should now be present.
df.dtypes

Gender                         object
Age                             int64
Region_Code                    object
Previously_Insured              int64
Vehicle_Age                    object
Vehicle_Damage                 object
Annual_Premium                float64
Policy_Sales_Channel           object
Vintage                         int64
Policy_Sales_Channel_Label      int64
Region_Code_Labels              int32
dtype: object

In [110]:
# The raw codes have been replaced by their bin labels, so remove the originals.
df = df.drop(columns=['Region_Code', 'Policy_Sales_Channel'])

In [97]:
# Peek at the first row.
# NOTE(review): the saved output of this cell still lists Region_Code,
# Policy_Sales_Channel and an `index` column, so it appears to come from an
# earlier run — re-execute to refresh it.
df.head(1)

Unnamed: 0,Gender,Age,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,index,Policy_Sales_Channel_Label,Region_Code_Labels
0,Male,25,11,1,< 1 Year,No,35786.0,152,53,0,1,3


In [111]:
# Confirm the raw Region_Code / Policy_Sales_Channel columns are no longer in the frame.
df.dtypes

Gender                         object
Age                             int64
Previously_Insured              int64
Vehicle_Age                    object
Vehicle_Damage                 object
Annual_Premium                float64
Vintage                         int64
Policy_Sales_Channel_Label      int64
Region_Code_Labels              int32
dtype: object

In [112]:
# Store the bin labels as object dtype so that get_dummies one-hot encodes them
# instead of passing them through as numeric columns.
df = df.astype({'Policy_Sales_Channel_Label': 'object', 'Region_Code_Labels': 'object'})

In [113]:
# Confirm both label columns are now object dtype.
df.dtypes

Gender                         object
Age                             int64
Previously_Insured              int64
Vehicle_Age                    object
Vehicle_Damage                 object
Annual_Premium                float64
Vintage                         int64
Policy_Sales_Channel_Label     object
Region_Code_Labels             object
dtype: object

In [114]:
# One-hot encode every object-dtype column; drop_first=True omits the first
# level of each variable so the remaining indicators are not redundant.
# NOTE(review): the set of output columns depends on which levels occur in this
# file — confirm it matches the columns the model was trained on.
df = pd.get_dummies(df, drop_first=True)

In [115]:
# Inspect the encoded frame.
df

Unnamed: 0,Age,Previously_Insured,Annual_Premium,Vintage,Gender_Male,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_Yes,Policy_Sales_Channel_Label_1,Policy_Sales_Channel_Label_2,Policy_Sales_Channel_Label_3,Policy_Sales_Channel_Label_4,Policy_Sales_Channel_Label_5,Policy_Sales_Channel_Label_6,Policy_Sales_Channel_Label_7,Region_Code_Labels_2,Region_Code_Labels_3,Region_Code_Labels_4,Region_Code_Labels_5
0,25,1,35786.0,53,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0
1,26,1,26686.0,230,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0
2,25,1,20958.0,230,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0
3,24,1,29579.0,125,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0
4,22,1,28067.0,42,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127030,39,0,35237.0,183,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0
127031,28,0,2630.0,270,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0
127032,62,1,22114.0,24,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0
127033,56,1,37335.0,97,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0


In [116]:
# Checkpoint: persist the encoded frame before any interaction columns are added.
with open('test_df_binned_no_interaction', mode='wb') as sink:
    pickle.dump(df, sink)

In [117]:
# Interaction effects, step 1: pull out the two groups of dummy columns that
# will be crossed with each other (sales-channel bins and region bins).
channel_dummy_cols = [
    'Policy_Sales_Channel_Label_1',
    'Policy_Sales_Channel_Label_2',
    'Policy_Sales_Channel_Label_3',
    'Policy_Sales_Channel_Label_4',
    'Policy_Sales_Channel_Label_5',
    'Policy_Sales_Channel_Label_6',
    'Policy_Sales_Channel_Label_7',
]
region_dummy_cols = [
    'Region_Code_Labels_2',
    'Region_Code_Labels_3',
    'Region_Code_Labels_4',
    'Region_Code_Labels_5',
]

x = df.loc[:, channel_dummy_cols]
y = df.loc[:, region_dummy_cols]

In [119]:
# Interaction effects, step 2: one column per (channel dummy, region dummy)
# pair, holding the element-wise product of the two indicator columns.
# Column names follow the pattern '<channel column>___<region column>'.
test = {
    f'{channel_col}___{region_col}': (x[channel_col] * y[region_col]).values
    for channel_col in x.columns
    for region_col in y.columns
}

# Carry over the source index explicitly. The products were reduced to bare
# arrays above, and without `index=` the new frame would get a default
# RangeIndex; a later label-aligned assignment into df would then produce NaN
# for any row whose index label is not in that range.
additional_features = pd.DataFrame(test, index=x.index)

In [120]:
# Copy each interaction column onto df under the same name (values are aligned on the index).
for feature_name in additional_features.columns:
    df[feature_name] = additional_features[feature_name]

In [122]:
# Sanity-check the dimensions after adding the interaction columns.
df.shape

(127035, 47)

In [123]:
# Checkpoint: persist the frame again, this time including the interaction columns.
with open('test_df_binned_with_interaction', mode='wb') as sink:
    pickle.dump(df, sink)

In [125]:
# Count missing values per column.
# NOTE(review): presumably a check that the interaction columns aligned with df's index — non-zero counts would indicate a mismatch.
df.isna().sum()

Age                                                    0
Previously_Insured                                     0
Annual_Premium                                         0
Vintage                                                0
Gender_Male                                            0
Vehicle_Age_< 1 Year                                   0
Vehicle_Age_> 2 Years                                  0
Vehicle_Damage_Yes                                     0
Policy_Sales_Channel_Label_1                           0
Policy_Sales_Channel_Label_2                           0
Policy_Sales_Channel_Label_3                           0
Policy_Sales_Channel_Label_4                           0
Policy_Sales_Channel_Label_5                           0
Policy_Sales_Channel_Label_6                           0
Policy_Sales_Channel_Label_7                           0
Region_Code_Labels_2                                   0
Region_Code_Labels_3                                   0
Region_Code_Labels_4           