In [23]:
import pandas as pd
from category_encoders import WOEEncoder
import pickle

In [24]:
# Load the raw test set; the path is relative to the notebook's working directory.
df = pd.read_csv('test.csv')

In [25]:
# Preview the raw frame before any preprocessing.
df

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,381110,Male,25,1,11.0,1,< 1 Year,No,35786.0,152.0,53
1,381111,Male,40,1,28.0,0,1-2 Year,Yes,33762.0,7.0,111
2,381112,Male,47,1,28.0,0,1-2 Year,Yes,40050.0,124.0,199
3,381113,Male,24,1,27.0,1,< 1 Year,Yes,37356.0,152.0,187
4,381114,Male,27,1,28.0,1,< 1 Year,No,59097.0,152.0,297
...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,Female,26,1,37.0,1,< 1 Year,No,30867.0,152.0,56
127033,508143,Female,38,1,28.0,0,1-2 Year,Yes,28700.0,122.0,165
127034,508144,Male,21,1,46.0,1,< 1 Year,No,29802.0,152.0,74
127035,508145,Male,71,1,28.0,1,1-2 Year,No,62875.0,26.0,265


In [26]:
# Move the `id` column into the row index so it stays attached to each row
# without appearing among the feature columns. Reassign instead of using
# inplace=True so the frame produced by the load cell is not mutated in place.
df = df.set_index('id')

In [27]:
# Earlier analysis (not shown in this notebook) concluded that `id` and
# `Driving_License` are not needed as features. `id` is already out of the
# feature columns (it was moved to the index), so only Driving_License is dropped.
df = df.drop(columns=['Driving_License'])

In [28]:
# Region_Code and Policy_Sales_Channel are category codes that were read in as
# numbers; cast them to object dtype so they are handled as categorical values.
# NOTE(review): presumably required so the WOE encoder picks these columns up - confirm.
df = df.astype({'Region_Code': 'object', 'Policy_Sales_Channel': 'object'})

In [29]:
# Load the previously pickled encoder (file name suggests it is already fitted).
# NOTE: pickle.load can execute arbitrary code - only unpickle files you trust.
with open('woe_encoder_fitted', 'rb') as encoder_file:
    woe = pickle.load(encoder_file)

In [30]:
# Apply the loaded encoder with transform() only - nothing is re-fitted on the
# test data. In the output further down, Region_Code and Policy_Sales_Channel
# appear as float scores after this step.
df = woe.transform(df)

In [31]:
# One-hot encode the remaining categorical (object) columns, dropping the first
# level of each to avoid redundant indicators.
# dtype is pinned to uint8: that was the get_dummies default before pandas 2.0
# (newer versions default to bool), and later code in this notebook selects the
# indicator columns by their uint8 dtype.
# NOTE(review): the resulting columns depend on which categories occur in this
# file - verify they match the columns the model was trained on.
df = pd.get_dummies(df, drop_first=True, dtype='uint8')

In [32]:
# Inspect the frame after encoding and one-hot expansion.
df

Unnamed: 0_level_0,Age,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Gender_Male,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_Yes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
381110,25,-0.093676,1,35786.0,-1.555892,53,1,1,0,0
381111,40,0.499831,0,33762.0,-0.078471,111,1,0,0,1
381112,47,0.499831,0,40050.0,0.512838,199,1,0,0,1
381113,24,-0.553572,1,37356.0,-1.555892,187,1,1,0,1
381114,27,0.499831,1,59097.0,-1.555892,297,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...
508142,26,-0.482034,1,30867.0,-1.555892,56,0,1,0,0
508143,38,0.499831,0,28700.0,0.405771,165,0,0,0,1
508144,21,-0.196729,1,29802.0,-1.555892,74,1,1,0,0
508145,71,0.499831,1,62875.0,0.578246,265,1,0,0,0


In [33]:
# Column names containing '<' or '>' do not work with XGBoost, so replace the
# two Vehicle_Age dummy names with plain-text equivalents.
xgb_safe_names = {
    'Vehicle_Age_< 1 Year': 'Vehicle_Age_Under_1_Year',
    'Vehicle_Age_> 2 Years': 'Vehicle_Age_Over_2_Years',
}
df = df.rename(columns=xgb_safe_names)

In [34]:
# Collect the one-hot indicator columns by dtype. get_dummies emits uint8 on
# pandas < 2.0 and bool on pandas >= 2.0, so accept both; filtering on uint8
# alone silently yields an empty frame on newer pandas.
# NOTE(review): categories_only is not used by any later cell visible here.
categories_only = df.select_dtypes(include=['uint8', 'bool'])

In [35]:
# Pairwise interaction features: each new column is the element-wise product
# of two existing columns. Entries are (new column, left factor, right factor)
# and are processed in order, so the new columns are appended in this order.
interaction_specs = (
    ('Previously_Insured_Gender', 'Previously_Insured', 'Gender_Male'),
    ('Previously_Insured_Vehicle_Damage', 'Previously_Insured', 'Vehicle_Damage_Yes'),
    ('Vehicle_Age_Under_1_Vehicle_Damage', 'Vehicle_Age_Under_1_Year', 'Vehicle_Damage_Yes'),
    ('Vehicle_Age_Over_2_Vehicle_Damage', 'Vehicle_Age_Over_2_Years', 'Vehicle_Damage_Yes'),
    ('Gender_Vehicle_Damage', 'Gender_Male', 'Vehicle_Damage_Yes'),
)
for new_col, left_col, right_col in interaction_specs:
    df[new_col] = df[left_col] * df[right_col]

In [36]:
# Inspect the final frame, now including the five interaction columns.
df

Unnamed: 0_level_0,Age,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Gender_Male,Vehicle_Age_Under_1_Year,Vehicle_Age_Over_2_Years,Vehicle_Damage_Yes,Previously_Insured_Gender,Previously_Insured_Vehicle_Damage,Vehicle_Age_Under_1_Vehicle_Damage,Vehicle_Age_Over_2_Vehicle_Damage,Gender_Vehicle_Damage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
381110,25,-0.093676,1,35786.0,-1.555892,53,1,1,0,0,1,0,0,0,0
381111,40,0.499831,0,33762.0,-0.078471,111,1,0,0,1,0,0,0,0,1
381112,47,0.499831,0,40050.0,0.512838,199,1,0,0,1,0,0,0,0,1
381113,24,-0.553572,1,37356.0,-1.555892,187,1,1,0,1,1,1,1,0,1
381114,27,0.499831,1,59097.0,-1.555892,297,1,1,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508142,26,-0.482034,1,30867.0,-1.555892,56,0,1,0,0,0,0,0,0,0
508143,38,0.499831,0,28700.0,0.405771,165,0,0,0,1,0,0,0,0,0
508144,21,-0.196729,1,29802.0,-1.555892,74,1,1,0,0,1,0,0,0,0
508145,71,0.499831,1,62875.0,0.578246,265,1,0,0,0,1,0,0,0,0


In [37]:
# Persist the engineered test frame to disk as a pickle.
with open('test_df_WOE_with_interaction', 'wb') as out_file:
    pickle.dump(df, out_file)