### Use WOE Encoding to convert region and sales channels into numeric features ###
**Notebook 3**

This notebook will:
1. Convert sales channel and region codes to numerical features using WOE Encoder, and save the resulting dataset to a pickle file
2. Add interaction terms between certain other categorical features, integrate them with the WOE-encoded data, and save the result to a pickle file.

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
# Read the pickled feature frame and the target series from the working directory.
# NOTE(review): pickle.load can execute arbitrary code; these files are presumably
# produced by an earlier notebook in this series — confirm they are trusted.
with open('preprocessed_data_frame', 'rb') as frame_file:
    df = pickle.load(frame_file)

with open('target', 'rb') as target_file:
    target = pickle.load(target_file)

In [3]:
df.head(2)

Unnamed: 0,Gender,Age,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,Male,44,28,0,> 2 Years,Yes,40454.0,26,217
1,Male,76,3,0,1-2 Year,No,33536.0,26,183


In [4]:
df[['Region_Code', 'Policy_Sales_Channel']].nunique()

Region_Code              53
Policy_Sales_Channel    155
dtype: int64

In [5]:
target.head(2)

0    1
1    0
Name: Response, dtype: int64

In [6]:
# Trying WOE encoding
from category_encoders import WOEEncoder

In [7]:
# Replace the two high-cardinality code columns with their weight-of-evidence values.
# NOTE(review): the encoder is fitted on the whole frame together with the target; if a
# train/test split is made later, this may leak target information — confirm.
woe_columns = ['Region_Code', 'Policy_Sales_Channel']
woe = WOEEncoder(cols=woe_columns)
df = woe.fit_transform(df, target)

  elif pd.api.types.is_categorical(cols):


In [8]:
df

Unnamed: 0,Gender,Age,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,Male,44,0.499831,0,> 2 Years,Yes,40454.0,0.578246,217
1,Male,76,0.047270,0,1-2 Year,No,33536.0,0.578246,183
2,Male,47,0.499831,0,> 2 Years,Yes,38294.0,0.578246,27
3,Male,21,-0.093676,1,< 1 Year,No,28619.0,-1.555892,203
4,Female,29,-0.006989,1,< 1 Year,No,27496.0,-1.555892,39
...,...,...,...,...,...,...,...,...,...
381104,Male,74,-0.578859,1,1-2 Year,No,30170.0,0.578246,88
381105,Male,30,-0.482034,1,< 1 Year,No,40016.0,-1.555892,131
381106,Male,21,-0.560005,1,< 1 Year,No,35118.0,-1.832939,161
381107,Female,68,-0.340608,0,> 2 Years,Yes,44617.0,0.512838,74


In [9]:
# Convert the other categorical items to dummies.
# drop_first=True drops one level per feature so the dummies are not redundant.
# dtype='uint8' is pinned explicitly: pandas >= 2.0 emits bool dummies by default,
# and the interaction step further down selects columns with `df.dtypes == 'uint8'`,
# which would silently miss bool columns.
df = pd.get_dummies(df, drop_first=True, dtype='uint8')

In [10]:
df.columns

Index(['Age', 'Region_Code', 'Previously_Insured', 'Annual_Premium',
       'Policy_Sales_Channel', 'Vintage', 'Gender_Male',
       'Vehicle_Age_< 1 Year', 'Vehicle_Age_> 2 Years', 'Vehicle_Damage_Yes'],
      dtype='object')

In [11]:
# Column names containing '<' or '>' will not work with XGBoost, so map the two
# Vehicle_Age dummy columns to plain-text names.
xgb_safe_names = {
    'Vehicle_Age_< 1 Year': 'Vehicle_Age_Under_1_Year',
    'Vehicle_Age_> 2 Years': 'Vehicle_Age_Over_2_Years',
}
df = df.rename(columns=xgb_safe_names)

In [12]:
df

Unnamed: 0,Age,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Gender_Male,Vehicle_Age_Under_1_Year,Vehicle_Age_Over_2_Years,Vehicle_Damage_Yes
0,44,0.499831,0,40454.0,0.578246,217,1,0,1,1
1,76,0.047270,0,33536.0,0.578246,183,1,0,0,0
2,47,0.499831,0,38294.0,0.578246,27,1,0,1,1
3,21,-0.093676,1,28619.0,-1.555892,203,1,1,0,0
4,29,-0.006989,1,27496.0,-1.555892,39,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
381104,74,-0.578859,1,30170.0,0.578246,88,1,0,0,0
381105,30,-0.482034,1,40016.0,-1.555892,131,1,1,0,0
381106,21,-0.560005,1,35118.0,-1.832939,161,1,1,0,0
381107,68,-0.340608,0,44617.0,0.512838,74,0,0,1,1


In [13]:
# Persist the WOE-encoded frame (with dummy columns) to disk as a pickle
with open('WOE_Encoded_Data', 'wb') as out_file:
    pickle.dump(df, out_file)

In [14]:
df

Unnamed: 0,Age,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Gender_Male,Vehicle_Age_Under_1_Year,Vehicle_Age_Over_2_Years,Vehicle_Damage_Yes
0,44,0.499831,0,40454.0,0.578246,217,1,0,1,1
1,76,0.047270,0,33536.0,0.578246,183,1,0,0,0
2,47,0.499831,0,38294.0,0.578246,27,1,0,1,1
3,21,-0.093676,1,28619.0,-1.555892,203,1,1,0,0
4,29,-0.006989,1,27496.0,-1.555892,39,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
381104,74,-0.578859,1,30170.0,0.578246,88,1,0,0,0
381105,30,-0.482034,1,40016.0,-1.555892,131,1,1,0,0
381106,21,-0.560005,1,35118.0,-1.832939,161,1,1,0,0
381107,68,-0.340608,0,44617.0,0.512838,74,0,0,1,1


In [15]:
# Cast Previously_Insured to uint8 so it is picked up together with the dummy
# columns when the frame is filtered on `dtypes == 'uint8'` in the next cell.
df['Previously_Insured'] = df['Previously_Insured'].astype('uint8')

In [16]:
# Gather the uint8 indicator columns that the interaction effects
# (previously insured, vehicle age, gender, ...) will be built from.
is_uint8 = df.dtypes == 'uint8'
categories_only = df.loc[:, is_uint8]

In [17]:
from sklearn.preprocessing import PolynomialFeatures

In [18]:
categories_only

Unnamed: 0,Previously_Insured,Gender_Male,Vehicle_Age_Under_1_Year,Vehicle_Age_Over_2_Years,Vehicle_Damage_Yes
0,0,1,0,1,1
1,0,1,0,0,0
2,0,1,0,1,1
3,1,1,1,0,0
4,1,0,1,0,0
...,...,...,...,...,...
381104,1,1,0,0,0
381105,1,1,1,0,0
381106,1,1,1,0,0
381107,0,0,0,1,1


Features to make: 
1. Previously Insured * Gender
2. Previously Insured * Vehicle Damage
3. Vehicle Age * Vehicle Damage
4. Gender * Vehicle Damage

In [19]:
df['Gender_Male']*df['Vehicle_Damage_Yes']

0         1
1         0
2         1
3         0
4         0
         ..
381104    0
381105    0
381106    0
381107    0
381108    0
Length: 381109, dtype: uint8

In [20]:
# Build each interaction feature as the element-wise product of two indicator columns.
# Mapping: new column name -> (left factor, right factor). Dict order sets column order.
interaction_pairs = {
    'Previously_Insured_Gender': ('Previously_Insured', 'Gender_Male'),
    'Previously_Insured_Vehicle_Damage': ('Previously_Insured', 'Vehicle_Damage_Yes'),
    'Vehicle_Age_Under_1_Vehicle_Damage': ('Vehicle_Age_Under_1_Year', 'Vehicle_Damage_Yes'),
    'Vehicle_Age_Over_2_Vehicle_Damage': ('Vehicle_Age_Over_2_Years', 'Vehicle_Damage_Yes'),
    'Gender_Vehicle_Damage': ('Gender_Male', 'Vehicle_Damage_Yes'),
}
for new_col, (left_col, right_col) in interaction_pairs.items():
    df[new_col] = df[left_col] * df[right_col]

In [21]:
df

Unnamed: 0,Age,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Gender_Male,Vehicle_Age_Under_1_Year,Vehicle_Age_Over_2_Years,Vehicle_Damage_Yes,Previously_Insured_Gender,Previously_Insured_Vehicle_Damage,Vehicle_Age_Under_1_Vehicle_Damage,Vehicle_Age_Over_2_Vehicle_Damage,Gender_Vehicle_Damage
0,44,0.499831,0,40454.0,0.578246,217,1,0,1,1,0,0,0,1,1
1,76,0.047270,0,33536.0,0.578246,183,1,0,0,0,0,0,0,0,0
2,47,0.499831,0,38294.0,0.578246,27,1,0,1,1,0,0,0,1,1
3,21,-0.093676,1,28619.0,-1.555892,203,1,1,0,0,1,0,0,0,0
4,29,-0.006989,1,27496.0,-1.555892,39,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381104,74,-0.578859,1,30170.0,0.578246,88,1,0,0,0,1,0,0,0,0
381105,30,-0.482034,1,40016.0,-1.555892,131,1,1,0,0,1,0,0,0,0
381106,21,-0.560005,1,35118.0,-1.832939,161,1,1,0,0,1,0,0,0,0
381107,68,-0.340608,0,44617.0,0.512838,74,0,0,1,1,0,0,0,1,0


In [22]:
# Persist the frame, now including the interaction columns, as a pickle
with open('WOE_Encoded_with_Interaction', 'wb') as out_file:
    pickle.dump(df, out_file)