# **Model Training**

## **Data Preprocessing**

In [254]:
# importing all necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, FunctionTransformer, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline

In [255]:
# reading data
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [256]:
df.shape

(381109, 12)

In [257]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 381109 entries, 0 to 381108
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    381109 non-null  int64  
 1   Gender                381109 non-null  object 
 2   Age                   381109 non-null  int64  
 3   Driving_License       381109 non-null  int64  
 4   Region_Code           381109 non-null  float64
 5   Previously_Insured    381109 non-null  int64  
 6   Vehicle_Age           381109 non-null  object 
 7   Vehicle_Damage        381109 non-null  object 
 8   Annual_Premium        381109 non-null  float64
 9   Policy_Sales_Channel  381109 non-null  float64
 10  Vintage               381109 non-null  int64  
 11  Response              381109 non-null  int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 34.9+ MB


In [258]:
# removing unwanted columns
columns_to_remove = ['id', 'Driving_License']
df.drop(columns=columns_to_remove, inplace=True)

In [259]:
# unique values in region code
print("There are", df['Region_Code'].nunique(), "unique values of region code")

There are 53 unique values of region code


In [260]:
# reducing categories in region code column 
region_counts = df['Region_Code'].value_counts()
df['Region_Code'] = [i if region_counts[i] >= 6280 else 1.0 for i in df['Region_Code']]

In [261]:
df['Region_Code'].unique()

array([28.,  3., 11., 41., 33.,  6., 35., 50., 15.,  1.,  8., 36., 30.,
       47., 29., 46.])

In [262]:
# changing datatype to str
df['Region_Code'] = df['Region_Code'].astype(str)

In [263]:
# reducing categories in region code column
policy_sales_channel_count = df['Policy_Sales_Channel'].value_counts()
df['Policy_Sales_Channel'] = [i if policy_sales_channel_count[i] >= 1360 else 1.0 for i in df['Policy_Sales_Channel']]

In [264]:
df['Policy_Sales_Channel'].unique()

array([ 26., 152., 160., 124.,   1.,  13.,  30., 156., 163., 157., 122.,
       154., 151.,  25.,   7.,   8.])

In [265]:
# changing datatype to str
df['Policy_Sales_Channel'] =df['Policy_Sales_Channel'].astype(str)

In [266]:
df.head()

Unnamed: 0,Gender,Age,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,Male,44,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,Male,76,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,Male,47,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,Male,21,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,Female,29,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [267]:
# splitting data into X and y
X = df.drop(columns='Response')
y = df['Response']

In [268]:
# splitting data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [269]:
columns_to_label_encode = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
columns_to_onehot_encode = ['Region_Code', 'Policy_Sales_Channel']
columns_to_scale = ['Age', 'Vintage']

In [270]:
# Define a function for label encoding
def label_encode_columns(X):
    for col in columns_to_label_encode:
        X[col] = LabelEncoder().fit_transform(X[col])
    return X

In [271]:
preprocessor = ColumnTransformer(
    [
        ('Label_encoding', FunctionTransformer(label_encode_columns, validate=False), columns_to_label_encode),
        ('Onehot_encoding', OneHotEncoder(), columns_to_onehot_encode),
        ('min_max_scaling', MinMaxScaler(), columns_to_scale)
    ]
)

In [272]:
X_train = preprocessor.fit_transform(X_train)

In [273]:
random_under_sampler = RandomUnderSampler()
X_resampled, y_resampled = random_under_sampler.fit_resample(X_train, y_train)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import xgbclassifier

lr = LogisticRegression()
lr.fit(X_train, y_train)

In [278]:
X_test = preprocessor.transform(X_test)

In [280]:
y_pred = lr.predict(X_test)

In [283]:
from sklearn.metrics import accuracy_score, classification_report

accuracy_score(y_pred, y_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      0.88      0.93     95268
           1       0.00      0.30      0.00        10

    accuracy                           0.88     95278
   macro avg       0.50      0.59      0.47     95278
weighted avg       1.00      0.88      0.93     95278

