In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [21]:
df = pd.read_csv('aug_train.csv')
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,167647,Male,22,1,7.0,1,< 1 Year,No,2630.0,152.0,16,0
1,17163,Male,42,1,28.0,0,1-2 Year,Yes,43327.0,26.0,135,0
2,32023,Female,66,1,33.0,0,1-2 Year,Yes,35841.0,124.0,253,0
3,87447,Female,22,1,33.0,0,< 1 Year,No,27645.0,152.0,69,0
4,501933,Male,28,1,46.0,1,< 1 Year,No,29023.0,152.0,211,0


In [22]:
df["Gender"] = pd.Categorical(df["Gender"]).codes
df["Vehicle_Age"] = pd.Categorical(df["Vehicle_Age"]).codes
df["Vehicle_Damage"] = pd.Categorical(df["Vehicle_Damage"]).codes
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,167647,1,22,1,7.0,1,1,0,2630.0,152.0,16,0
1,17163,1,42,1,28.0,0,0,1,43327.0,26.0,135,0
2,32023,0,66,1,33.0,0,0,1,35841.0,124.0,253,0
3,87447,0,22,1,33.0,0,1,0,27645.0,152.0,69,0
4,501933,1,28,1,46.0,1,1,0,29023.0,152.0,211,0


In [23]:
df["Response"].value_counts()

Response
0    319553
1     62601
Name: count, dtype: int64

here we have the target response 0 and 1 where 1 is the minority class

In [24]:
train_X = df.drop("Response",axis=1)
train_X.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,167647,1,22,1,7.0,1,1,0,2630.0,152.0,16
1,17163,1,42,1,28.0,0,0,1,43327.0,26.0,135
2,32023,0,66,1,33.0,0,0,1,35841.0,124.0,253
3,87447,0,22,1,33.0,0,1,0,27645.0,152.0,69
4,501933,1,28,1,46.0,1,1,0,29023.0,152.0,211


In [25]:
train_Y = df["Response"]
train_Y.head()

0    0
1    0
2    0
3    0
4    0
Name: Response, dtype: int64

In [26]:
train_X,test_X = train_test_split(train_X,test_size=0.2,random_state=42)
train_Y,test_Y = train_test_split(train_Y,test_size=0.2,random_state=42)

In [27]:
lg_model = LogisticRegression()
lg_model = lg_model.fit(train_X,train_Y)

In [29]:
pred = lg_model.predict(test_X)

print(classification_report(test_Y,pred))

              precision    recall  f1-score   support

           0       0.84      0.99      0.91     63789
           1       0.38      0.03      0.05     12642

    accuracy                           0.83     76431
   macro avg       0.61      0.51      0.48     76431
weighted avg       0.76      0.83      0.77     76431



as we can see since 1 is the minorty class we get a recall metric of 0.03 for that class
we will try to undersample the majorty class and see what happens

In [70]:
majority = df[df["Response"]==0]
majority.head()
new_df = df[df["Response"]==1]
e = new_df["Response"].count()
x = len(majority)
majority = majority.iloc[:e]
majority.count()
new_df = pd.concat([new_df,majority])

new_df["Response"].value_counts()

Response
1    62601
0    62601
Name: count, dtype: int64

so we got the majority class down to the minorty class

In [71]:
train_X = new_df.drop("Response",axis=1)
train_Y = new_df["Response"]

train_X,test_X = train_test_split(train_X,test_size=0.2,random_state=42)
train_Y,test_Y = train_test_split(train_Y,test_size=0.2,random_state=42)

lg_model.fit(train_X,train_Y)

pred = lg_model.predict(test_X)

print(classification_report(test_Y,pred))

              precision    recall  f1-score   support

           0       0.63      0.66      0.65     12430
           1       0.65      0.62      0.63     12611

    accuracy                           0.64     25041
   macro avg       0.64      0.64      0.64     25041
weighted avg       0.64      0.64      0.64     25041



sure we lost the recall on 0 but 1 was our target and now we have an actual classifier for insurance response