## Importing

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score
import zipfile
path = r"Data\creditcar.zip"
path = zipfile.ZipFile(path)
path = path.open('creditcard_new.csv')

In [30]:
data = pd.read_csv(path)
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,39885.0,1.385745,-0.776183,-1.276504,-1.732802,1.45601,3.169421,-0.884741,0.73074,-1.056017,...,-0.621874,-1.564708,0.123824,1.000517,0.209903,0.827916,-0.064076,0.009821,41.94,0
1,160679.0,-1.509015,1.445546,-0.288465,-2.599796,0.031683,-1.161685,0.473007,0.802321,-0.22238,...,-0.202615,-0.777421,-0.045253,-0.34845,0.00297,0.383022,0.076898,0.082978,4.0,0
2,170043.0,-1.329675,1.671851,-0.668253,-1.316477,0.888102,-0.419414,-0.257771,-2.296987,-0.658676,...,-1.505724,-0.501284,0.140584,0.347481,-0.345795,-0.072878,-0.079008,0.114091,1.29,0
3,29820.0,-1.300095,1.00477,1.226453,0.143031,0.302872,-0.473174,1.110003,0.105973,-0.758082,...,-0.357558,-1.149148,0.012885,0.003805,-0.212856,-1.001165,-0.249979,-0.272285,83.0,0
4,84933.0,-1.209733,1.249704,1.185233,-0.112195,0.388526,-0.921451,1.41046,-0.761339,0.480302,...,-0.018978,0.304157,-0.212971,0.377686,0.254317,-0.60618,-0.663626,-0.142206,28.0,0


In [3]:
X = data.drop(columns="Class")
y = data["Class"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [5]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)



In [6]:
accuracy = model.score(X_test, y_test)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)
accuracy, recall, precision, f1, confusion_mat

(0.9993270327998361,
 0.6949152542372882,
 0.8913043478260869,
 0.780952380952381,
 array([[34113,     5],
        [   18,    41]], dtype=int64))

In [7]:
from imblearn.under_sampling import RandomUnderSampler

In [8]:
rus = RandomUnderSampler(random_state=0)
X_sample2, y_sample2 = rus.fit_sample(X_train, y_train)
model_rus = LogisticRegression()

In [9]:
model_rus.fit(X_sample2, y_sample2)
y_pred = model_rus.predict(X_test)



In [10]:
accuracy_rus = model_rus.score(X_test, y_test)
recall_rus = recall_score(y_test, y_pred)
precision_rus = precision_score(y_test, y_pred)
f1_rus = f1_score(y_test, y_pred)
confusion_mat_rus = confusion_matrix(y_test, y_pred)
accuracy_rus, recall_rus, precision_rus, f1_rus, confusion_mat_rus

(0.9639816250694911,
 0.9152542372881356,
 0.0421875,
 0.08065720687079911,
 array([[32892,  1226],
        [    5,    54]], dtype=int64))

In [11]:
from imblearn.under_sampling import ClusterCentroids

In [12]:
cc = ClusterCentroids(random_state=0)
X_sample3, y_sample3 = cc.fit_sample(X_train, y_train)
model_cc = LogisticRegression()
model_cc.fit(X_sample3, y_sample3)
y_pred = model_cc.predict(X_test)



In [13]:
accuracy_cc = model_cc.score(X_test, y_test)
recall_cc = recall_score(y_test, y_pred)
precision_cc = precision_score(y_test, y_pred)
f1_cc = f1_score(y_test, y_pred)
confusion_mat_cc = confusion_matrix(y_test, y_pred)
accuracy_cc, recall_cc, precision_cc, f1_cc, confusion_mat_cc

(0.894870819557012,
 0.9661016949152542,
 0.015625,
 0.03075263015915835,
 array([[30527,  3591],
        [    2,    57]], dtype=int64))

In [14]:
from imblearn.under_sampling import TomekLinks

In [16]:
tl = TomekLinks(random_state=0)
X_sample4, y_sample4 = tl.fit_sample(X_train, y_train)
model_tl = LogisticRegression(random_state=0)
model_tl.fit(X_sample4, y_sample4)
y_pred = model_tl.predict(X_test)



In [17]:
accuracy_tl = model_tl.score(X_test, y_test)
recall_tl = recall_score(y_test, y_pred)
precision_tl = precision_score(y_test, y_pred)
f1_tl = f1_score(y_test, y_pred)
confusion_mat_tl = confusion_matrix(y_test, y_pred)
accuracy_tl, recall_tl, precision_tl, f1_tl, confusion_mat_tl

(0.999473330017263,
 0.7796610169491526,
 0.9019607843137255,
 0.8363636363636364,
 array([[34113,     5],
        [   13,    46]], dtype=int64))

In [18]:
from imblearn.over_sampling import RandomOverSampler

In [19]:
ros = RandomOverSampler(random_state=0)
X_sample5, y_sample5 = ros.fit_sample(X_train, y_train)
model_ros = LogisticRegression(random_state=0)
model_ros.fit(X_sample5, y_sample5)
y_pred = model_ros.predict(X_test)



In [20]:
accuracy_ros = model_ros.score(X_test, y_test)
recall_ros = recall_score(y_test, y_pred)
precision_ros = precision_score(y_test, y_pred)
f1_ros = f1_score(y_test, y_pred)
confusion_mat_ros = confusion_matrix(y_test, y_pred)
accuracy_ros, recall_ros, precision_ros, f1_ros, confusion_mat_ros

(0.9688094332445797,
 0.9152542372881356,
 0.0484304932735426,
 0.0919931856899489,
 array([[33057,  1061],
        [    5,    54]], dtype=int64))

In [21]:
from imblearn.over_sampling import SMOTE

In [22]:
smote = SMOTE(random_state=0)
X_sample6, y_sample6 = smote.fit_sample(X_train, y_train)
model_smote = LogisticRegression(random_state=0)
model_smote.fit(X_sample6, y_sample6)
y_pred = model_smote.predict(X_test)



In [23]:
accuracy_smote = model_smote.score(X_test, y_test)
recall_smote = recall_score(y_test, y_pred)
precision_smote = precision_score(y_test, y_pred)
f1_smote = f1_score(y_test, y_pred)
confusion_mat_smote = confusion_matrix(y_test, y_pred)
accuracy_smote, recall_smote, precision_smote, f1_smote, confusion_mat_smote

(0.9888814114755538,
 0.8983050847457628,
 0.12412177985948478,
 0.21810699588477367,
 array([[33744,   374],
        [    6,    53]], dtype=int64))

## Car insurance claim

In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [31]:
path = r"Data\car_insurance_claim.csv"

In [95]:
df = pd.read_csv(path)
df.head()

Unnamed: 0,ID,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,GENDER,...,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CLM_AMT,CAR_AGE,CLAIM_FLAG
0,63581743,0,60.0,0,11.0,"$67,349",No,$0,z_No,M,...,11,Minivan,yes,"$4,461",2,No,3,$0,18.0,0
1,132761049,0,43.0,0,11.0,"$91,449",No,"$257,252",z_No,M,...,1,Minivan,yes,$0,0,No,0,$0,1.0,0
2,921317019,0,48.0,0,11.0,"$52,881",No,$0,z_No,M,...,1,Van,yes,$0,0,No,2,$0,10.0,0
3,727598473,0,35.0,1,10.0,"$16,039",No,"$124,191",Yes,z_F,...,4,z_SUV,no,"$38,690",2,No,3,$0,10.0,0
4,450221861,0,51.0,0,14.0,,No,"$306,251",Yes,M,...,7,Minivan,yes,$0,0,No,0,$0,6.0,0


In [105]:
df[['INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM','CLM_AMT']] = df[['INCOME','HOME_VAL','BLUEBOOK','OLDCLAIM','CLM_AMT']].replace({'\$': '', ',': ''}, regex=True)

In [107]:
X = df.drop(columns="CLAIM_FLAG")
y = df["CLAIM_FLAG"]

In [108]:
count = y.value_counts()

In [145]:
X_train,X_test,y_train,y_test = train_test_split(X, y, random_state=6, test_size=0.3)

In [146]:
X_train = X_train.astype({"INCOME" : float, "HOME_VAL" : float, "BLUEBOOK" : float, "OLDCLAIM" : float, "CLM_AMT" : float})
X_test = X_test.astype({"INCOME" : float, "HOME_VAL" : float, "BLUEBOOK" : float, "OLDCLAIM" : float, "CLM_AMT" : float})

In [147]:
X_train = X_train.dropna(subset=['YOJ','OCCUPATION'])
X_test = X_test.dropna(subset=['YOJ','OCCUPATION'])
y_train = y_train[X_train.index]
y_test = y_test[X_test.index]

In [149]:
X_train[["AGE","CAR_AGE","INCOME", "HOME_VAL"]] = X_train[["AGE","CAR_AGE","INCOME", "HOME_VAL"]].fillna(X_train[["AGE","CAR_AGE","INCOME", "HOME_VAL"]].mean())
X_test[["AGE","CAR_AGE","INCOME", "HOME_VAL"]] = X_test[["AGE","CAR_AGE","INCOME", "HOME_VAL"]].fillna(X_test[["AGE","CAR_AGE","INCOME", "HOME_VAL"]].mean())

In [151]:
from sklearn.preprocessing import LabelEncoder
columns = ["PARENT1","MSTATUS","GENDER","EDUCATION","OCCUPATION","CAR_USE","CAR_TYPE","RED_CAR","REVOKED"]

In [152]:
for col in columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

In [154]:
model = LogisticRegression(random_state=6)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = model.score(X_test, y_test)
score



0.7427113702623906

In [155]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

In [156]:
smote = SMOTE(random_state=9)
X_train, y_train = smote.fit_sample(X_train, y_train)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [158]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = model.score(X_test, y_test)
score



0.9897959183673469