In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings 
warnings.filterwarnings("ignore")

# Load the car-insurance claims CSV into the notebook-wide `dataset` frame.
# NOTE(review): the path is hard-coded to a /kaggle/input location, so this
# cell only runs where that file exists.
dataset = pd.read_csv("/kaggle/input/car-insurance-data/Car_Insurance_Claim.csv")

# Display the loaded frame as the cell output.
dataset

In [None]:
dataset.info()

In [None]:
# Normalise every column name to lower case so that later cells can refer to
# columns uniformly (e.g. "id", "outcome", "credit_score").
col_lower = [name.lower() for name in dataset.columns]

dataset.columns = col_lower

In [None]:
# Remove the "id" column from the working frame.
dataset = dataset.drop(columns = ["id"])

In [None]:
dataset.describe()

In [None]:
len(dataset.select_dtypes("object").columns)

In [None]:
len(dataset.select_dtypes("int64").columns) + len(dataset.select_dtypes("float64").columns)

In [None]:
# Print the name of every numeric column: int64 columns first, then float64.
int_columns = dataset.select_dtypes("int64").columns
float_columns = dataset.select_dtypes("float64").columns

for i in int_columns.append(float_columns):
    print(i)

In [None]:
sns.set(font_scale = 2)

In [None]:
# One violin plot per numeric column (int64 columns first, then float64),
# laid out on a 5x2 grid. zip() stops at the shorter of columns / axes.
fig, axis = plt.subplots(5, 2, figsize = (40, 50))

int_columns = dataset.select_dtypes("int64").columns
float_columns = dataset.select_dtypes("float64").columns

for col, ax in zip(int_columns.append(float_columns), axis.flat):
    sns.violinplot(x = dataset[col], ax = ax)

In [None]:
# One filled KDE per numeric column (int64 columns first, then float64),
# split by "outcome", on a 5x2 grid. zip() stops at the shorter sequence.
fig, axis = plt.subplots(5, 2, figsize = (40, 50))

int_columns = dataset.select_dtypes("int64").columns
float_columns = dataset.select_dtypes("float64").columns

for col, ax in zip(int_columns.append(float_columns), axis.flat):
    sns.kdeplot(data = dataset, x = col, ax = ax, hue = "outcome", fill = True)

In [None]:
# One count plot per object-typed column, split by "outcome", on a 4x2 grid.
# The grid lives in `axes`; `ax` is the single panel drawn in each iteration.
fig, axes = plt.subplots(4, 2, figsize = (40, 50))

object_columns = dataset.select_dtypes("object").columns

for col, ax in zip(object_columns, axes.flat):
    sns.countplot(data = dataset, x = col, ax = ax, hue = "outcome")

In [None]:
# Remove the "postal_code" column from the working frame.
dataset = dataset.drop(columns = ["postal_code"])

In [None]:
dataset[:].isna().sum()

# Preprocessing

In [None]:
def misRes(col, data = None):
    """Flag which rows with a missing `col` value have a positive outcome.

    Parameters
    ----------
    col : str
        Name of the column whose missing (NaN) entries are inspected.
    data : pandas.DataFrame, optional
        Frame to inspect; it must contain `col` and an "outcome" column.
        Defaults to the notebook-level `dataset`.

    Returns
    -------
    pandas.Series
        Boolean Series indexed by the labels of the rows where `col` is
        missing; True where that row's outcome equals 1.0.
    """
    frame = dataset if data is None else data
    # Select with a boolean mask through .loc. Feeding index *labels* to .iloc
    # is only correct while the index is an untouched 0..n-1 RangeIndex.
    return frame.loc[frame[col].isna(), "outcome"] == 1.0

In [None]:
mis1 = misRes("credit_score")

mis1.value_counts()

In [None]:
mis2 = misRes("annual_mileage")

mis2.value_counts()

In [None]:
# Index labels flagged True in both mis1 and mis2, i.e. rows where both
# inspected columns are missing and the outcome is positive.
mis1_true_labels = mis1[mis1 == True].index
mis2_true_labels = mis2[mis2 == True].index

itsn = mis1_true_labels.intersection(mis2_true_labels)

In [None]:
dataset.iloc[itsn, : ].head(2)

In [None]:
# Drop the rows selected by `itsn` from the working frame.
dataset = dataset.drop(index = dataset.index[itsn])

In [None]:
dataset = dataset.reset_index(drop = True)

dataset

In [None]:
mis1 = misRes("credit_score")

mis1.value_counts()

In [None]:
mis2 = misRes("annual_mileage")

mis2.value_counts()

In [None]:
# Drop the rows that mis1 flags as True.
mis1_true_labels = mis1[mis1 == True].index
dataset = dataset.drop(index = dataset.index[mis1_true_labels])

In [None]:
dataset = dataset.reset_index(drop = True)

dataset

In [None]:
mis2 = misRes("annual_mileage")

mis2.value_counts()

In [None]:
# Drop the rows that mis2 flags as True.
mis2_true_labels = mis2[mis2 == True].index
dataset = dataset.drop(index = dataset.index[mis2_true_labels])

In [None]:
dataset = dataset.reset_index(drop = True)

dataset

In [None]:
dataset.info()

In [None]:
dataset.isna().sum()

In [None]:
np.mean(dataset["credit_score"])

In [None]:
# Fill the remaining missing credit scores with the column mean.
credit_score_mean = dataset["credit_score"].mean()
dataset["credit_score"] = dataset["credit_score"].fillna(credit_score_mean)

In [None]:
# Fill the remaining missing mileages with the column mean truncated to an int.
annual_mileage_fill = int(dataset["annual_mileage"].mean())
dataset["annual_mileage"] = dataset["annual_mileage"].fillna(annual_mileage_fill)

In [None]:
dataset.isna().sum()

In [None]:
dataset.info()

# Label Encoding

**Nominal Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoder used to turn the "gender" labels into integer codes.
nomialEncoder = LabelEncoder()

# Fit on the gender column and overwrite it with the resulting codes.
# NOTE(review): re-running this cell re-encodes the already-encoded column.
dataset["gender"] = nomialEncoder.fit_transform(dataset["gender"])

**Ordinal Encoding**

In [None]:
def vehicle_type_encoder(x):
    """Encode a vehicle type as a flag: 1 for "sedan", 0 for any other value."""
    return 1 if x == "sedan" else 0

dataset["vehicle_type"] = dataset["vehicle_type"].apply(vehicle_type_encoder)

dataset["vehicle_type"].unique()

In [None]:
dataset

In [None]:
def vehicle_year_encoder(x):
    """Encode a vehicle year label: 1 for "after 2015", 0 for any other value."""
    is_recent = (x == "after 2015")
    if not is_recent:
        return 0
    return 1

dataset["vehicle_year"] = dataset["vehicle_year"].apply(vehicle_year_encoder)

dataset["vehicle_year"].unique()

In [None]:
dataset

In [None]:
dataset["age"].unique()

In [None]:
def age_encoder(x):
    """Map an age bracket to an integer code.

    "65+" -> 0, "40-64" -> 1, "26-39" -> 2; every other value -> 3.
    """
    known_brackets = ("65+", "40-64", "26-39")
    for code, bracket in enumerate(known_brackets):
        if x == bracket:
            return code
    # Fallback for anything not listed above.
    return 3
    
dataset["age"] = dataset["age"].apply(age_encoder)

dataset["age"].unique()

In [None]:
dataset

In [None]:
def race_encoder(x):
    """Encode the race label: 0 for "minority", 1 for any other value."""
    return 0 if x == "minority" else 1

dataset["race"] =  dataset["race"].apply(race_encoder)

dataset["race"].unique()

In [None]:
dataset

In [None]:
def driving_exp_encoder(x):
    """Map a driving-experience bracket to an integer code.

    "30y+" -> 0, "20-29y" -> 1, "10-19y" -> 2; every other value -> 3.
    """
    known_brackets = ("30y+", "20-29y", "10-19y")
    for code, bracket in enumerate(known_brackets):
        if x == bracket:
            return code
    # Fallback for anything not listed above.
    return 3
    
dataset["driving_experience"] = dataset["driving_experience"].apply(driving_exp_encoder)

dataset["driving_experience"].unique()

In [None]:
dataset

In [None]:
def edu_encoder(x):
    """Map an education label to an integer code.

    "none" -> 0, "high school" -> 1; every other value -> 2.
    """
    known_levels = ("none", "high school")
    for code, level in enumerate(known_levels):
        if x == level:
            return code
    # Fallback for anything not listed above.
    return 2
    
dataset["education"] = dataset["education"].apply(edu_encoder)

dataset["education"].unique()

In [None]:
dataset

In [None]:
def income_encoder(x):
    """Map an income class to an ordinal rank, lowest income first.

    "poverty" -> 0, "working class" -> 1, "middle class" -> 2;
    every other value -> 3.

    The ranks increase with income so the encoded column is a true ordinal
    scale. The previous mapping placed "middle class" (1) below
    "working class" (2), which broke that ordering.
    """
    if (x == "poverty"):
        return 0
    elif (x == "working class"):
        return 1
    elif (x == "middle class"):
        return 2
    else:
        # Fallback for anything not listed above.
        return 3

dataset["income"] = dataset["income"].apply(income_encoder)

dataset["income"].unique()

In [None]:
dataset

In [None]:
dataset.info()

# Modelling

**Train Test Split**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Features are every column except the last one; the target is the last column.
# NOTE(review): presumably the last column is "outcome" - confirm column order.
feature_frame = dataset.iloc[ : , : -1]
target_series = dataset.iloc[ : , -1]

# 80 % of the rows go to the training split; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(feature_frame, target_series, train_size = .8, random_state = 2)

In [None]:
# Feature scaling is currently disabled.
# NOTE(review): if this is re-enabled, fit one scaler on x_train and reuse it
# to transform x_test; the second line below fits a separate scaler on the
# test split.
# x_train = StandardScaler().fit_transform(x_train)
# x_test = StandardScaler().fit_transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

# Candidate classifiers, keyed by the label used in the printed results and plots.
models = {
#     "Logistic": LogisticRegression(),
    "xTree": ExtraTreesClassifier(),
    "rfClassifier": RandomForestClassifier(),
    "tree": DecisionTreeClassifier(max_depth = 5, criterion = "gini"),
    "knClassifier": KNeighborsClassifier(n_neighbors = 5),
    "gBoost": GradientBoostingClassifier(),
    "Ada Boost": AdaBoostClassifier(n_estimators = 150),
    "Bagging": BaggingClassifier(n_estimators = 150),
    "xgBoost": XGBClassifier(),
    "catBoost": CatBoostClassifier(logging_level = "Silent"),
    "lightGBM": LGBMClassifier(),
    "svm": SVC(),
}

# Parallel lists, in the same order as `models`:
# test-set accuracy in percent, and the test-set predictions of each model.
accuracy_scores = []
predicted = []

# Fit every model on the training split and evaluate it on the held-out split.
for model_name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # accuracy_score takes (y_true, y_pred). Keep two decimals instead of
    # truncating to an integer, so that close models are not reported as
    # ties when the best score is picked later.
    accuracy_scores.append(round(float(accuracy_score(y_test, y_pred)) * 100, 2))
    predicted.append(y_pred)

for model_name, score in zip(models, accuracy_scores):
    print (' \n ', model_name, ' accuracy : ', score, ' %  ')

In [None]:
# Fit a fresh random forest on the training split and rank its feature
# importances from highest to lowest.
cc = RandomForestClassifier()
cc.fit(x_train, y_train)

importances = cc.feature_importances_
ascending_order = np.argsort(importances)
sorted_indices = ascending_order[::-1]

In [None]:
cc.feature_importances_

In [None]:
importances[np.argsort(importances)[::-1]]

In [None]:
import matplotlib.pyplot as plt
 
# Bar chart of the random-forest feature importances, highest first,
# labelled with the matching training-frame column names.
bar_positions = range(x_train.shape[1])
ranked_importances = importances[sorted_indices]
ranked_names = x_train.columns[sorted_indices]

plt.subplots(figsize = (30,10))
plt.title('Feature Importance')
plt.bar(bar_positions, ranked_importances, align='center')
plt.xticks(bar_positions, ranked_names, rotation = 60)
plt.tight_layout()
plt.show()

In [None]:
# Bar plot of the per-model accuracy, with each bar annotated by its value.
plt.figure(figsize = (25, 8))
model_names = list(models.keys())
ax = sns.barplot(x = model_names, y = accuracy_scores)

for bar in ax.patches:
    height = bar.get_height()
    left, bottom = bar.get_xy()
    # Centre the label horizontally and place it just above the bar top.
    label_position = (left + bar.get_width()/2, bottom + height*1.02)
    ax.annotate(f'{round(height,2)}%', label_position, ha='center')

# Report the first model that reaches the highest accuracy.
best_score = max(accuracy_scores)
print (model_names[accuracy_scores.index(best_score)], " : " ,best_score, " %")

In [None]:
from sklearn.metrics import classification_report

# Print a classification report per model.
# classification_report takes (y_true, y_pred): passing the predictions first
# swaps precision with recall and reports the support of the predictions
# instead of the support of the true labels.
for model_name, model_predictions in zip(models, predicted):
    print (' \n \n ', model_name, ' : \n \n', classification_report(y_test, model_predictions))

In [None]:
x_train

In [None]:
models["catBoost"].feature_importances_