#### Required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import threading
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import roc_curve, auc, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

### Visualization

Load the train and test datasets and start exploring them. <br>
The train data has labels; the test data does not.


In [None]:
# Locations of the training and test CSV files.
train_file_path = 'train.csv'
test_file_path = 'test.csv'

# Read both files into DataFrames in one pass (train first, then test).
train_df, test_df = map(pd.read_csv, (train_file_path, test_file_path))

# Report the (rows, columns) shape of each frame, one per line.
print(train_df.shape, test_df.shape, sep='\n')

In [None]:
# Summarize the test frame: column names, dtypes and non-null counts.
test_df.info()

In [None]:
# Summarize the train frame: column names, dtypes and non-null counts.
train_df.info()

There are no missing values in my data

Feature explanation:

**id**	 Unique ID for the customer <br>
**Gender**	 Gender of the customer <br>
**Age**	 Age of the customer <br>
**Driving_License**	 0 : Customer does not have DL, 1 : Customer already has DL <br>
**Region_Code**	 Unique code for the region of the customer <br>
**Previously_Insured**	 1 : Customer already has Vehicle Insurance, 0 : Customer doesn't have Vehicle Insurance <br>
**Vehicle_Age**	 Age of the Vehicle <br>
**Vehicle_Damage**	 1 : Customer got his/her vehicle damaged in the past. 0 : Customer didn't get his/her vehicle damaged in the past. <br>
**Annual_Premium**	 The amount customer needs to pay as premium in the year <br>
**Policy_Sales_Channel**	Anonymized code for the channel used to reach the customer, e.g. different agents, over mail, over phone, in person, etc. <br>
**Vintage**	 Number of Days, Customer has been associated with the company <br>
**Response**	1 : Customer is interested, 0 : Customer is not interested <br>

In [None]:
# Preview the first 10 rows of the training data.
train_df.head(10)

#### Explore the categorical features

In [None]:
# Count how many training rows fall into each Gender category.
print(train_df['Gender'].value_counts())

In [None]:
# Bar chart of the number of training rows per Gender category.
value_counts = train_df['Gender'].value_counts()

plt.figure(figsize=(4, 4))
plt.bar(
    x=value_counts.index,
    height=value_counts.values,
    color=['mediumturquoise', 'orchid'],
)
plt.xlabel('Gender')
plt.ylabel('Distribution')
plt.show()

In [None]:
# Count how many training rows fall into each Vehicle_Age category.
print(train_df['Vehicle_Age'].value_counts())

In [None]:
# Pie chart of the share of training rows in each Vehicle_Age category.
# The counts are computed once so the wedge sizes and labels come from the
# same Series.
vehicle_age_counts = train_df['Vehicle_Age'].value_counts()

plt.figure(figsize=(4, 4))
plt.pie(
    vehicle_age_counts,
    labels=vehicle_age_counts.index,
    autopct='%1.1f%%',
    colors=['#66b3ff', '#99ff99', '#ffcc99'],
)
plt.show()

In [None]:
# Count how many training rows fall into each Vehicle_Damage category.
print(train_df['Vehicle_Damage'].value_counts())

Convert the two-level categorical features (`Gender`, `Vehicle_Damage`) to numeric 0/1 values

In [None]:
def convert_feature_to_binary(df, mapping_key, value:str):
    """Replace the labels of one column of ``df`` with numeric codes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column named ``value`` is overwritten.
    mapping_key : dict
        Maps every label that occurs in the column to its numeric code.
    value : str
        Name of the column to convert.

    Returns
    -------
    None
        The frame is modified in place.

    Raises
    ------
    ValueError
        If the column holds a non-null value that has no entry in
        ``mapping_key`` (and the column is not already purely 0/1).
        The frame is left untouched in that case.
    """
    column = df[value]

    # Already converted (e.g. the cell was re-run): nothing to do.
    if set(column).issubset({0, 1}):
        return

    # Series.map turns every value without a mapping entry into NaN. Doing so
    # silently would corrupt the feature, and a later re-run would then wipe
    # the remaining 0/1 values as well, so fail loudly before mutating.
    unmapped = set(column.dropna().unique()) - set(mapping_key)
    if unmapped:
        raise ValueError(
            f"Column {value!r} contains values with no entry in the mapping: "
            f"{sorted(map(repr, unmapped))}"
        )

    df[value] = column.map(mapping_key)

In [None]:
# Encode Gender with the same label -> code mapping in both frames.
gender_mapping = {'Female': 1, 'Male': 0}
for frame in (train_df, test_df):
    convert_feature_to_binary(frame, gender_mapping, 'Gender')


In [None]:
# Encode Vehicle_Damage with the same label -> code mapping in both frames.
Vehicle_mapping = {'Yes': 1, 'No': 0}
for frame in (train_df, test_df):
    convert_feature_to_binary(frame, Vehicle_mapping, 'Vehicle_Damage')

Handle the multi-level categorical feature `Vehicle_Age` with one-hot encoding

In [None]:
# Shared one-hot encoder. drop='first' keeps k-1 dummies out of k categorical
# levels. The `sparse` keyword was renamed to `sparse_output` in scikit-learn
# 1.2 and removed in 1.4, so try the new name first and fall back to the old
# one on releases that do not know it.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')


def encode_categoric_feature(df, feature, fit=True):
    """Return a new frame in which ``feature`` is replaced by one-hot columns.

    Uses the module-level ``encoder``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column ``feature``.
    feature : str
        Name of the categorical column to encode.
    fit : bool, default True
        When True the encoder is (re)fitted on ``df[feature]`` before
        transforming. Pass False to reuse the categories learned from an
        earlier call, e.g. to encode another frame with the fitted encoder.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``feature``, followed by the integer dummy columns.

    Raises
    ------
    KeyError
        If ``feature`` is not a column of ``df``.
    """
    transform = encoder.fit_transform if fit else encoder.transform
    encoded_array = transform(df[[feature]])

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever df has a non-default index.
    df_encoded = pd.DataFrame(
        encoded_array,
        columns=encoder.get_feature_names_out([feature]),
        index=df.index,
    ).astype(int)

    return pd.concat([df.drop(feature, axis=1), df_encoded], axis=1)


# Guard on the raw column BEFORE touching it, so re-running this cell after the
# column has been replaced is a no-op instead of a KeyError.
# Note: a re-run still recreates `encoder` unfitted; reload the data and run
# from the top if the fitted encoder is needed afterwards.
if 'Vehicle_Age' in train_df.columns:
    train_df = encode_categoric_feature(train_df, 'Vehicle_Age')

    # Keep the dummy columns available under their original name.
    train_df_encoded = train_df[list(encoder.get_feature_names_out(['Vehicle_Age']))]
    print(train_df_encoded.head())

# NOTE(review): test_df still holds the raw Vehicle_Age labels at this point.
# If no later cell encodes it, use
# encode_categoric_feature(test_df, 'Vehicle_Age', fit=False) so that train and
# test share the same columns -- confirm against the rest of the notebook.

In [None]:
# Preview the first rows of the training frame after the encoding steps above.
train_df.head()

In [None]:
# Preview the first 10 rows of the test frame.
test_df.head(10)