#### Required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline
import threading
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, auc, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

### Visualization

Open the training dataset and start exploring it. <br>
The training data has labels; the test data does not.


In [None]:
train_file_path = 'train.csv'
train_df = pd.read_csv(train_file_path)
print(train_df.shape)

In [None]:
train_df.info()

In [None]:
train_df.isnull().sum()

There are no missing values in my data

#### Feature explanation:

**id**	 Unique ID for the customer <br>
**Gender**	 Gender of the customer <br>
**Age**	 Age of the customer <br>
**Driving_License**	 0 : Customer does not have DL, 1 : Customer already has DL <br>
**Region_Code**	 Unique code for the region of the customer <br>
**Previously_Insured**	 1 : Customer already has Vehicle Insurance, 0 : Customer doesn't have Vehicle Insurance <br>
**Vehicle_Age**	 Age of the Vehicle <br>
**Vehicle_Damage**	 1 : Customer got his/her vehicle damaged in the past. 0 : Customer didn't get his/her vehicle damaged in the past. <br>
**Annual_Premium**	 The amount customer needs to pay as premium in the year <br>
**Policy_Sales_Channel**	Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc. <br>
**Vintage**	 Number of Days, Customer has been associated with the company <br>
**Response**	1 : Customer is interested, 0 : Customer is not interested <br>

In [None]:
train_df.head(10)

#### Explore the categorical features

In [None]:
train_df['Gender'].value_counts()

In [None]:
def show_feature_dist(df, feature):
    """Show a bar chart with the number of rows per distinct value of `feature`.

    Args:
        df: DataFrame that contains the column `feature`.
        feature: name of the column whose value counts are plotted.
    """
    bar_colors = ['mediumturquoise', 'orchid']
    plt.figure(figsize=(4, 4))
    counts = df[feature].value_counts()
    # x positions are the distinct values, bar heights are their frequencies
    plt.bar(counts.index, counts.values, color=bar_colors)
    plt.xlabel(feature)
    plt.ylabel('Distribution')
    plt.show()

show_feature_dist(train_df, 'Gender')

In [None]:
train_df['Vehicle_Age'].value_counts()

In [None]:
def vehicle_age_pie_chart(df):
    """Show a pie chart of the share of each `Vehicle_Age` category in `df`.

    Args:
        df: DataFrame that contains a `Vehicle_Age` column.
    """
    # Compute the counts once and take both the wedge sizes and the labels from
    # the frame that was passed in, so the labels always match the wedges
    # (the labels previously came from the global train_df).
    age_counts = df['Vehicle_Age'].value_counts()
    plt.figure(figsize=(4, 4))
    plt.pie(
        age_counts,
        labels=age_counts.index,
        autopct='%1.1f%%',
        colors=['#66b3ff', '#99ff99', '#ffcc99'],
    )
    plt.show()

vehicle_age_pie_chart(train_df)

In [None]:
train_df['Vehicle_Damage'].value_counts()

In [None]:
train_df['Driving_License'].value_counts()

Convert the categorical feature to numeric

In [None]:
def convert_feature_to_binary(df, mapping_key, value:str):
    """Map the values of column `value` through `mapping_key`, modifying `df` in place.

    The column is left untouched when every value it holds is already 0 or 1,
    so running the conversion a second time does not re-map converted data.

    Args:
        df: DataFrame to modify.
        mapping_key: dict from original values to their replacement.
        value: name of the column to convert.
    """
    already_binary = set(df[value]) <= {0, 1}
    if already_binary:
        return
    df[value] = df[value].map(mapping_key)

In [None]:
gender_mapping = {'Female': 1, 'Male': 0}
convert_feature_to_binary(train_df, gender_mapping, 'Gender')

In [None]:
Vehicle_mapping = {'Yes': 1, 'No': 0}
convert_feature_to_binary(train_df, Vehicle_mapping, 'Vehicle_Damage')

Handle the categorical feature Vehicle_Age. <br>
I need to encode this categorical feature so that the models can work with it.

In [None]:
def encode_categoric_feature(df, feature)-> pd.DataFrame:
    """One-hot encode `feature` and return a new frame in which it is replaced.

    The first category is dropped to avoid multicollinearity between the
    indicator columns. `df` itself is not modified.

    Args:
        df: DataFrame that contains the column `feature`.
        feature: name of the categorical column to encode.

    Returns:
        A copy of `df` without `feature`, followed by the integer indicator
        columns that are not already present in `df`.
    """
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed
    # later; try the current name first and fall back for older releases.
    try:
        encoder = OneHotEncoder(sparse_output=False, drop='first')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, drop='first')
    df_encoded_array = encoder.fit_transform(df[[feature]])

    # Reuse df's index so the column-wise concat below aligns row by row even
    # when df does not carry a default RangeIndex (e.g. after filtering).
    df_encoded = pd.DataFrame(
        df_encoded_array,
        columns=encoder.get_feature_names_out([feature]),
        index=df.index,
    ).astype(int)

    # Only add indicator columns that df does not already contain. This also
    # guarantees a return value on every path (the result used to be unbound
    # when the first encoded column already existed in df).
    missing_columns = [col for col in df_encoded.columns if col not in df.columns]
    new_df = pd.concat([df.drop(columns=[feature]), df_encoded[missing_columns]], axis=1)
    return new_df

new_df = encode_categoric_feature(train_df, 'Vehicle_Age')

In [None]:
new_df.head(10)

The id column holds unique values that identify the observations, but it plays no role in visualizing or processing the data, so I am going to drop it.

In [None]:
# Keep the ids in their own variable before removing the column from the features.
id_column = new_df['id']
# drop() returns a new frame, so new_df itself still contains the 'id' column.
my_df = new_df.drop('id', axis=1)
my_df.head(10)

In [None]:
my_df.dtypes

#### Correlation Matrix

In [None]:
# Open an 8x8 inch figure; the heatmap below is drawn into it.
matrix_fig = plt.figure(figsize=(8, 8))

# Annotated Pearson correlation of the numeric columns, with the color scale fixed to [-1, 1].
sns.heatmap(data=my_df.corr(method="pearson", numeric_only=True), vmin=-1, vmax=1, annot=True, cmap="YlGnBu")

We can see a strong negative correlation between the feature Vehicle_Damage (vehicle damaged in the past) and the feature Previously_Insured.<br> Several reasons might explain this. For example, the company may not want to insure a person who has damaged their car (it may indicate that the person does not drive carefully), or the insurance may be more expensive because the car was damaged, so it is not worthwhile for the customer to insure the vehicle.

In [None]:
def feature_dist(df, ncols=3):
    """Plot a histogram with a KDE curve for every column of `df` on one grid.

    The number of grid rows is derived from the number of columns, so the
    function works for any frame width; grid cells that are not needed are
    hidden. With 12 columns and the default `ncols` this gives a 4x3 grid on
    a 15x15 inch figure.

    Args:
        df: DataFrame whose columns are plotted.
        ncols: number of subplot columns in the grid.
    """
    columns = list(df.columns)
    if not columns:
        # Nothing to draw for a frame without columns.
        return

    nrows = -(-len(columns) // ncols)  # ceiling division
    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols,
                             figsize=(5 * ncols, 3.75 * nrows), squeeze=False)
    axes = axes.flatten()
    for ax, col in zip(axes, columns):
        # the KDE curve gives a smooth, continuous view of the distribution
        sns.histplot(df[col], kde=True, ax=ax)
        ax.set_title(f"Distribution of {col} Data")
    # hide the grid cells left over when the columns do not fill the last row
    for ax in axes[len(columns):]:
        ax.set_visible(False)
    # tight_layout keeps the subplots from overlapping inside the figure
    plt.tight_layout()
    plt.show()

feature_dist(my_df)

In [None]:
# Columns treated as numeric (non-binary) features in the box plots.
numeric_feature = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']

def display_box_plot(df, feature_list):
    """Show one 500x500 plotly box plot per column named in `feature_list`.

    Args:
        df: DataFrame that contains the listed columns.
        feature_list: iterable of column names to plot.
    """
    for column in feature_list:
        box_fig = px.box(df, y=column, title=f"Distrubution of {column}")
        # update_layout returns the figure, so it can be shown directly
        box_fig.update_layout(height=500, width=500).show()

display_box_plot(my_df,numeric_feature)

As we can see in the box plot of Annual_Premium, there are outlier values.<br>
The feature also does not appear to be normally distributed, so I need to consider standardizing it.

### Preprocessing

* split the data into train and test sets, so I can evaluate my model's performance later
* standardize the data, which puts the features on the same scale
* handle outliers
* reduce dimensionality - optional, I should consider it

In [None]:
# Work on a copy so my_df stays as it was for the earlier cells.
df_copy = my_df.copy()
# Features are every column except the target 'Response'.
X = df_copy.drop(columns=['Response'])
y = df_copy['Response']
# Shuffled 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, shuffle=True, random_state=42
)

In [None]:
print(f"X train set shape: {X_train.shape}, y train set shape: {y_train.shape}")
print(f"X test set shape: {X_test.shape}, y test set shape: {y_test.shape}")

In [None]:
X_train.head()

#### Standardizing the data <br>
It is important to standardize the data before using algorithms that rely on distances between data points.

In [None]:
def scale_the_df(train_df, df_to_scale):
    """Scale `df_to_scale` with a RobustScaler fitted on `train_df`.

    Args:
        train_df: DataFrame the scaler statistics are learned from.
        df_to_scale: DataFrame to transform with those statistics.

    Returns:
        A DataFrame with the scaled values and the row and column labels of
        `df_to_scale`.
    """
    fitted_scaler = RobustScaler().fit(train_df)
    # transform() returns a bare NumPy array, so the labels are re-attached
    return pd.DataFrame(
        fitted_scaler.transform(df_to_scale),
        index=df_to_scale.index.tolist(),
        columns=df_to_scale.columns.tolist(),
    )

# The scaler is fitted on the training split in both calls.
X_train_scaled = scale_the_df(train_df=X_train, df_to_scale=X_train)
X_test_scaled = scale_the_df(train_df=X_train, df_to_scale=X_test)


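RobustScaler centers each feature on its median and scales it by its interquartile range, so outliers have much less influence on the scaling (the outliers themselves are not removed).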

In [None]:
X_train_scaled.head()

In [None]:

X_test_scaled.head()

Let's look at the box plots of the numeric features again. <br>
First I will combine the scaled train and test data, and then draw all the plots.

In [None]:
scaled_df = pd.concat([X_train_scaled,X_test_scaled], axis=0)

In [None]:
scaled_df.head(10)

In [None]:
display_box_plot(scaled_df,numeric_feature)

### Models

* Use grid search to find the best parameters for the 3 models that I chose
* Evaluate the models using K-fold cross-validation and the F1 score
* Display the ROC curve of every model and its AUC score
* Show the confusion matrix of the best model


#### Use Grid Search method

In [None]:
# Define the hyperparameter grids for each model
# Hyperparameter grids searched for each model.
svm_param_grid = {'C': [0.01, 0.1, 1]}
knn_param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
random_forest_param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimators handed to the grid search.
svm_classifier = SVC()
knn_classifier = KNeighborsClassifier()
# Seed the forest so the search and every later score are reproducible on a re-run.
rf_classifier = RandomForestClassifier(random_state=42)

About SVM model: <br>
Small C: A larger margin but more misclassifications are allowed.<br>
Large C: A smaller margin but fewer misclassifications are allowed.

#### Perform the grid searches in a thread pool to speed up the process

In [None]:
from concurrent.futures import ThreadPoolExecutor
cv_param = 3

# Put the scaled *training* features next to their labels. The two share the
# same row index, so a column-wise concat keeps every row paired with its own
# label. (Stacking X_test_scaled on top of y_train row-wise produced a frame
# that was mostly NaN and whose features and labels did not match.)
df = pd.concat([X_train_scaled, y_train], axis=1)
subset_fraction = 0.1
# Create a random subset of the data so the grid search will be faster
subset_df = df.sample(frac=subset_fraction, random_state=42)
# The label is the last column; everything before it is a feature.
subset_y = subset_df.iloc[:, -1]
subset_x = subset_df.iloc[:, :-1]
# Report the shapes instead of dumping the whole frames into the output.
print(f"subset features shape: {subset_x.shape}, subset labels shape: {subset_y.shape}")

def use_grid_search(model, grid_params: dict, X, y):
    """Fit a GridSearchCV for `model` over `grid_params` and return it.

    The search is scored on accuracy, uses the notebook-level `cv_param` as
    its `cv` argument, and runs with n_jobs=-1.

    Args:
        model: estimator to tune.
        grid_params: parameter grid to search.
        X: training features.
        y: training labels.

    Returns:
        The fitted GridSearchCV object.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=grid_params,
        scoring='accuracy',
        cv=cv_param,
        n_jobs=-1,
    )
    # fit() returns the fitted search object itself
    return search.fit(X, y)

# One (estimator, parameter grid) pair per search, in submission order.
search_jobs = [
    (svm_classifier, svm_param_grid),
    (knn_classifier, knn_param_grid),
    (rf_classifier, random_forest_param_grid),
]

# Submit the three searches to a pool with one worker thread each.
with ThreadPoolExecutor(max_workers=3) as executor:
    futures = [
        executor.submit(use_grid_search, estimator, param_grid, subset_x, subset_y)
        for estimator, param_grid in search_jobs
    ]

# Leaving the `with` block waits for the submitted tasks, so the results are ready here.
svm_future, knn_future, rf_future = futures
svm_grid = svm_future.result()
knn_grid = knn_future.result()
random_forest_grid = rf_future.result()


In [None]:
# Re-run the SVM search on the whole scaled training split; this rebinds svm_grid.
svm_grid = use_grid_search(svm_classifier, svm_param_grid, X_train_scaled, y_train)

In [None]:
# Re-run the KNN search on the whole scaled training split; this rebinds knn_grid.
knn_grid = use_grid_search(knn_classifier, knn_param_grid, X_train_scaled, y_train) 

In [None]:

# Re-run the random forest search on the whole scaled training split; this rebinds random_forest_grid.
random_forest_grid = use_grid_search(rf_classifier, random_forest_param_grid, X_train_scaled, y_train)


Get the best model:

In [None]:
# Estimator refitted with the best parameters found by the SVM grid search.
best_svm_model = svm_grid.best_estimator_
# The searched model is an SVC, so the output is labelled as SVM
# (it was previously mislabelled as logistic regression).
print("SVM - Best Hyperparameter:", svm_grid.best_params_)

In [None]:
best_knn_model = knn_grid.best_estimator_
print("\nK-Nearest Neighbors - Best Hyperparameter:", knn_grid.best_params_)

In [None]:
best_random_forest_model = random_forest_grid.best_estimator_
print("\nRandom Forest - Best Hyperparameter:", random_forest_grid.best_params_)

The grid search takes a long time, so the results should be saved for future use:<br>


### K-fold cross validation evaluation

In [None]:
def cross_val_model_evaluation(X, y, models_list, k_fold=5):
    """Print the mean cross-validated accuracy of every model in `models_list`.

    Args:
        X: feature matrix used for cross-validation.
        y: labels that belong to `X`.
        models_list: iterable of estimators to evaluate.
        k_fold: value passed to cross_val_score as `cv`.
    """
    for estimator in models_list:
        fold_scores = cross_val_score(estimator, X, y, scoring='accuracy', cv=k_fold)
        # average the per-fold accuracies into one number per model
        mean_accuracy = fold_scores.mean()
        print(f"Average Accuracy of {estimator}:", mean_accuracy)

In [None]:
y_df = pd.concat([y_train, y_test], axis=0)

In [None]:
models_list = [best_svm_model, best_knn_model, best_random_forest_model]
cross_val_model_evaluation(scaled_df, y_df, models_list, k_fold=5)

Evaluate the models using the F1 score

In [None]:
def F_one_score_evaluate_models(X_test, y_test, models_list):
    """Print the F1 score of every fitted model in `models_list` on the given data.

    Args:
        X_test: features to predict on.
        y_test: true labels for `X_test`.
        models_list: iterable of fitted estimators that expose `predict`.
    """
    for estimator in models_list:
        score = f1_score(y_test, estimator.predict(X_test))
        print(f"f1 score for {estimator} is {score}")

In [None]:
F_one_score_evaluate_models(X_test_scaled, y_test, models_list)

### Display ROC curve of the models and AUC score

In [None]:
def plot_roc_curve(models, X=None, y=None):
    """Plot the ROC curve of every fitted model and print its AUC.

    Args:
        models: iterable of fitted binary classifiers.
        X: features to score; defaults to the notebook-level `X_test_scaled`.
        y: true labels for `X`; defaults to the notebook-level `y_test`.
    """
    # Fall back to the held-out split defined earlier in the notebook.
    X = X_test_scaled if X is None else X
    y = y_test if y is None else y

    for model in models:
        if hasattr(model, 'predict_proba'):
            # probability of the positive class (second column)
            scores = model.predict_proba(X)[:, 1]
        else:
            # An SVC created without probability=True has no predict_proba;
            # roc_curve also accepts the raw decision scores.
            scores = model.decision_function(X)

        fpr, tpr, thresholds = roc_curve(y, scores)
        plt.figure(figsize=(5, 5))
        plt.plot(fpr, tpr, color='darkorange', label='ROC')
        # diagonal reference line
        plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        # name the model so each figure can be read on its own
        plt.title(f'ROC curve - {type(model).__name__}')
        plt.legend()
        plt.show()
        print('AUC:%.2f' % auc(fpr, tpr))

plot_roc_curve(models_list)