## Imports

In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_validate
from sklearn.linear_model import LogisticRegression

In [72]:


# Load the preprocessed dataset and separate the features (X) from the target (y).
data = pd.read_csv('preprocessed_data.csv')

X = data.drop('Attrition', axis=1)
y = data['Attrition']

# Keep 75% of the rows for training. The split is stratified on y because the
# dataset is imbalanced: both subsets keep the same class proportions.
# Stratification does NOT fix the imbalance, it only preserves it.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
    stratify=y,
)

# Sanity checks: subset shapes and the share of positive (== 1) labels in each.
print(f'the size of the training dataset is {X_train.shape}, and the test one is {X_test.shape}')

print(f'percentage of attrition in Training is {100 * len(y_train[y_train==1])/len(y_train)}% and in test {100 * len(y_test[y_test==1])/len(y_test)}% ')

the size of the training dataset is (1102, 40), and the test one is (368, 40)
percentage of attrition in Training is 16.1524500907441% and in test 16.032608695652176% 


## Models

We will evaluate several classification models and collect the results in a DataFrame.

For each model we'll try to retrieve as much info as possible.

All models are evaluated on the same data splits so that comparisons are fair.

The next step after splitting the data into training and test sets is to understand how each model behaves.

This is done first with cross-validation on the training data only.


In [73]:
# Metrics computed for every model: short label -> scikit-learn scorer name.
# The key order matches the column order of `results` below.
scoring = dict(
    auc="roc_auc",
    f1="f1",
    precision="precision",
    recall="recall",
    accuracy="accuracy",
)

# Empty summary table with one column per averaged metric.
results = pd.DataFrame(
    columns=[
        'Mean_AUC',
        'Mean_F1',
        'Mean_Precision',
        'Mean_Recall',
        'Mean_Accuracy',
    ]
)

In [74]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified splitter. shuffle=True shuffles the data before splitting,
# which protects against rows that happen to be ordered by class.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=425,
)


In [81]:
def crossValidate(model_name, model, X_train, y_train, cross_info):
    """Cross-validate ``model`` and record its mean scores in the global ``results`` table.

    Parameters
    ----------
    model_name : label used as the row index in ``results``.
    model : estimator handed to ``cross_validate``.
    X_train, y_train : features and target passed as ``X`` and ``y``.
    cross_info : cross-validation strategy passed as ``cv``.

    Side effects
    ------------
    Writes (or overwrites) the row ``results.loc[model_name]`` with the mean of
    each metric listed in the global ``scoring`` dict, in ``scoring`` order.
    The columns of ``results`` are expected to follow that same order.

    Returns
    -------
    None
    """
    cv_results = cross_validate(
        estimator=model,
        X=X_train,
        y=y_train,
        cv=cross_info,
        scoring=scoring,
    )

    # Look each metric up by its "test_<name>" key instead of slicing the result
    # dict by position: the row then stays aligned with `scoring` regardless of
    # which extra entries (fit_time, score_time, ...) cross_validate returns.
    results.loc[model_name] = [
        cv_results[f"test_{metric}"].mean() for metric in scoring
    ]

"""
In logistic regression :
    C = 1/λ (where λ is the regularization strength)
    Inverse relationship: Smaller C means stronger regularization, larger C means weaker regularization
"""

# Candidate logistic-regression variants, keyed by the row label used in `results`.
models = dict(
    # No regularization.
    simple_logistic_regression=LogisticRegression(
        penalty=None,
        max_iter=1000,
        random_state=42,
    ),
    # L1 (lasso) penalty; the default solver lbfgs doesn't support l1, so liblinear is used.
    logistic_regression_lasso=LogisticRegression(
        penalty='l1',
        solver='liblinear',
        C=1,
        max_iter=1000,
        random_state=42,
    ),
    # L2 (ridge) penalty with the default solver.
    logistic_regression_ridge=LogisticRegression(
        penalty='l2',
        C=1,
        max_iter=1000,
        random_state=42,
    ),
    # Elastic net (equal mix of l1 and l2); only saga supports elasticnet.
    logistic_elastic_net=LogisticRegression(
        penalty='elasticnet',
        solver='saga',
        C=1,
        l1_ratio=0.5,
        max_iter=1000,
        random_state=42,
    ),
)

In [82]:
# Cross-validate every candidate model; each call writes one row into `results`.
for model_name, model in models.items():
    crossValidate(
        model_name=model_name,
        model=model,
        X_train=X_train,
        y_train=y_train,
        cross_info=cv,
    )

# Show the summary table as the cell output.
results

Unnamed: 0,Mean_AUC,Mean_F1,Mean_Precision,Mean_Recall,Mean_Accuracy
simple_logistic_regression,0.845489,0.599558,0.746967,0.506825,0.891152
logistic_regression_lasso,0.847598,0.584464,0.76609,0.478889,0.891148
logistic_regression_ridge,0.847781,0.584744,0.771537,0.479206,0.892061
logistic_elastic_net,0.847624,0.576249,0.765281,0.467619,0.890239
