## Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing Dataset

In [5]:
df = pd.read_csv("processed_dataset.csv")

In [7]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_of_projects,average_monthly_hours,years_worked_at_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


## Encoding Categorical Columns

In [10]:
df.columns

Index(['satisfaction_level', 'last_evaluation', 'number_of_projects',
       'average_monthly_hours', 'years_worked_at_company', 'Work_accident',
       'left', 'promotion_last_5years', 'Department', 'salary'],
      dtype='object')

In [12]:
categorical_columns = ['Work_accident', 'left', 'promotion_last_5years', 'Department', 'salary']
df = pd.get_dummies(df, columns = categorical_columns, drop_first = True)
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_of_projects,average_monthly_hours,years_worked_at_company,Work_accident_1,left_1,promotion_last_5years_1,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical,salary_low,salary_medium
0,0.38,0.53,2,157,3,False,True,False,False,False,False,False,False,False,True,False,False,True,False
1,0.8,0.86,5,262,6,False,True,False,False,False,False,False,False,False,True,False,False,False,True
2,0.11,0.88,7,272,4,False,True,False,False,False,False,False,False,False,True,False,False,False,True
3,0.72,0.87,5,223,5,False,True,False,False,False,False,False,False,False,True,False,False,True,False
4,0.37,0.52,2,159,3,False,True,False,False,False,False,False,False,False,True,False,False,True,False


## Performing a Stratified Split of Dataset

In [15]:
from sklearn.model_selection import train_test_split
X = df.drop('left_1', axis=1)
y = df['left_1']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify = y, random_state=42)

## Upsampling the Training Dataset using the SMOTE Technique

In [20]:
from imblearn.over_sampling import SMOTE
from collections import Counter
print("Original Dataset Counter : ", Counter(y_train))
smote = SMOTE(random_state = 42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print("Random Oversampling Counter : ", Counter(y_train_smote))

Original Dataset Counter :  Counter({False: 9142, True: 2857})
Random Oversampling Counter :  Counter({False: 9142, True: 9142})


## Building a Logistic Regression Model

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [25]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C' : [0.01,0.1,1,10], 'solver' : ['lbfgs', 'liblinear']}
grid = GridSearchCV(LogisticRegression(max_iter = 1000), param_grid, cv = 5, verbose = 2)
grid.fit(X_train_smote, y_train_smote)
print("Best parameters : ", grid.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ...............................C=0.01, solver=lbfgs; total time=   0.5s
[CV] END ...............................C=0.01, solver=lbfgs; total time=   0.4s
[CV] END ...............................C=0.01, solver=lbfgs; total time=   0.3s
[CV] END ...............................C=0.01, solver=lbfgs; total time=   0.3s
[CV] END ...............................C=0.01, solver=lbfgs; total time=   0.3s
[CV] END ...........................C=0.01, solver=liblinear; total time=   0.0s
[CV] END ...........................C=0.01, solver=liblinear; total time=   0.0s
[CV] END ...........................C=0.01, solver=liblinear; total time=   0.0s
[CV] END ...........................C=0.01, solver=liblinear; total time=   0.0s
[CV] END ...........................C=0.01, solver=liblinear; total time=   0.0s
[CV] END ................................C=0.1, solver=lbfgs; total time=   0.8s
[CV] END ................................C=0.1, s

In [27]:
tuned_model = LogisticRegression(C = 0.01, solver = 'lbfgs', max_iter = 1000)
tuned_model.fit(X_train_smote,y_train_smote)

In [28]:
y_pred = tuned_model.predict(X_test)

In [31]:
accuracy_score(y_test,y_pred)

0.7733333333333333

In [33]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.92      0.77      0.84      2286
        True       0.52      0.78      0.62       714

    accuracy                           0.77      3000
   macro avg       0.72      0.78      0.73      3000
weighted avg       0.82      0.77      0.79      3000



In [35]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[1760  526]
 [ 154  560]]


### Building a Random Forest Classifier Model

In [38]:
from sklearn.ensemble import RandomForestClassifier

In [65]:
param_grid_rf  = {'n_estimators' : [20,50,75], 'max_depth' : [2,3,5], 'min_samples_split' : [10,15,30], 'min_samples_leaf' : [10,15,20] }
grid_search_rf = GridSearchCV(RandomForestClassifier(),param_grid_rf, cv = 5, verbose = 2)
grid_search_rf.fit(X_train_smote,y_train_smote)
print("Best parameters : ", grid_search_rf.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=2, min_samples_leaf=10, min_samples_split=10, n_estimators=20; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=10, min_samples_split=10, n_estimators=20; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=10, min_samples_split=10, n_estimators=20; total time=   0.0s
[CV] END max_depth=2, min_samples_leaf=10, min_samples_split=10, n_estimators=20; total time=   0.0s
[CV] END max_depth=2, min_samples_leaf=10, min_samples_split=10, n_estimators=20; total time=   0.0s
[CV] END max_depth=2, min_samples_leaf=10, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=10, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=10, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END max_depth=2, min_samples_leaf=10, min_samples_split=10, n_estimators=50; total time=   0.1s
[CV] END max_depth=2, min_sam

In [67]:
tuned_rf_model = RandomForestClassifier(n_estimators = 75, max_depth = 5, min_samples_leaf = 10, min_samples_split = 30)
tuned_rf_model.fit(X_train_smote, y_train_smote)

In [69]:
y_pred_rf = tuned_rf_model.predict(X_test)

In [71]:
accuracy_score(y_test,y_pred_rf)

0.955

In [73]:
print(classification_report(y_test,y_pred_rf))

              precision    recall  f1-score   support

       False       0.98      0.97      0.97      2286
        True       0.89      0.92      0.91       714

    accuracy                           0.95      3000
   macro avg       0.93      0.94      0.94      3000
weighted avg       0.96      0.95      0.96      3000



##### The above model seems to have overfitted

In [75]:
tuned_rf_generalised_model = RandomForestClassifier(n_estimators = 50, max_depth = 3, min_samples_leaf = 40, min_samples_split = 60, max_features = 2)
tuned_rf_generalised_model.fit(X_train_smote, y_train_smote)

In [77]:
y_pred_rf_generalised = tuned_rf_generalised_model.predict(X_test)

In [79]:
accuracy_score(y_test,y_pred_rf_generalised)

0.8766666666666667

In [83]:
print(classification_report(y_test,y_pred_rf_generalised))

              precision    recall  f1-score   support

       False       0.91      0.93      0.92      2286
        True       0.76      0.71      0.73       714

    accuracy                           0.88      3000
   macro avg       0.83      0.82      0.83      3000
weighted avg       0.87      0.88      0.88      3000



### Building a Gradient Boosting Classifier Model

In [93]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [97]:
adaboost = AdaBoostClassifier(estimator = DecisionTreeClassifier(), random_state=42)

In [105]:
param_grid = {'n_estimators': [25,50,75],'learning_rate': [0.01, 0.1, 1]}
grid_search_ada = GridSearchCV(
    estimator=adaboost,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2
)

In [109]:
grid_search_ada.fit(X_train_smote, y_train_smote)
print("Best parameters : ", grid_search_ada.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters :  {'learning_rate': 0.01, 'n_estimators': 25}


