In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("processed_dataset.csv")

In [5]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_of_projects,average_monthly_hours,years_worked_at_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [7]:
df.columns

Index(['satisfaction_level', 'last_evaluation', 'number_of_projects',
       'average_monthly_hours', 'years_worked_at_company', 'Work_accident',
       'left', 'promotion_last_5years', 'Department', 'salary'],
      dtype='object')

In [9]:
categorical_columns = ['Work_accident', 'left', 'promotion_last_5years', 'Department', 'salary']
df = pd.get_dummies(df, columns = categorical_columns, drop_first = True)
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_of_projects,average_monthly_hours,years_worked_at_company,Work_accident_1,left_1,promotion_last_5years_1,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical,salary_low,salary_medium
0,0.38,0.53,2,157,3,False,True,False,False,False,False,False,False,False,True,False,False,True,False
1,0.8,0.86,5,262,6,False,True,False,False,False,False,False,False,False,True,False,False,False,True
2,0.11,0.88,7,272,4,False,True,False,False,False,False,False,False,False,True,False,False,False,True
3,0.72,0.87,5,223,5,False,True,False,False,False,False,False,False,False,True,False,False,True,False
4,0.37,0.52,2,159,3,False,True,False,False,False,False,False,False,False,True,False,False,True,False


In [11]:
from sklearn.model_selection import train_test_split
X = df.drop('left_1', axis=1)
y = df['left_1']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
from imblearn.over_sampling import SMOTE
from collections import Counter
print("Original Dataset Counter : ", Counter(y_train))
smote = SMOTE(random_state = 42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print("Random Oversampling shape:", Counter(y_train_over))

Original Dataset Counter :  Counter({False: 9134, True: 2865})
Random Oversampling shape: Counter({False: 9134, True: 9134})


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [35]:
from sklearn.model_selection import GridSearchCV

param_grid = {'C' : [0.01,0.1,1,10], 'solver' : ['lbfgs', 'liblinear']}
grid = GridSearchCV(LogisticRegression(max_iter = 1000), param_grid, cv = 5, verbose = 2)
grid.fit(X_train_smote, y_train_smote)
print("Best parameters : ", grid.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] END ...............................C=0.01, solver=lbfgs; total time=   1.0s
[CV] END ...............................C=0.01, solver=lbfgs; total time=   1.3s
[CV] END ...............................C=0.01, solver=lbfgs; total time=   0.9s
[CV] END ...............................C=0.01, solver=lbfgs; total time=   1.0s
[CV] END ...............................C=0.01, solver=lbfgs; total time=   0.8s
[CV] END ...........................C=0.01, solver=liblinear; total time=   0.0s
[CV] END ...........................C=0.01, solver=liblinear; total time=   0.0s
[CV] END ...........................C=0.01, solver=liblinear; total time=   0.0s
[CV] END ...........................C=0.01, solver=liblinear; total time=   0.0s
[CV] END ...........................C=0.01, solver=liblinear; total time=   0.0s
[CV] END ................................C=0.1, solver=lbfgs; total time=   1.8s
[CV] END ................................C=0.1, s

In [39]:
tuned_model = LogisticRegression(C = 0.01, solver = 'lbfgs', max_iter = 1000)
tuned_model.fit(X_train_smote,y_train_smote)

In [41]:
y_pred = tuned_model.predict(X_test)

In [43]:
accuracy_score(y_test,y_pred)

0.768

In [45]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

       False       0.92      0.76      0.83      2294
        True       0.50      0.79      0.62       706

    accuracy                           0.77      3000
   macro avg       0.71      0.78      0.72      3000
weighted avg       0.82      0.77      0.78      3000



In [47]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[1747  547]
 [ 149  557]]
