In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv(r'dataset_2.csv')
df.head()

Unnamed: 0,Attrition,Female,OverTime,a.Travel_Frequently,a.Travel_Rarely,b.HR,b.Sales,c.Married,c.Single,d.Human_Resources,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1,1,1,0,1,0,1,0,1,0,...,3,1,0,8,0,1,6,4,0,5
1,0,0,0,1,0,0,0,1,0,0,...,4,4,1,10,3,3,10,7,1,7
2,1,0,1,0,1,0,0,0,1,0,...,3,2,0,7,3,3,0,0,0,0
3,0,1,1,1,0,0,0,1,0,0,...,3,3,0,8,3,3,8,7,3,0
4,0,0,0,0,1,0,0,1,0,0,...,3,4,1,6,3,3,2,2,2,2


In [3]:
y = df['Attrition']
X = df.drop(['Attrition'], axis = 1)
cols = X.columns

In [4]:
X = preprocessing.scale(X)
X = pd.DataFrame(X)
X.columns = cols

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_train.shape

(1176, 44)

# SMOTE
As the dataset is imbalanced, we apply SMOTE to oversample the data.

In [6]:
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)
X_train.shape

(1976, 44)

# Logistic Regression

In [7]:
logreg = LogisticRegression(random_state = 0)
logreg.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [8]:
y_pred_lr = logreg.predict(X_test)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred_lr))

Confusion Matrix
[[187  58]
 [ 13  36]]


In [9]:
print('Accuracy:', logreg.score(X_test, y_test))
print('Precision:', precision_score(y_test, y_pred_lr))
print('Recall:', recall_score(y_test, y_pred_lr))
print('F1 score:', f1_score(y_test, y_pred_lr))

Accuracy: 0.7585034013605442
Precision: 0.3829787234042553
Recall: 0.7346938775510204
F1 score: 0.5034965034965034


### Logit

In [10]:
logr_coef = pd.DataFrame({'feature': cols, 'logit': logreg.coef_[0]})
logr_coef.sort_values(by = ['logit'], ascending = False)

Unnamed: 0,feature,logit
15,e.Lab_Technician,1.22398
19,e.Sales_Executive,1.17634
20,e.Sales_Rep,1.010745
14,e.Human_Resources,1.003749
2,a.Travel_Frequently,0.90717
18,e.Research_Scientist,0.879188
1,OverTime,0.826934
3,a.Travel_Rarely,0.711154
7,c.Single,0.702324
40,YearsAtCompany,0.650868


# Random Forest Classifier

In [11]:
rfc = RandomForestClassifier(random_state = 0)
rfc.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [12]:
y_pred_rfc = rfc.predict(X_test)
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred_rfc))

Confusion Matrix
[[237   8]
 [ 34  15]]


In [13]:
print('Accuracy:', rfc.score(X_test, y_test))
print('Precision:', precision_score(y_test, y_pred_rfc))
print('Recall:', recall_score(y_test, y_pred_rfc))
print('F1 score:', f1_score(y_test, y_pred_rfc))

Accuracy: 0.8571428571428571
Precision: 0.6521739130434783
Recall: 0.30612244897959184
F1 score: 0.4166666666666667


### Feature Importance

In [14]:
rfc_fi = pd.DataFrame({'feature': cols, 'importance': rfc.feature_importances_})
rfc_fi.sort_values(by = ['importance'], ascending = False)

Unnamed: 0,feature,importance
1,OverTime,0.094771
36,StockOptionLevel,0.047303
43,YearsWithCurrManager,0.045905
28,JobLevel,0.040346
25,EnvironmentSatisfaction,0.039993
21,Age,0.039978
29,JobSatisfaction,0.039846
7,c.Single,0.03963
37,TotalWorkingYears,0.037974
40,YearsAtCompany,0.037142
