### Importing Libraries

In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

#### Reading csv

In [2]:
df=pd.read_csv("atr_emp.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Leaver(Y/N),Age,Annual_Salary,CurrentPayGrade,TimewithCompany(yrs),DistancetoWork(miles),Gender_ Female,Gender_ Male,Gender_Female,...,Dept_Name_IT,Dept_Name_Marketing,Dept_Name_Membership,Dept_Name_Mission Delivery,Dept_Name_Outdoor Education,Dept_Name_Outdoor Program,Dept_Name_Outdoor Property,Dept_Name_Product Programs,Dept_Name_Program Support,Dept_Name_Risk and Facilities
0,0,0,35,21,11,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,31,21,7,1,46,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2,1,63,9547,13,4,42,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,3,1,29,10400,13,2,22,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,1,33,10400,13,8,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df=df.drop('Unnamed: 0',axis=1)

In [4]:
x=df.drop('Leaver(Y/N)',axis=1)
x

Unnamed: 0,Age,Annual_Salary,CurrentPayGrade,TimewithCompany(yrs),DistancetoWork(miles),Gender_ Female,Gender_ Male,Gender_Female,Dept_Name_Badge & Sash,Dept_Name_Business Systems,...,Dept_Name_IT,Dept_Name_Marketing,Dept_Name_Membership,Dept_Name_Mission Delivery,Dept_Name_Outdoor Education,Dept_Name_Outdoor Program,Dept_Name_Outdoor Property,Dept_Name_Product Programs,Dept_Name_Program Support,Dept_Name_Risk and Facilities
0,35,21,11,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,31,21,7,1,46,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,63,9547,13,4,42,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,29,10400,13,2,22,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,33,10400,13,8,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,56,118560,3,7,30,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
360,49,136736,3,4,14,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
361,23,145600,0,14,92,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
362,24,145600,0,9,13,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [5]:
y=df['Leaver(Y/N)']
y

0      0
1      1
2      1
3      1
4      1
      ..
359    0
360    0
361    1
362    1
363    1
Name: Leaver(Y/N), Length: 364, dtype: int64

##### Train Test Split

In [6]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

#### Decision Tree Classifier

In [7]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [8]:
model_dt.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [9]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1])

In [10]:
model_dt.score(x_test,y_test)

0.6575342465753424

In [11]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.53      0.38      0.44        26
           1       0.70      0.81      0.75        47

    accuracy                           0.66        73
   macro avg       0.62      0.60      0.60        73
weighted avg       0.64      0.66      0.64        73



###### As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy alone as our metrics to measure the model.

###### Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low.

###### Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

In [12]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_sample(x,y)

In [13]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)

In [14]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [15]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.875
              precision    recall  f1-score   support

           0       0.88      0.88      0.88        17
           1       0.87      0.87      0.87        15

    accuracy                           0.88        32
   macro avg       0.87      0.87      0.87        32
weighted avg       0.88      0.88      0.88        32



In [16]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[15  2]
 [ 2 13]]


#### Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier

In [18]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [19]:
model_rf.fit(x_train,y_train)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [20]:
y_pred=model_rf.predict(x_test)

In [21]:
model_rf.score(x_test,y_test)

0.6575342465753424

In [22]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.67      0.08      0.14        26
           1       0.66      0.98      0.79        47

    accuracy                           0.66        73
   macro avg       0.66      0.53      0.46        73
weighted avg       0.66      0.66      0.56        73



In [23]:
sm = SMOTEENN()
X_resampled1, y_resampled1 = sm.fit_sample(x,y)

In [24]:
xr_train1,xr_test1,yr_train1,yr_test1=train_test_split(X_resampled1, y_resampled1,test_size=0.2)

In [25]:
model_rf_smote=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [26]:
model_rf_smote.fit(xr_train1,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [27]:
yr_predict1 = model_rf_smote.predict(xr_test1)

In [28]:
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

In [29]:
print(model_score_r1)
print(metrics.classification_report(yr_test1, yr_predict1))

0.8064516129032258
              precision    recall  f1-score   support

           0       0.71      0.83      0.77        12
           1       0.88      0.79      0.83        19

    accuracy                           0.81        31
   macro avg       0.80      0.81      0.80        31
weighted avg       0.82      0.81      0.81        31



In [30]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[10  2]
 [ 4 15]]


#### Performing PCA

In [31]:
# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(0.9)
xr_train_pca = pca.fit_transform(xr_train1)
xr_test_pca = pca.transform(xr_test1)
explained_variance = pca.explained_variance_ratio_

In [32]:
model=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)

In [33]:
model.fit(xr_train_pca,yr_train1)

RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [34]:
yr_predict_pca = model.predict(xr_test_pca)

In [35]:
model_score_r_pca = model.score(xr_test_pca, yr_test1)

In [36]:
print(model_score_r_pca)
print(metrics.classification_report(yr_test1, yr_predict_pca))

0.8709677419354839
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        12
           1       0.89      0.89      0.89        19

    accuracy                           0.87        31
   macro avg       0.86      0.86      0.86        31
weighted avg       0.87      0.87      0.87        31



#### Pickling the model

In [37]:
import pickle

In [38]:
filename = 'model.sav'

In [39]:
pickle.dump(model_rf_smote, open(filename, 'wb'))

In [40]:
load_model = pickle.load(open(filename, 'rb'))

In [41]:
model_score_r1 = load_model.score(xr_test1, yr_test1)

In [42]:
model_score_r1

0.8064516129032258