In [2]:
import pandas as pd
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

In [3]:
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [8]:
#'Age','Attrition','DistanceFromHome','EnvironmentSatisfaction','Gender','HourlyRate','JobLevel','JobSatisfaction','MaritalStatus',
#'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
#'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager'

features=['Age','DistanceFromHome','EnvironmentSatisfaction','Gender','HourlyRate','JobLevel','JobSatisfaction','MaritalStatus',
'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']

In [23]:
#select features
data=df[features]
data_binary_encoded = pd.get_dummies(data)
X=data_binary_encoded
y=pd.get_dummies(df['Attrition'])

In [24]:
#Splitting and scaling the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [25]:
#Train the model
from sklearn import tree
model = tree.DecisionTreeClassifier()
model = model.fit(X_train_scaled, y_train)
print(f"Training score: {model.score(X_train_scaled,y_train)}")
print(f"Test score: {model.score(X_test_scaled,y_test)}")

Training score: 1.0
Test score: 0.7472826086956522


In [26]:
from sklearn.model_selection import GridSearchCV
param_grid = {'criterion':['gini','entropy'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
grid = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, verbose=3)
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END ....................criterion=gini, max_depth=4; total time=   0.0s
[CV 2/5] END ....................criterion=gini, max_depth=4; total time=   0.0s
[CV 3/5] END ....................criterion=gini, max_depth=4; total time=   0.0s
[CV 4/5] END ....................criterion=gini, max_depth=4; total time=   0.0s
[CV 5/5] END ....................criterion=gini, max_depth=4; total time=   0.0s
[CV 1/5] END ....................criterion=gini, max_depth=5; total time=   0.0s
[CV 2/5] END ....................criterion=gini, max_depth=5; total time=   0.0s
[CV 3/5] END ....................criterion=gini, max_depth=5; total time=   0.0s
[CV 4/5] END ....................criterion=gini, max_depth=5; total time=   0.0s
[CV 5/5] END ....................criterion=gini, max_depth=5; total time=   0.0s
[CV 1/5] END ....................criterion=gini, max_depth=6; total time=   0.0s
[CV 2/5] END ....................criterion=gini

[CV 4/5] END .................criterion=entropy, max_depth=6; total time=   0.0s
[CV 5/5] END .................criterion=entropy, max_depth=6; total time=   0.0s
[CV 1/5] END .................criterion=entropy, max_depth=7; total time=   0.0s
[CV 2/5] END .................criterion=entropy, max_depth=7; total time=   0.0s
[CV 3/5] END .................criterion=entropy, max_depth=7; total time=   0.0s
[CV 4/5] END .................criterion=entropy, max_depth=7; total time=   0.0s
[CV 5/5] END .................criterion=entropy, max_depth=7; total time=   0.0s
[CV 1/5] END .................criterion=entropy, max_depth=8; total time=   0.0s
[CV 2/5] END .................criterion=entropy, max_depth=8; total time=   0.0s
[CV 3/5] END .................criterion=entropy, max_depth=8; total time=   0.0s
[CV 4/5] END .................criterion=entropy, max_depth=8; total time=   0.0s
[CV 5/5] END .................criterion=entropy, max_depth=8; total time=   0.0s
[CV 1/5] END ...............

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30,
                                       40, 50, 70, 90, 120, 150]},
             verbose=3)

In [27]:
print(grid.best_params_)
print(grid.best_score_)

{'criterion': 'entropy', 'max_depth': 4}
0.837531879884821


In [28]:
model2=grid.best_estimator_