In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
#'Age','Attrition','DistanceFromHome','EnvironmentSatisfaction','Gender','HourlyRate','JobLevel','JobSatisfaction','MaritalStatus',
#'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
#'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager'

features=['Age','DistanceFromHome','EnvironmentSatisfaction','Gender','HourlyRate','JobLevel','JobSatisfaction','MaritalStatus',
'MonthlyIncome','NumCompaniesWorked','PercentSalaryHike','PerformanceRating','RelationshipSatisfaction','StockOptionLevel',
'TotalWorkingYears','TrainingTimesLastYear','WorkLifeBalance','YearsAtCompany','YearsInCurrentRole','YearsSinceLastPromotion','YearsWithCurrManager']

In [4]:
#select features
data=df[features]
data_binary_encoded = pd.get_dummies(data)
X=data_binary_encoded
y=pd.get_dummies(df['Attrition'])

In [5]:
#Splitting and scaling the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


In [6]:
#Train the model
#from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model = model.fit(X_train_scaled, y_train)
print(f"Training score: {model.score(X_train_scaled,y_train)}")
print(f"Test score: {model.score(X_test_scaled,y_test)}")
#print(classification_report(y_report, predictions, target_names=["red","blue"]))


Training score: 0.8711433756805808
Test score: 0.8152173913043478


In [7]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors':[3,5,7,9,11,13,15,17,19],
             'weights':['uniform','distance'],
             'metric':['euclidean','manhattan']}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, verbose=3)
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV 2/5] END metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV 3/5] END metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV 4/5] END metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV 5/5] END metric=euclidean, n_neighbors=3, weights=uniform; total time=   0.0s
[CV 1/5] END metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV 2/5] END metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV 3/5] END metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV 4/5] END metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV 5/5] END metric=euclidean, n_neighbors=3, weights=distance; total time=   0.0s
[CV 1/5] END metric=euclidean, n_neighbors=5, weights=uniform; total time=   0.0s
[CV 2/5] END metric=euclidean, 

[CV 1/5] END metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV 2/5] END metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV 3/5] END metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV 4/5] END metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV 5/5] END metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV 1/5] END metric=manhattan, n_neighbors=5, weights=distance; total time=   0.0s
[CV 2/5] END metric=manhattan, n_neighbors=5, weights=distance; total time=   0.0s
[CV 3/5] END metric=manhattan, n_neighbors=5, weights=distance; total time=   0.0s
[CV 4/5] END metric=manhattan, n_neighbors=5, weights=distance; total time=   0.0s
[CV 5/5] END metric=manhattan, n_neighbors=5, weights=distance; total time=   0.0s
[CV 1/5] END metric=manhattan, n_neighbors=7, weights=uniform; total time=   0.0s
[CV 2/5] END metric=manhattan, n_neighbors=7, weights=uniform; total time=   0.0s
[CV 3/5] EN

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19],
                         'weights': ['uniform', 'distance']},
             verbose=3)

In [8]:
print(grid.best_params_)
print(grid.best_score_)

{'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'distance'}
0.8529740847387906


In [9]:
model2=grid.best_estimator_

In [10]:
test=model2.predict(X)

print(classification_report(y,test))


              precision    recall  f1-score   support

           0       0.84      1.00      0.91      1233
           1       0.00      0.00      0.00       237

   micro avg       0.84      0.84      0.84      1470
   macro avg       0.42      0.50      0.46      1470
weighted avg       0.70      0.84      0.77      1470
 samples avg       0.84      0.84      0.84      1470



  _warn_prf(average, modifier, msg_start, len(result))
