In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import sklearn 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier

# 1. Gather Data

In [2]:
# Load the employee dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="Employee.csv")

# Preview five randomly drawn rows (no random_state is given to sample()).
data.sample(n=5)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
3904,Masters,2012,New Delhi,3,34,Female,No,3,0
4010,Bachelors,2018,Bangalore,3,32,Female,No,2,1
500,Bachelors,2015,New Delhi,3,25,Male,No,3,1
2977,Masters,2013,Pune,1,30,Male,No,1,0
1256,Masters,2017,Bangalore,2,24,Male,No,2,1


In [3]:
# Summary statistics for every column, categorical as well as numeric.
data.describe(include="all")

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
count,4653,4653.0,4653,4653.0,4653.0,4653,4653,4653.0,4653.0
unique,3,,3,,,2,2,,
top,Bachelors,,Bangalore,,,Male,No,,
freq,3601,,2228,,,2778,4175,,
mean,,2015.06297,,2.698259,29.393295,,,2.905652,0.343864
std,,1.863377,,0.561435,4.826087,,,1.55824,0.475047
min,,2012.0,,1.0,22.0,,,0.0,0.0
25%,,2013.0,,3.0,26.0,,,2.0,0.0
50%,,2015.0,,3.0,28.0,,,3.0,0.0
75%,,2017.0,,3.0,32.0,,,4.0,1.0


In [5]:
# Report how many missing values each column contains.
print('Features with null values:\n', data.isna().sum())

Features with null values:
 Education                    0
JoiningYear                  0
City                         0
PaymentTier                  0
Age                          0
Gender                       0
EverBenched                  0
ExperienceInCurrentDomain    0
LeaveOrNot                   0
dtype: int64


In [7]:
# Jiayi Zhang Logistic Regression
#
# Fits an unpenalised logistic regression on a 60/40 split (the primary model,
# `dflr`) and, for comparison, a second independent model on a 70/30 split
# (`dflr3`). Confusion matrices are then drawn for the primary model only.
#
# NOTE(review): X and y are not defined in any visible cell (the execution counts
# skip In [4] / In [6]); presumably they are the encoded feature matrix and the
# LeaveOrNot target -- confirm they are created above so Restart & Run All works.
# NOTE(review): penalty='none' was deprecated in scikit-learn 1.2 and removed in
# 1.4; on those versions it must be spelled penalty=None.

# --- Primary model: test size 0.4 -------------------------------------------
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.4, random_state=80)
dflr = LogisticRegression(penalty='none', max_iter=10000)
dflr = dflr.fit(Xtrain, ytrain)

print(f"Intercept {dflr.intercept_} \nCoefficients: {dflr.coef_}")

ypr = dflr.predict(Xtest)

# Calculate the accuracy (score), True Positives (TP), True Negatives (TN),
# False Negatives (FN) and False Positives (FP). classes_[1] is treated as the
# positive class; confusion_matrix takes (y_true, y_pred) and, for two classes,
# ravel() yields tn, fp, fn, tp. Assumes a binary target -- TODO confirm.
tn, fp, fn, tp = confusion_matrix(ytest, ypr, labels=dflr.classes_).ravel()
acc = (tp + tn) / (tp + tn + fp + fn)
print(" tp:", tp, "\n","tn:",tn, "\n", "fp:", fp, "\n", "fn:", fn, "\n", "Accuracy(Score):", acc )

# --- Comparison model: test size 0.3 ----------------------------------------
# A *new* estimator is created here. fit() returns the estimator itself, so
# re-fitting `dflr` would silently overwrite the 0.4-split model and make the
# confusion matrices below evaluate a model on rows it was trained on.
Xtrain3, Xtest3, ytrain3, ytest3 = train_test_split(X, y, test_size=0.3, random_state=80)
dflr3 = LogisticRegression(penalty='none', max_iter=10000)
dflr3 = dflr3.fit(Xtrain3, ytrain3)
ypr3 = dflr3.predict(Xtest3)

tn3, fp3, fn3, tp3 = confusion_matrix(ytest3, ypr3, labels=dflr3.classes_).ravel()
acc3 = (tp3 + tn3) / (tp3 + tn3 + fp3 + fn3)
print(" tp:", tp3, "\n","tn:",tn3, "\n", "fp:", fp3, "\n", "fn:", fn3, "\n", "Accuracy(Score):", acc3 )

# --- Confusion matrices for the primary (0.4-split) model --------------------
# Argument order is (y_true, y_pred) so rows are true labels and columns are
# predictions, matching the axis labels drawn by ConfusionMatrixDisplay.

# Plot confusion matrix for all data (includes the rows the model was trained on)
yall = dflr.predict(X)
conf = confusion_matrix(y, yall, labels=dflr.classes_)
disp = ConfusionMatrixDisplay(conf, display_labels=dflr.classes_).plot()
disp.ax_.set_title("Logistic regression - all data")
plt.show()

# Plot confusion matrix for test data
yte = dflr.predict(Xtest)
conf = confusion_matrix(ytest, yte, labels=dflr.classes_)
disp = ConfusionMatrixDisplay(conf, display_labels=dflr.classes_).plot()
disp.ax_.set_title("Logistic regression - test data (test_size=0.4)")
plt.show()

# Plot confusion matrix for train data
ytr = dflr.predict(Xtrain)
conf = confusion_matrix(ytrain, ytr, labels=dflr.classes_)
disp = ConfusionMatrixDisplay(conf, display_labels=dflr.classes_).plot()
disp.ax_.set_title("Logistic regression - train data (test_size=0.4)")
plt.show()

In [None]:
# Jiayi Zhang KNN
#
# Fits a 14-nearest-neighbours classifier on a 60/40 split (the primary model,
# `knn`) and, for comparison, a second independent model on a 70/30 split
# (`knn3`). Confusion matrices are then drawn for the primary model only.
#
# NOTE(review): X and y are not defined in any visible cell; presumably the
# encoded feature matrix and the LeaveOrNot target -- confirm they exist above.

# --- Primary model: test size 0.4 -------------------------------------------
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.4, random_state=80)
knn = KNeighborsClassifier(n_neighbors=14)
knn = knn.fit(Xtrain, ytrain)

# Predict with the KNN model itself (the earlier version called the logistic
# regression estimator here, so the reported metrics were not KNN's).
ypr = knn.predict(Xtest)

# Calculate the accuracy (score), True Positives (TP), True Negatives (TN),
# False Negatives (FN) and False Positives (FP). classes_[1] is treated as the
# positive class; confusion_matrix takes (y_true, y_pred) and, for two classes,
# ravel() yields tn, fp, fn, tp. Assumes a binary target -- TODO confirm.
tn, fp, fn, tp = confusion_matrix(ytest, ypr, labels=knn.classes_).ravel()
acc = (tp + tn) / (tp + tn + fp + fn)
print(" tp:", tp, "\n","tn:",tn, "\n", "fp:", fp, "\n", "fn:", fn, "\n", "Accuracy(Score):", acc )

# --- Comparison model: test size 0.3 ----------------------------------------
# A separate estimator keeps `knn` fitted on the 0.4 split; fit() returns the
# estimator itself, so re-fitting `knn` would make the confusion matrices below
# score a model on rows it had been trained on.
Xtrain3, Xtest3, ytrain3, ytest3 = train_test_split(X, y, test_size=0.3, random_state=80)
knn3 = KNeighborsClassifier(n_neighbors=14)
knn3 = knn3.fit(Xtrain3, ytrain3)
ypr3 = knn3.predict(Xtest3)

tn3, fp3, fn3, tp3 = confusion_matrix(ytest3, ypr3, labels=knn3.classes_).ravel()
acc3 = (tp3 + tn3) / (tp3 + tn3 + fp3 + fn3)
print(" tp:", tp3, "\n","tn:",tn3, "\n", "fp:", fp3, "\n", "fn:", fn3, "\n", "Accuracy(Score):", acc3 )

# --- Confusion matrices for the primary (0.4-split) model --------------------
# Argument order is (y_true, y_pred) so rows are true labels and columns are
# predictions, matching the axis labels drawn by ConfusionMatrixDisplay.

# Plot confusion matrix for all data (includes the rows the model was trained on)
yall = knn.predict(X)
conf = confusion_matrix(y, yall, labels=knn.classes_)
disp = ConfusionMatrixDisplay(conf, display_labels=knn.classes_).plot()
disp.ax_.set_title("KNN (k=14) - all data")
plt.show()

# Plot confusion matrix for test data
yte = knn.predict(Xtest)
conf = confusion_matrix(ytest, yte, labels=knn.classes_)
disp = ConfusionMatrixDisplay(conf, display_labels=knn.classes_).plot()
disp.ax_.set_title("KNN (k=14) - test data (test_size=0.4)")
plt.show()

# Plot confusion matrix for train data
ytr = knn.predict(Xtrain)
conf = confusion_matrix(ytrain, ytr, labels=knn.classes_)
disp = ConfusionMatrixDisplay(conf, display_labels=knn.classes_).plot()
disp.ax_.set_title("KNN (k=14) - train data (test_size=0.4)")
plt.show()