<a href="https://colab.research.google.com/github/aserravalle/machine-learning-startup/blob/master/KNN_Classification_Employee_Performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup
Predict employee satisfaction and performance using KNN classification

In [0]:
# Install the Kaggle API credentials. The Kaggle API client expects
# kaggle.json to be in ~/.kaggle, so create that directory and copy it there.
# NOTE(review): assumes kaggle.json is already present in the working
# directory before this cell runs — confirm it is uploaded first.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# Restrict the credentials file to owner read/write.
# This permissions change avoids a warning on Kaggle tool startup.
!chmod 600 ~/.kaggle/kaggle.json
# Download the IBM HR Analytics attrition dataset archive from Kaggle.
!kaggle datasets download -d pavansubhasht/ibm-hr-analytics-attrition-dataset

ibm-hr-analytics-attrition-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [0]:
import pandas as pd
import numpy as np

# Data cleaning and preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Modelling
from sklearn.neighbors import KNeighborsRegressor

# Options
# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
# NOTE(review): this silences every warning, including library deprecation
# notices that would otherwise flag APIs about to be removed.
warnings.filterwarnings("ignore")

# Target variables: PerformanceRating, JobSatisfaction

# Pre-pre-processing

We need to get the data ready for production use: remove the features that can only be known after the employee is hired, e.g. years under the current manager.

In [0]:
# Load the raw Kaggle export (the CSV is read straight from the downloaded zip)
df = pd.read_csv('/content/ibm-hr-analytics-attrition-dataset.zip')

# Drop the columns that are excluded from the training set
columns_to_drop = [
    'Attrition', 'DailyRate', 'HourlyRate', 'MonthlyRate',
    'EmployeeCount', 'EmployeeNumber', 'JobInvolvement', 'JobLevel',
    'Over18', 'RelationshipSatisfaction', 'TrainingTimesLastYear',
]
df = df.drop(columns=columns_to_drop)


def MapEducation(x):
    """Translate a numeric education code (1-5) into its text label.

    Raises KeyError when the code is not one of the five known values.
    """
    labels = {
        1: 'Below College',
        2: 'College',
        3: 'Bachelor',
        4: 'Master',
        5: 'Doctor',
    }
    return labels[x]


# Turn education into a categorical (text) variable
df['Education'] = df['Education'].map(MapEducation)

# Persist the cleaned frame, then report its shape and preview the first rows
df.to_csv('/content/train.csv', index=False)
print(df.shape)
df.head()

(1470, 24)


Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,StandardHours,StockOptionLevel,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Travel_Rarely,Sales,1,College,Life Sciences,2,Female,Sales Executive,4,Single,5993,8,Yes,11,3,80,0,8,1,6,4,0,5
1,49,Travel_Frequently,Research & Development,8,Below College,Life Sciences,3,Male,Research Scientist,2,Married,5130,1,No,23,4,80,1,10,3,10,7,1,7
2,37,Travel_Rarely,Research & Development,2,College,Other,4,Male,Laboratory Technician,3,Single,2090,6,Yes,15,3,80,0,7,3,0,0,0,0
3,33,Travel_Frequently,Research & Development,3,Master,Life Sciences,4,Female,Research Scientist,3,Married,2909,1,Yes,11,3,80,0,8,3,8,7,3,0
4,27,Travel_Rarely,Research & Development,2,Below College,Medical,1,Male,Laboratory Technician,2,Married,3468,9,No,12,3,80,1,6,3,2,2,2,2


# Preprocessing

The one-hot encoding (OHE) and scaling steps must be turned into a pipeline object that can be saved and re-applied to the data they give you.

In [0]:
# Reload the cleaned training data written to disk by the earlier cell
df = pd.read_csv('/content/train.csv')

# Determine types of features: object-dtype columns are treated as categorical,
# every other column as numeric. The two target columns (and an 'EmployeeID'
# column, should one be present) are never used as predictors.
non_predictors = ['PerformanceRating', 'JobSatisfaction', 'EmployeeID']
cat_features = [f for f in df.columns if df[f].dtype == object]
num_features = [f for f in df.columns if df[f].dtype != object and f not in non_predictors]

# OHE categorical
df_cat = df.loc[:, cat_features]
enc = OneHotEncoder(handle_unknown='ignore').fit(df_cat)
# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() when it exists and fall back on older releases.
if hasattr(enc, 'get_feature_names_out'):
    ohe_columns = enc.get_feature_names_out(df_cat.columns)
else:
    ohe_columns = enc.get_feature_names(df_cat.columns)
df_cat = pd.DataFrame(enc.transform(df_cat).toarray(), columns=ohe_columns)

# Standardise: the scaler is fitted on the numeric columns followed by the
# one-hot columns, so anything transformed with it later must present the
# columns in this same order.
df_num = df.loc[:, num_features]
train = pd.concat([df_num, df_cat], axis=1)
print('train shape = ', train.shape)
scaler = StandardScaler().fit(train)
train = pd.DataFrame(scaler.transform(train), columns=train.columns)

# Show
train.head()

train shape =  (1470, 47)


Unnamed: 0,Age,DistanceFromHome,EnvironmentSatisfaction,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,Education_Bachelor,Education_Below College,Education_College,Education_Doctor,Education_Master,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,EducationField_Other,EducationField_Technical Degree,Gender_Female,Gender_Male,JobRole_Healthcare Representative,JobRole_Human Resources,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,0.44635,-1.010909,-0.660531,-0.10835,2.125136,-1.150554,0.0,-0.932014,-0.421642,-2.49382,-0.164613,-0.063296,-0.679146,0.245834,-0.3371,-0.481859,0.639841,-0.211604,-1.374051,1.515244,-0.798105,-0.36162,2.052502,-0.183726,-0.609318,-0.136788,1.194045,-0.348255,-0.679141,-0.243059,-0.314093,1.224745,-1.224745,-0.312785,-0.191498,-0.462464,-0.273059,-0.330808,-0.239904,-0.497873,1.873287,-0.244625,-0.534873,-0.918921,1.45865,-1.591746,1.591746
1,1.322365,-0.14715,0.254625,-0.291719,-0.678049,2.129306,0.0,0.241988,-0.164511,0.338096,0.488508,0.764998,-0.368715,0.806541,-0.3371,2.075297,-1.562889,-0.211604,0.727775,-0.65996,-0.798105,2.765332,-0.48721,-0.183726,-0.609318,-0.136788,1.194045,-0.348255,-0.679141,-0.243059,-0.314093,-0.816497,0.816497,-0.312785,-0.191498,-0.462464,-0.273059,-0.330808,-0.239904,2.008543,-0.533821,-0.244625,-0.534873,1.088232,-0.685565,0.628241,-0.628241
2,0.008343,-0.887515,1.169781,-0.937654,1.324226,-0.057267,0.0,-0.932014,-0.550208,0.338096,-1.144294,-1.167687,-0.679146,-1.155935,-0.3371,-0.481859,0.639841,-0.211604,0.727775,-0.65996,-0.798105,-0.36162,2.052502,-0.183726,-0.609318,-0.136788,-0.83749,-0.348255,-0.679141,4.114223,-0.314093,-0.816497,0.816497,-0.312785,-0.191498,2.162331,-0.273059,-0.330808,-0.239904,-0.497873,-0.533821,-0.244625,-0.534873,-0.918921,1.45865,-1.591746,1.591746
3,-0.429664,-0.764121,1.169781,-0.763634,-0.678049,-1.150554,0.0,-0.932014,-0.421642,0.338096,0.161947,0.764998,0.252146,-1.155935,-0.3371,2.075297,-1.562889,-0.211604,0.727775,-0.65996,-0.798105,-0.36162,-0.48721,-0.183726,1.641179,-0.136788,1.194045,-0.348255,-0.679141,-0.243059,-0.314093,1.224745,-1.224745,-0.312785,-0.191498,-0.462464,-0.273059,-0.330808,-0.239904,2.008543,-0.533821,-0.244625,-0.534873,1.088232,-0.685565,-1.591746,1.591746
4,-1.086676,-0.887515,-1.575686,-0.644858,2.525591,-0.877232,0.0,0.241988,-0.678774,0.338096,-0.817734,-0.615492,-0.058285,-0.595227,-0.3371,-0.481859,0.639841,-0.211604,0.727775,-0.65996,-0.798105,2.765332,-0.48721,-0.183726,-0.609318,-0.136788,-0.83749,-0.348255,1.472448,-0.243059,-0.314093,-0.816497,0.816497,-0.312785,-0.191498,2.162331,-0.273059,-0.330808,-0.239904,-0.497873,-0.533821,-0.244625,-0.534873,1.088232,-0.685565,0.628241,-0.628241


# Model Building

In [0]:
# Create a copy of the target, ID, and predictor variables
y1 = df[['JobSatisfaction']]
y2 = df[['PerformanceRating']]

# Train two models
KNN_y1 = KNeighborsRegressor(n_neighbors=50, weights='distance').fit(train, y1)
KNN_y2 = KNeighborsRegressor(n_neighbors=50, weights='distance').fit(train, y2)

KNN_y1

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=50, p=2,
                    weights='distance')

# Return Prediction

In [0]:
# Return prediction with Employee ID
predJS = pd.DataFrame(KNN_y1.predict(train), columns = ['JS'])
predPR = pd.DataFrame(KNN_y2.predict(train), columns = ['PR'])
output = pd.concat([predJS, predPR], axis = 1)

output.head()

Unnamed: 0,JS,PR
0,4.0,3.0
1,2.0,4.0
2,3.0,3.0
3,3.0,3.0
4,2.0,3.0


# Save Model

In [0]:
import pickle
import json

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; import the standalone package first and fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Save model, OHE, and scaler
joblib.dump(KNN_y1, 'model_jobsat.pkl')
joblib.dump(KNN_y2, 'model_perform.pkl')
joblib.dump(scaler, 'std_scaler.bin', compress=True)
joblib.dump(enc, 'ohe.bin', compress=True)

# Save example obs without the target variables. drop() returns a new Series,
# so the row taken from df is never modified in place.
x_test = df.loc[1, :].drop(['PerformanceRating', 'JobSatisfaction'])
x_test.to_json('xobs.json')
print(x_test.shape)

(22,)


In [0]:
    # Load, pipeline, and make predictions on Xobs
    # NOTE(review): joblib.load unpickles its input, which can execute arbitrary
    # code — only load artefacts that were produced by this notebook.
    model1 = joblib.load('model_jobsat.pkl')
    model2 = joblib.load('model_perform.pkl')
    sc = joblib.load('std_scaler.bin')
    ohe = joblib.load('ohe.bin')

    with open('xobs.json', 'r') as f:
        data = json.load(f)
    xobs = pd.DataFrame(data, index = [0])

    # Preprocess the example observation: split columns by dtype, the same way
    # the training frame was split (object dtype = categorical).
    cat_features = [f for f in xobs.columns if xobs[f].dtype == object]
    num_features = [f for f in xobs.columns if xobs[f].dtype != object]

    # OHE categorical
    df_cat2 = xobs.loc[:,cat_features]
    # OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2; prefer
    # get_feature_names_out() when it exists and fall back on older releases.
    if hasattr(ohe, 'get_feature_names_out'):
        ohe_columns = ohe.get_feature_names_out(df_cat2.columns)
    else:
        ohe_columns = ohe.get_feature_names(df_cat2.columns)
    df_cat2 = pd.DataFrame(ohe.transform(df_cat2).toarray(), columns = ohe_columns)

    # Standardise numerical columns (numeric first, then one-hot columns, which
    # is the column order the scaler was fitted with)
    df_num2 = xobs.loc[:,num_features]
    test = pd.concat([df_num2, df_cat2], axis = 1)
    # NOTE(review): the label says "train" but this is the shape of the single
    # example row; the text is left unchanged so the recorded output still matches.
    print('train shape = ',test.shape)
    test = pd.DataFrame(sc.transform(test), columns = test.columns)

    # Prediction: the result is indexed [0][0] to pull the single value for the
    # single row
    predJS = str(model1.predict(test)[0][0])
    predPR = str(model2.predict(test)[0][0])

    print(predJS, predPR)

train shape =  (1, 47)
2.0 3.0


In [0]:
# Sanity check: number of categorical features vs. the shape of that slice of
# the example observation, followed by the slice itself
xobs_cat = xobs.loc[:, cat_features]
print(len(cat_features))
print(xobs_cat.shape)

xobs_cat

8
(1, 8)


Unnamed: 0,BusinessTravel,Department,Education,EducationField,Gender,JobRole,MaritalStatus,OverTime
0,Travel_Rarely,Sales,Master,Marketing,Male,Sales Executive,Married,No


# Putting it into Production

1.   ~~Fully trained and saved model~~
2.   ~~Quick api.py app and test~~
1.   Front end landing page from [Flaskex](https://github.com/anfederico/Flaskex) or [Flask School API](https://github.com/mbithenzomo/flask-school-app-and-api)
1.   Firebase to store the customer's data (incl. payment key)
1.   Stripe so they can pay
1.   Paywall the prediction function so only customers that have paid can call it
1.   Upload to a github repo once it is fully functional on local machine
1.   Render to make your app live. Simply push the repository from Github

In [0]:
# Scratch helper: paste of the form-parsing assignments, used to
# list the variable name on the left of each '=' followed by a comma.
a = '''Age = int(request.form['Age'])
BusinessTravel = request.form['BusinessTravel']
Department = request.form['Department']
DistanceFromHome = int(request.form['DistanceFromHome'])
Education = request.form['Education']
EducationField = request.form['EducationField']
EnvironmentSatisfaction = request.form['EnvironmentSatisfaction']
Gender = request.form['Gender']
JobRole = request.form['JobRole']
MaritalStatus = request.form['MaritalStatus']
MonthlyIncome = int(request.form['MonthlyIncome'])
NumCompaniesWorked = int(request.form['NumCompaniesWorked'])
OverTime = request.form['OverTime']
PercentSalaryHike = int(request.form['PercentSalaryHike'])
StandardHours = int(request.form['StandardHours'])
StockOptionLevel_0to3 = int(request.form['StockOptionLevel_0to3'])
TotalWorkingYears = int(request.form['TotalWorkingYears'])
WorkLifeBalance_1to5 = int(request.form['WorkLifeBalance_1to5'])
YearsAtCompany = int(request.form['YearsAtCompany'])
YearsInCurrentRole = int(request.form['YearsInCurrentRole'])
YearsSinceLastPromotion = int(request.form['YearsSinceLastPromotion'])
YearsWithCurrManager = int(request.form['YearsWithCurrManager'])'''

# One assignment per line; keep the text before '=' minus its trailing space
a = a.split('\n')
for assignment in a:
    field_name = assignment.partition('=')[0][:-1]
    print(field_name, ',')

Age ,
BusinessTravel ,
Department ,
DistanceFromHome ,
Education ,
EducationField ,
EnvironmentSatisfaction ,
Gender ,
JobRole ,
MaritalStatus ,
MonthlyIncome ,
NumCompaniesWorked ,
OverTime ,
PercentSalaryHike ,
StandardHours ,
StockOptionLevel_0to3 ,
TotalWorkingYears ,
WorkLifeBalance_1to5 ,
YearsAtCompany ,
YearsInCurrentRole ,
YearsSinceLastPromotion ,
YearsWithCurrManager ,


In [0]:
# Scratch helper: take the "name ," lines and print each name wrapped in
# single quotes with a trailing comma.
a = """Age ,
BusinessTravel ,
Department ,
DistanceFromHome ,
Education ,
EducationField ,
EnvironmentSatisfaction ,
Gender ,
JobRole ,
MaritalStatus ,
MonthlyIncome ,
NumCompaniesWorked ,
OverTime ,
PercentSalaryHike ,
StandardHours ,
StockOptionLevel_0to3 ,
TotalWorkingYears ,
WorkLifeBalance_1to5 ,
YearsAtCompany ,
YearsInCurrentRole ,
YearsSinceLastPromotion ,
YearsWithCurrManager ,"""

# Strip the last two characters (" ,") from every line before quoting it
a = a.split('\n')
for entry in a:
    print("'{}',".format(entry[:-2]))



'Age',
'BusinessTravel',
'Department',
'DistanceFromHome',
'Education',
'EducationField',
'EnvironmentSatisfaction',
'Gender',
'JobRole',
'MaritalStatus',
'MonthlyIncome',
'NumCompaniesWorked',
'OverTime',
'PercentSalaryHike',
'StandardHours',
'StockOptionLevel_0to3',
'TotalWorkingYears',
'WorkLifeBalance_1to5',
'YearsAtCompany',
'YearsInCurrentRole',
'YearsSinceLastPromotion',
'YearsWithCurrManager',


In [0]:
# Scratch: the 22 quoted field names printed by the previous cell, joined onto
# one line; as written, the expression evaluates to a tuple of strings.
# NOTE(review): the recorded output below (a YearsAtCompany Series) does not
# correspond to this expression — it looks stale; re-run or clear it.
'Age','BusinessTravel','Department','DistanceFromHome','Education','EducationField','EnvironmentSatisfaction','Gender','JobRole','MaritalStatus','MonthlyIncome','NumCompaniesWorked','OverTime','PercentSalaryHike','StandardHours','StockOptionLevel_0to3','TotalWorkingYears','WorkLifeBalance_1to5','YearsAtCompany', 'YearsInCurrentRole','YearsSinceLastPromotion', 'YearsWithCurrManager',

0    9
Name: YearsAtCompany, dtype: int64