## Predicting Employee Attrition using Azure ML Services
A classification exercise predicting how likely an employee is to leave the company. Based on the [IBM employee data set](https://www.ibm.com/communities/analytics/watson-analytics-blog/hr-employee-attrition/) 

### Configure Azure ML Workspace

In [65]:
%matplotlib inline

import azureml.core
import pandas as pd
import numpy as np
from azureml.core.workspace import Workspace
import logging
import os

In [16]:
ws = Workspace.from_config()
# choose a name for the run history container in the workspace
experiment_name = 'hr-employee-attrition'
# project folder
project_folder = './hr-employee-attrition'

output = {}
output['SDK version'] = azureml.core.VERSION
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Project Directory'] = project_folder
pd.set_option('display.max_colwidth', -1)
pd.DataFrame(data=output, index=['']).T

Found the config file in: C:\GitRepos\azure-data-services\wip\hr-employee-attrition\config.json


Unnamed: 0,Unnamed: 1
SDK version,1.0.17
Subscription ID,eb489f58-b7f7-4001-a82b-bc18cac70b3b
Workspace,bdl-aml-demo
Resource Group,bdl-cognitive-services
Location,eastus2
Project Directory,./hr-employee-attrition


### Load Data

In [13]:
import azureml.dataprep as dprep
dflow = dprep.auto_read_file(path="WA_Fn-UseC_-HR-Employee-Attrition.csv")
dflow.head(5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41.0,True,Travel_Rarely,1102.0,Sales,1.0,2.0,Life Sciences,1.0,1.0,...,1.0,80.0,0.0,8.0,0.0,1.0,6.0,4.0,0.0,5.0
1,49.0,False,Travel_Frequently,279.0,Research & Development,8.0,1.0,Life Sciences,1.0,2.0,...,4.0,80.0,1.0,10.0,3.0,3.0,10.0,7.0,1.0,7.0
2,37.0,True,Travel_Rarely,1373.0,Research & Development,2.0,2.0,Other,1.0,4.0,...,2.0,80.0,0.0,7.0,3.0,3.0,0.0,0.0,0.0,0.0
3,33.0,False,Travel_Frequently,1392.0,Research & Development,3.0,4.0,Life Sciences,1.0,5.0,...,3.0,80.0,0.0,8.0,3.0,3.0,8.0,7.0,3.0,0.0
4,27.0,False,Travel_Rarely,591.0,Research & Development,2.0,1.0,Medical,1.0,7.0,...,4.0,80.0,1.0,6.0,3.0,3.0,2.0,2.0,2.0,2.0


### Clean & Engineer Data

In [14]:
dflow.get_profile()

Unnamed: 0,Type,Min,Max,Count,Missing Count,Not Missing Count,Percent missing,Error Count,Empty count,0.1% Quantile,1% Quantile,5% Quantile,25% Quantile,50% Quantile,75% Quantile,95% Quantile,99% Quantile,99.9% Quantile,Mean,Standard Deviation,Variance,Skewness,Kurtosis
Age,FieldType.DECIMAL,18,60,1470.0,0.0,1470.0,0.0,0.0,0.0,18.0,26.0,26.0,30.0,35.7458,43.0,54.1169,58.5467,60.0,36.9238,9.13537,83.455,0.412443,-0.410378
Attrition,FieldType.BOOLEAN,False,True,1470.0,0.0,1470.0,0.0,0.0,0.0,,,,,,,,,,,,,,
BusinessTravel,FieldType.STRING,Non-Travel,Travel_Rarely,1470.0,0.0,1470.0,0.0,0.0,0.0,,,,,,,,,,,,,,
DailyRate,FieldType.DECIMAL,102,1499,1470.0,0.0,1470.0,0.0,0.0,0.0,102.97,243.015,239.143,465.283,801.045,1158.78,1424.03,1485.32,1498.03,802.486,403.509,162820.0,-0.00351139,-1.20625
Department,FieldType.STRING,Human Resources,Sales,1470.0,0.0,1470.0,0.0,0.0,0.0,,,,,,,,,,,,,,
DistanceFromHome,FieldType.DECIMAL,1,29,1470.0,0.0,1470.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,7.0,14.0143,26.0,29.0,29.0,9.19252,8.10686,65.7213,0.956164,-0.231918
Education,FieldType.DECIMAL,1,5,1470.0,0.0,1470.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,3.0,4.0,4.0,5.0,5.0,2.91293,1.02416,1.04891,-0.28909,-0.564611
EducationField,FieldType.STRING,Human Resources,Technical Degree,1470.0,0.0,1470.0,0.0,0.0,0.0,,,,,,,,,,,,,,
EmployeeCount,FieldType.DECIMAL,1,1,1470.0,0.0,1470.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,,
EmployeeNumber,FieldType.DECIMAL,1,2068,1470.0,0.0,1470.0,0.0,0.0,0.0,1.97,198.278,193.5,490.067,1021.16,1556.6,1967.67,2047.44,2065.09,1024.87,602.024,362433.0,0.0165402,-1.22552


### Prep and Split Data

In [48]:
## Keep all features except the predicted status ("Attrition")
predictors_df = dflow.drop_columns(["Attrition"]).to_pandas_dataframe()
target_df = dflow.keep_columns(["Attrition"]).to_pandas_dataframe()

In [25]:
## split into test/train sets
from sklearn.model_selection import train_test_split

predictors_df = predictors_df.to_pandas_dataframe()
target_df = target_df.to_pandas_dataframe()

predictors_train, predictors_test, target_train, target_test = train_test_split(predictors_df, target_df, test_size=0.2, random_state=223)

In [31]:
target_train.describe()

Unnamed: 0,Attrition
count,1176
unique,2
top,False
freq,993


### Automated Training / Testing 50 Different Models

In [52]:
from azureml.train.automl import AutoMLConfig

automl_classifier_config = AutoMLConfig(
    task="classification",
    primary_metric='AUC_weighted',
    
    iteration_timeout_minutes = 15,
    iterations = 50,
    X=predictors_train,
    y=target_train.values.flatten(),
    n_cross_validations = 5,
    preprocess=True
)

In [53]:
from azureml.core.experiment import Experiment
experiment=Experiment(ws, experiment_name)
local_run = experiment.submit(automl_classifier_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_4563a6c2-b74c-40a5-be3f-c2926aa07fc3
********************************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
SAMPLING %: Percent of the training data to sample.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
********************************************************************************************************************

 ITERATION   PIPELINE                                       SAMPLING %  DURATION      METRIC      BEST
         0   SparseNormalizer LightGBM                      100.0000    0:00:16       0.7702    0.7702
         1   SparseNormalizer LightGBM                      100.0000    0:00:16       0.8135    0.8135
         2   StandardScalerWrapper LightGBM                 100

### Evaluate Model Results

In [55]:
best_run, fitted_model = local_run.get_output()
print(best_run)
print(fitted_model)

Run(Experiment: hr-employee-attrition,
Id: AutoML_4563a6c2-b74c-40a5-be3f-c2926aa07fc3_49,
Type: None,
Status: Completed)
Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(logger=None, task=None)), ('prefittedsoftvotingclassifier', PreFittedSoftVotingClassifier(classification_labels=None,
               estimators=[('LogisticRegression', Pipeline(memory=None,
     steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('logisticre....06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.2, 0.2, 0.2]))])
Y_transformer(['LabelEncoder', LabelEncoder()])


In [77]:
type(local_run)
type(fitted_model)

automl.client.core.common.model_wrappers.PipelineWithYTransformations

In [87]:
predictions = fitted_model.predict(predictors_df)
prediction_probabilities = fitted_model.predict_proba(predictors_df)

full_results_df = predictors_df
full_results_df['Attrition - Actual'] = target_df
full_results_df['Attrition - Predicted'] = predictions
full_results_df.head(5)

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition - Predicted,Attrition - Actual
0,41.0,Travel_Rarely,1102.0,Sales,1.0,2.0,Life Sciences,1.0,1.0,2.0,...,0.0,8.0,0.0,1.0,6.0,4.0,0.0,5.0,True,True
1,49.0,Travel_Frequently,279.0,Research & Development,8.0,1.0,Life Sciences,1.0,2.0,3.0,...,1.0,10.0,3.0,3.0,10.0,7.0,1.0,7.0,False,False
2,37.0,Travel_Rarely,1373.0,Research & Development,2.0,2.0,Other,1.0,4.0,4.0,...,0.0,7.0,3.0,3.0,0.0,0.0,0.0,0.0,True,True
3,33.0,Travel_Frequently,1392.0,Research & Development,3.0,4.0,Life Sciences,1.0,5.0,4.0,...,0.0,8.0,3.0,3.0,8.0,7.0,3.0,0.0,False,False
4,27.0,Travel_Rarely,591.0,Research & Development,2.0,1.0,Medical,1.0,7.0,1.0,...,1.0,6.0,3.0,3.0,2.0,2.0,2.0,2.0,False,False


In [88]:
full_results_df.to_csv("employee_attrition_with_prediction.csv")

In [72]:
type(predictions)

numpy.ndarray

### Publish as Webservice