## Project Delivery

In [1]:
# print_function for compatibility with Python 3
from __future__ import print_function
print('Print is ready to serve')

# NumPy for numerical computing
import numpy as np

# Pandas for DataFrames
import pandas as pd

# Pickle for reading model files
import pickle

# Scikit-Learn for Modeling
import sklearn
from sklearn.model_selection import train_test_split

Print is ready to serve


In [2]:
# Area under ROC curve
from sklearn.metrics import roc_auc_score

In [3]:
# Load final_model.pkl as model
# NOTE: pickle.load can execute arbitrary code while deserializing, so this
# should only ever be pointed at a model file from a trusted source.
with open('final_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [4]:
# Display model object (a bare name as the cell's last expression shows its repr)
model

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.33, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=123,
            verbose=0, warm_start=False))])

In [18]:
# Load the analytical base table from CSV into a DataFrame.
# NOTE(review): the 'Unnamed: 0' column that shows up in df.columns looks like a
# saved row index; presumably read_csv(..., index_col=0) would avoid it — confirm.
df = pd.read_csv('analytical_base_table.csv')

In [19]:
# List the column labels of the freshly loaded table
df.columns

Index([u'Unnamed: 0', u'avg_monthly_hrs', u'filed_complaint',
       u'last_evaluation', u'n_projects', u'recently_promoted',
       u'satisfaction', u'status', u'tenure', u'last_evaluation_missing',
       u'underperformer', u'unhappy', u'overachiever', u'department_IT',
       u'department_Missing', u'department_admin', u'department_engineering',
       u'department_finance', u'department_management',
       u'department_marketing', u'department_procurement',
       u'department_product', u'department_sales', u'department_support',
       u'salary_high', u'salary_low', u'salary_medium'],
      dtype='object')

In [21]:
# Remember the label of the first column so it can be inspected and dropped
col_0 = df.columns[0]

In [22]:
# Show the label stored in col_0
col_0

'Unnamed: 0'

In [23]:
# Drop the unnamed column.
# Guarded so the cell is safe to re-run: if col_0 has been recomputed after the
# drop it now names a real feature and must be kept, and if the column is
# already gone errors='ignore' turns the drop into a no-op instead of raising.
if str(col_0).startswith('Unnamed'):
    df = df.drop(col_0, axis=1, errors='ignore')

In [24]:
# List the column labels again, after the drop
df.columns

Index([u'avg_monthly_hrs', u'filed_complaint', u'last_evaluation',
       u'n_projects', u'recently_promoted', u'satisfaction', u'status',
       u'tenure', u'last_evaluation_missing', u'underperformer', u'unhappy',
       u'overachiever', u'department_IT', u'department_Missing',
       u'department_admin', u'department_engineering', u'department_finance',
       u'department_management', u'department_marketing',
       u'department_procurement', u'department_product', u'department_sales',
       u'department_support', u'salary_high', u'salary_low', u'salary_medium'],
      dtype='object')

In [25]:
# Number of columns currently in df (second entry of its shape)
print(df.shape[1])

26


In [12]:
# Input features: every column except the `status` label
X = df.drop('status', axis=1)

# Target variable: the `status` column on its own
y = df['status']

# Hold out 20% of the rows as a test set, stratifying on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1234, stratify=y)


In [13]:
# Predict class probabilities for X_test
pred_proba = model.predict_proba(X_test)

# Get just the prediction for the positive class (1): take column 1 of the
# probability array directly instead of looping over rows, and keep it under
# its own name so `pred` is never bound to two different kinds of object
pred = pred_proba[:, 1]

# Print AUROC
print('AUROC: ', roc_auc_score(y_test, pred))

AUROC:  0.991520189216


## Construct Model Class

In [14]:
class EmployeeRetentionModel(object):
    """Bundle a pickled classifier with the cleaning and feature steps it needs.

    The instance loads a fitted model from disk once and exposes
    ``predict_proba``, which can optionally clean raw records and engineer
    features before handing them to the model.
    """

    def __init__(self, model_location):
        """Load the pickled model found at ``model_location`` into ``self.model``.

        NOTE: ``pickle.load`` can execute arbitrary code while deserializing;
        only point this at model files from a trusted source.
        """
        with open(model_location, 'rb') as f:
            self.model = pickle.load(f)

    def predict_proba(self, X_new, clean=True, augment=True):
        """Return the (possibly transformed) features and the model's probabilities.

        Parameters
        ----------
        X_new : pandas.DataFrame
            Observations to score.
        clean : bool, default True
            Run ``clean_data`` on ``X_new`` first.
        augment : bool, default True
            Run ``engineer_features`` on the (cleaned) data first.

        Returns
        -------
        tuple
            ``(X_new, probabilities)``: the frame actually passed to the model
            and whatever ``self.model.predict_proba`` returned for it.
        """
        if clean:
            X_new = self.clean_data(X_new)

        if augment:
            X_new = self.engineer_features(X_new)

        # NOTE(review): the dummy columns depend on the categories present in
        # X_new; presumably they must match the columns the model was fitted
        # on — confirm before scoring small or unusual batches.
        return X_new, self.model.predict_proba(X_new)

    def clean_data(self, df):
        """Return a cleaned copy of ``df``; the caller's frame is left untouched.

        Steps: drop duplicate rows, drop rows whose department is ``'temp'``,
        fill missing ``filed_complaint`` / ``recently_promoted`` with 0,
        rename ``'information_technology'`` to ``'IT'``, label missing
        departments ``'Missing'``, flag missing ``last_evaluation`` in
        ``last_evaluation_missing`` and then fill it with 0.
        """
        # Drop duplicates
        df = df.drop_duplicates()

        # Drop temporary workers; copy so the assignments below write to an
        # independent frame rather than to a slice of the input
        df = df[df.department != 'temp'].copy()

        # Missing filed_complaint values should be 0
        df['filed_complaint'] = df['filed_complaint'].fillna(0)

        # Missing recently_promoted values should be 0
        df['recently_promoted'] = df['recently_promoted'].fillna(0)

        # 'information_technology' should be 'IT'. Assign the result back:
        # an inplace call on the `df.department` accessor is not guaranteed
        # to reach the frame itself.
        df['department'] = df['department'].replace('information_technology', 'IT')

        # Fill missing values in department with 'Missing'
        df['department'] = df['department'].fillna('Missing')

        # Indicator variable for missing last_evaluation; this must be computed
        # before the fill below erases the information
        df['last_evaluation_missing'] = df['last_evaluation'].isnull().astype(int)

        # Fill missing values in last_evaluation with 0
        df['last_evaluation'] = df['last_evaluation'].fillna(0)

        # Return cleaned dataframe
        return df

    def engineer_features(self, df):
        """Return a new frame with indicator features and dummy-encoded categoricals.

        Adds ``underperformer``, ``unhappy`` and ``overachiever`` 0/1 columns
        and replaces ``department`` and ``salary`` with dummy columns.
        Expects ``last_evaluation_missing`` to be present already (it is
        created by ``clean_data``).
        """
        # Work on a copy so the caller's DataFrame does not gain new columns
        df = df.copy()

        # Create indicator features; a filled-in (missing) evaluation of 0
        # must not count as underperformance
        df['underperformer'] = ((df.last_evaluation < 0.6) &
                                (df.last_evaluation_missing == 0)).astype(int)

        df['unhappy'] = (df.satisfaction < 0.2).astype(int)

        df['overachiever'] = ((df.last_evaluation > 0.8) &
                              (df.satisfaction > 0.7)).astype(int)

        # Create new dataframe with dummy features
        df = pd.get_dummies(df, columns=['department', 'salary'])

        # Return augmented DataFrame
        return df

In [15]:
# Build the wrapper around the pickled model file
retention_model = EmployeeRetentionModel(model_location='final_model.pkl')

In [26]:
# Usage on raw records (left disabled here): run both cleaning and feature engineering
#_, pred1 = retention_model.predict_proba(raw_data, clean=True, augment=True)

# Usage on already-cleaned records (left disabled here): feature engineering only
#_, pred2 = retention_model.predict_proba(cleaned_data, clean=False, augment=True)

# X is used as the already cleaned and augmented input, so both
# preprocessing steps are switched off; the returned frame is discarded
augmented_data = X
_, pred3 = retention_model.predict_proba(augmented_data, augment=False, clean=False)

In [27]:
# Preview the first five entries of the probabilities returned by the wrapper
pred3[:5]

array([[ 0.04,  0.96],
       [ 1.  ,  0.  ],
       [ 1.  ,  0.  ],
       [ 0.98,  0.02],
       [ 1.  ,  0.  ]])