# Employee Turnover Predictions

# Problem: 
### The employee turnover 


Employee turnover is one of the most common problems organisations face. According to the Center for American Progress, replacing an employee costs roughly 20% of that worker's annual salary.

In [1]:
#import your libraries
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

__author__ = "Ajay Mishra"
__email__ = "ajaykmishra2000@gmail.com"

## Part 2 - DISCOVER

### ---- Reading the data ----

In [None]:
# Load the HR dataset; the path is relative, so it is resolved against the
# directory the notebook kernel was started in.
df = pd.read_csv('Employee Turnover/data/HR-data.csv')
# (rows, columns) of the loaded frame
df.shape

In [2]:
# NOTE(review): this cell repeats the load in the previous cell, and its recorded
# output is a FileNotFoundError for this relative path. Confirm the working
# directory (or the path) before relying on the cells that follow.
df = pd.read_csv('Employee Turnover/data/HR-data.csv')
df.shape

FileNotFoundError: File b'Employee Turnover/data/HR-data.csv' does not exist

In [None]:
# Examine the data: preview the first rows to see the raw column names and value formats
df.head()

In [None]:
# Row count per value of the raw 'sales' column (renamed to 'department' in the cleaning section)
df['sales'].value_counts()

### ---- Data Cleaning ----

In [None]:
# Check every column for missing values: True means the column holds at least one null
df.isnull().any()

In [None]:
# Column labels as loaded, before the rename below
df.columns

In [None]:
# Map the raw CSV headers onto the names used in the rest of the notebook
column_renames = {
    'satisfaction_level': 'satisfaction',
    'last_evaluation': 'evaluation',
    'number_project': 'projectCount',
    'average_montly_hours': 'AvgMonthlyHours',
    'time_spend_company': 'yearsAtCompany',
    'Work_accident': 'accidentAtWork',
    'left': 'turnover',
    'promotion_last_5years': 'promotion',
    'sales': 'department',
}
df = df.rename(columns=column_renames)
df.head()

In [None]:
# Structural summary of the renamed frame
df.info()

In [None]:
# Summary statistics, used here as a sanity check for implausible values
df.describe()

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Count rows flagged as duplicates of another row.
# NOTE(review): only duplicates are counted here; invalid values (e.g. salaries <= 0)
# and corrupt data are not checked in this cell, and nothing is removed.
df.duplicated().sum()

In [None]:
#df = df.drop_duplicates().reset_index(drop=True)
#df.head()

### ---- EDA ----

In [None]:
#summarize each feature variable
#summarize the target variable
#look for correlation between each feature and the target
#look for correlation between features

In [None]:
# Number of rows for each value of the target column
df['turnover'].value_counts()

In [None]:
# Turnover rate: share of all rows that falls on each value of the target
turnover_rate = df['turnover'].value_counts().div(len(df))
turnover_rate

In [None]:
# Statistical overview of the dataset (the same describe() call as in the cleaning section)
df.describe()

In [None]:
# Group only the numeric columns by the turnover flag. The string columns
# (department, salary) cannot be averaged, and recent pandas raises instead of
# silently dropping them.
# The variable name (including its spelling) is kept because the next cell reuses it.
trunover_Summary = df.select_dtypes(include='number').groupby('turnover')
trunover_Summary.agg('mean')

In [None]:
# Per-group standard deviation, reusing the groupby object built in the previous cell
trunover_Summary.std()

### ---- Correlation Matrix ----

In [None]:
# Correlate the numeric columns only: department and salary are strings, and
# recent pandas raises on them instead of dropping them silently.
corr = df.select_dtypes(include='number').corr()

# `linewidths` is seaborn's documented parameter for the gap between cells.
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap='BuPu',
            linewidths=2)
plt.title('Heatmap employee turnover')

# Show the underlying correlation table below the figure
corr

In [None]:
# Column labels of the correlation matrix (the tick labels passed to the heatmap above)
corr.columns.values

### --- Distribution of features ---

In [None]:
fig = plt.figure(figsize=(15,8))

# (column, bar colour) for each panel, in the order they fill the 2 x 4 grid
hist_panels = [
    ('satisfaction', 'limegreen'),
    ('evaluation', 'royalblue'),
    ('projectCount', 'turquoise'),
    ('AvgMonthlyHours', 'slateblue'),
    ('yearsAtCompany', 'lightcoral'),
    ('accidentAtWork', 'teal'),
    ('turnover', 'plum'),
    ('promotion', 'olive'),
]

for position, (feature, colour) in enumerate(hist_panels, start=1):
    panel = fig.add_subplot(2, 4, position)
    df.hist(column=feature, ax=panel, color=colour)
    # df.hist sets its own title; overwrite it and label the x axis
    panel.set_xlabel(feature)
    panel.set_title(feature + ' distribution')

plt.tight_layout()
plt.show()

In [None]:
# Scatter of satisfaction against evaluation, coloured by turnover, with no regression line
sns.lmplot(data=df, x='satisfaction', y='evaluation', hue='turnover', fit_reg=False)
plt.title('satisfaction Vs evaluation')

In [None]:
# Scatter of evaluation against average monthly hours, coloured by turnover, with no regression line
sns.lmplot(data=df, x='evaluation', y='AvgMonthlyHours', hue='turnover', fit_reg=False)
plt.title('evaluation Vs AvgMonthlyHours')

In [None]:
# Scatter of satisfaction against average monthly hours, coloured by turnover, with no regression line
sns.lmplot(data=df, x='satisfaction', y='AvgMonthlyHours', hue='turnover', fit_reg=False)
plt.title('satisfaction Vs AvgMonthlyHours')

### --- Clustering employee turnover ---

In [None]:
from sklearn.cluster import KMeans

# Cluster only the employees who left, in the satisfaction / evaluation plane.
leavers = df.loc[df['turnover'] == 1, ['satisfaction', 'evaluation']]

# n_init is pinned because scikit-learn changed its default across releases;
# fixing it together with random_state keeps the clusters reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=2)
kmeans.fit(leavers)

# One colour per cluster label: 0 -> green, 2 -> blue, anything else -> red
cluster_palette = {0: 'green', 2: 'blue'}
kmeans_color = [cluster_palette.get(int(c), 'red') for c in kmeans.labels_]

fig = plt.figure(figsize=(11,6))
plt.scatter(x='satisfaction', y='evaluation', data=leavers,
            alpha=0.25, color=kmeans_color)
plt.xlabel('Satisfaction')
plt.ylabel('Evaluation')
# Mark the fitted cluster centres on top of the points
plt.scatter(x=kmeans.cluster_centers_[:,0], y=kmeans.cluster_centers_[:,1],
            marker='X', color='black', s=100)
plt.title('Cluster of employee turnover')
plt.show()

In [None]:
# Number of cluster labels, one per row the KMeans model was fitted on
len(kmeans.labels_)

In [None]:
fig, ax = plt.subplots(figsize=(15,4))

# `fill=` replaces the deprecated `shade=` argument (needs seaborn >= 0.11).
sns.kdeplot(df.loc[df['turnover'] == 0, 'satisfaction'],
            color='b', fill=True, label='no turnover', ax=ax)
sns.kdeplot(df.loc[df['turnover'] == 1, 'satisfaction'],
            color='r', fill=True, label='turnover', ax=ax)

# Current seaborn does not draw a legend for `label=` on its own, so request it
ax.legend()
plt.title('Employee Satisfaction Distribution: Turnover Vs No Turnover')

### --- Project Count ---

In [None]:
fig, ax = plt.subplots(figsize=(10,4))

# Share of the whole workforce in each (projectCount, turnover) combination, in percent.
# It is computed explicitly rather than through a barplot estimator, so the bars
# carry no bootstrap error bars and the plotted quantity is visible in the code.
project_share = (
    df.groupby(['projectCount', 'turnover'])
      .size()
      .mul(100 / len(df))
      .rename('percent')
      .reset_index()
)

sns.barplot(x='projectCount', y='percent', hue='turnover', data=project_share, ax=ax)
# The bars are percentages of all employees, not raw counts, so label them as such
ax.set(xlabel='# of Projects', ylabel='Percent of employees',
       title='Project Count Distribution: Turnover Vs No Turnover')
sns.despine()


### ---- Turnover by Department   ----

In [None]:
# Number of employees who left (turnover == 1), per department
df[df['turnover']==1]['department'].value_counts()

In [None]:
# Head-count per department: everyone, and only those who left.
# The counts stay as named Series indexed by department, so the result does not
# depend on the column names `value_counts().reset_index()` produces (they
# differ between pandas versions, which broke the earlier merge on 'index').
emp_total = df['department'].value_counts().rename('EmployeeTotal')
emp_left = df.loc[df['turnover'] == 1, 'department'].value_counts().rename('EmployeeLeft')

# Inner join on the department index, keeping emp_total's row order
employee_df = (
    emp_total.to_frame()
             .join(emp_left, how='inner')
             .rename_axis('department')
             .reset_index()
)
employee_df

In [None]:
sns.set(style="whitegrid")
f, ax = plt.subplots(figsize=(12,5))

# Background bars: total head-count per department
sns.set_color_codes("pastel")
sns.barplot(data=employee_df, x='EmployeeTotal', y='department',
            color='lightskyblue', label='Total', ax=ax)

# Foreground bars drawn on the same axes: employees who left
sns.set_color_codes("muted")
sns.barplot(data=employee_df, x='EmployeeLeft', y='department',
            color='r', label='left', ax=ax)

ax.legend(loc="lower right", ncol=2, frameon=True)
ax.set(title='Employees per Department',
       xlabel='Total Employees',
       ylabel='Departments')
sns.despine(left=True, bottom=True)

### ---- Average Monthly Hours ----

In [None]:
f, ax = plt.subplots(figsize=(15,4))

# `fill=` replaces the deprecated `shade=` argument (needs seaborn >= 0.11).
sns.kdeplot(df.loc[df['turnover'] == 0, 'AvgMonthlyHours'],
            color='b', fill=True, label='no turnover', ax=ax)
sns.kdeplot(df.loc[df['turnover'] == 1, 'AvgMonthlyHours'],
            color='r', fill=True, label='turnover', ax=ax)

# Current seaborn does not draw a legend for `label=` on its own, so request it
ax.legend()
ax.set(xlabel='Average Monthly Hours', ylabel='Frequency',
       title='Average Monthly Hours Distribution: Turnover Vs No Turnover')
sns.despine()

### ---- Preprocessing ----

In [None]:
# Row count per salary value, checked before the column is one-hot encoded below
df['salary'].value_counts()

In [None]:
# Columns sent through get_dummies, and columns passed through unchanged.
# 'turnover' is listed first so it ends up as the first column of new_df,
# which the train/test split cell relies on when it selects by position.
cat_var = [
    'turnover',
    'department',
    'salary',
    'promotion',
]
num_var = [
    'satisfaction',
    'evaluation',
    'projectCount',
    'AvgMonthlyHours',
    'yearsAtCompany',
    'accidentAtWork',
]

df_cat = pd.get_dummies(df.loc[:, cat_var], drop_first=True)
df_num = df.loc[:, num_var]

# Encoded block first, pass-through block second
new_df = pd.concat([df_cat, df_num], axis=1)
new_df.head()

### ---- Class Balance ----

In [None]:
# Class balance of the target in the encoded frame, as a share of all rows
turnover_rate = new_df['turnover'].value_counts().div(len(new_df))
turnover_rate

In [None]:
#turnover_rate.values
#turnover_rate.index

In [None]:
f, ax = plt.subplots(figsize=(9,5))

# x and y are passed as vectors, so no `data=` frame is needed.
sns.barplot(x=turnover_rate.index, y=turnover_rate.values, ax=ax)
# turnover_rate holds shares of the workforce, not raw counts, so label the axis as such
ax.set(xlabel='Employee Turnover', ylabel='Proportion of employees',
       title='Employee Turnover Distribution')
sns.despine()


### ---- Split dataset: Training / Test ----

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report,precision_score, recall_score, confusion_matrix, precision_recall_curve 

# Select the target by name rather than by position, so the split stays
# correct even if the column order of new_df changes.
X = new_df.drop(columns='turnover')
y = new_df['turnover']

# 80/20 split, stratified on the target so both parts keep the same turnover rate
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=123,stratify=y)

### ---- Resample data ----

In [None]:
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE 

In [None]:
#upsample Minority class
X_train_up, y_train_up = resample(X_train[y_train == 1],
                                y_train[y_train == 1],
                                replace=True,
                                n_samples=X_train[y_train == 0].shape[0],
                                random_state=1)

X_train_up = np.concatenate((X_train[y_train==0],X_train_up))
y_train_up = np.concatenate((y_train[y_train==0],y_train_up))

# Upscale using SMOTE

sm = SMOTE(random_state=12, ratio=1.0)
X_train_sm,y_train_sm = sm.fit_sample(X_train,y_train)

#downsample Majority class
X_train_dn, y_train_dn = resample(X_train[y_train == 0],
                                y_train[y_train == 0],
                                replace=True,
                                n_samples=X_train[y_train == 1].shape[0],
                                random_state=1)

X_train_dn = np.concatenate((X_train[y_train==1],X_train_dn))
y_train_dn = np.concatenate((y_train[y_train==1],y_train_dn))


print('Original Shape ====>',X_train.shape,y_train.shape)

print('Upsample Shape ====>',X_train_up.shape,y_train_up.shape)

print('SMOTE Shape    ====>',X_train_sm.shape,y_train_sm.shape)

print('Downsample Shape ==>',X_train_dn.shape,y_train_dn.shape)


### ---- Choosing the best sampling technique ----

#### The Best sampling method is SMOTE

#### - Trained logistic regression on all sampled datasets

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Training-set variants to compare, keyed by a display name
resample_methods = {'Original':(X_train,y_train),
                    'Upsampled':(X_train_up,y_train_up),
                    'SMOTE':(X_train_sm,y_train_sm),
                    'Downsampled':(X_train_dn,y_train_dn)}

# NOTE(review): the folds are cut after resampling, so for the upsampled and SMOTE
# sets a row and its copy (or synthetic neighbour) can land in both the training
# and validation folds. The scores for those variants are therefore likely
# optimistic; resampling inside each fold would avoid this.
for method, (X_resampled, y_resampled) in resample_methods.items():
    lr_results = cross_val_score(LogisticRegression(solver='liblinear'),
                                 X_resampled, y_resampled, cv=5, scoring='f1')
    # Report the mean of the five fold scores, with the method name filled in
    print('Mean F1 score on {} data =====> {:.4f}'.format(method, lr_results.mean()))

# Class-weighted alternative on the original data, with the same solver as the
# loop above so the numbers are comparable
cross_val_score(LogisticRegression(solver='liblinear', class_weight='balanced'),
                X_train, y_train, cv=5, scoring='f1').mean()

### ---- Train models ----


- Logistic Regression
- Random Forest
- Gradient Boosting Classification

#### Logistic Regression f1 score: 0.78

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

# Same solver as in the sampling comparison, so the fitted model matches the
# configuration that was cross-validated there.
lr = LogisticRegression(solver='liblinear')
lr = lr.fit(X_train_sm,y_train_sm)

# Hard class predictions feed the classification report
lr_pred = lr.predict(X_test)

# ROC AUC is a ranking metric, so it must be computed from scores (the second
# probability column), not from thresholded 0/1 predictions.
lr_auc = roc_auc_score(y_test, lr.predict_proba(X_test)[:,1])

print('Logistic regression AUC = %2.2f' % lr_auc)

print(classification_report(y_test, lr_pred))

#### Random Forest 

- 5 fold cross validation on Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# The forest size matches the model fitted in the next cell, and the seed makes
# the cross-validated score reproducible between runs.
rf = RandomForestClassifier(n_estimators=100, random_state=123)

# 5-fold F1 on the SMOTE-resampled training data
rf_result = cross_val_score(rf,X_train_sm,y_train_sm,cv=5,scoring='f1')
rf_result.mean()

- Random Forest F1 score: 0.99

In [None]:
# Seeded so that the fitted forest (and the model file saved from it) is reproducible
rf = RandomForestClassifier(n_estimators=100, random_state=123)

rf = rf.fit(X_train_sm,y_train_sm)

# Hard class predictions feed the classification report
rf_pred = rf.predict(X_test)

# ROC AUC is a ranking metric, so it must be computed from scores (the second
# probability column), not from thresholded 0/1 predictions.
rf_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:,1])

print('Random Forest AUC is %2.2f'% rf_auc)

print(classification_report(y_test, rf_pred))


#### Gradient Boosting Classifier

#### Gradient Boosting Classifier f1 score: 0.98

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Seeded so the fitted model and its scores are reproducible
gbc = GradientBoostingClassifier(random_state=123)

# NOTE(review): this model is fitted on the original training data, while the
# logistic regression and random forest are fitted on the SMOTE-resampled data.
# Confirm this is intended before comparing the three models.
gbc =gbc.fit(X_train,y_train)

# Hard class predictions feed the classification report
gbc_pred = gbc.predict(X_test)

# ROC AUC is a ranking metric, so it must be computed from scores (the second
# probability column), not from thresholded 0/1 predictions.
gbc_auc = roc_auc_score(y_test, gbc.predict_proba(X_test)[:,1])
print('Gradient Boosing Classifier AUC Score is %2.2f' % gbc_auc)

print(classification_report(y_test, gbc_pred))


### ROC Graph

In [None]:
# Second column of the forest's predicted probabilities on the test split
# (presumably the turnover == 1 class); the same scores feed the ROC curve below
rf.predict_proba(X_test)[:,1]

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC points for each model, from its predicted probability scores on the test split
lr_fpr,lr_tpr, lr_thresholds = roc_curve(y_test,lr.predict_proba(X_test)[:,1])

rf_fpr,rf_tpr, rf_thresholds = roc_curve(y_test,rf.predict_proba(X_test)[:,1])

gbc_fpr,gbc_tpr, gbc_thresholds = roc_curve(y_test,gbc.predict_proba(X_test)[:,1])

fig, ax = plt.subplots(figsize=(10,7))

# The area in each legend entry is integrated from the curve that is actually
# drawn, so the number always matches the plot, however the *_auc variables
# were computed in earlier cells.
roc_curves = [
    ('Logistic Regression', lr_fpr, lr_tpr),
    ('Random Forest', rf_fpr, rf_tpr),
    ('Gradient Boosting Classification', gbc_fpr, gbc_tpr),
]
for model_name, fpr, tpr in roc_curves:
    ax.plot(fpr, tpr, label='%s (area = %2.2f)' % (model_name, auc(fpr, tpr)))

# Diagonal reference line (a classifier with no skill)
ax.plot([0,1],[0,1], label = 'Base Area')

ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate', title='ROC graph')

ax.legend(loc="lower right")


In [None]:
# Confusion matrix - logistic regression: hard predictions vs. true labels on the test split
confusion_matrix(y_test,lr.predict(X_test))

In [None]:
# Confusion matrix - random forest: hard predictions vs. true labels on the test split
confusion_matrix(y_test,rf.predict(X_test))

In [None]:
# Confusion matrix - gradient boosting classifier: hard predictions vs. true labels on the test split
confusion_matrix(y_test,gbc.predict(X_test))

### - Random Forest feature Importance

In [None]:
# Raw importance array of the fitted forest; it is labelled and sorted in the next cell
rf.feature_importances_

In [None]:
# Label the forest's importances with the feature names, rank them from highest
# to lowest, then move the names out of the index into a regular 'index' column
feature_importance = (
    pd.DataFrame({'importance': rf.feature_importances_}, index=X_test.columns)
      .sort_values('importance', ascending=False)
      .reset_index()
)
feature_importance

In [None]:
fig, ax = plt.subplots(figsize=(10,6))
# Horizontal bars: one per feature, in the ranked order of the table
sns.barplot(data=feature_importance, x='importance', y='index', ax=ax)
ax.set(title='Feature Importance', xlabel='Importance', ylabel='Features')
plt.show()

#### -- Apply random Noise ---

In [None]:
# Work on an explicit copy. pd.DataFrame(X_train) does not copy the data, so
# adding a column to it can also add 'randomNoise' to X_train itself
# (depending on the pandas version).
X_train_noise = X_train.copy()

# Standard-normal noise column, seeded so the experiment is repeatable.
# A feature that ranks below it is no more useful to the forest than noise.
noise_rng = np.random.RandomState(123)
X_train_noise['randomNoise'] = noise_rng.normal(0, 1, X_train_noise.shape[0])

rf_random = RandomForestClassifier(random_state=123)
rf_normal = rf_random.fit(X_train_noise, y_train)

# Importances labelled by feature name, ranked from highest to lowest
feature_importance_noise = pd.DataFrame(rf_random.feature_importances_,
                                        index=X_train_noise.columns,
                                        columns=['importance']).sort_values('importance', ascending=False)
feature_importance_noise = feature_importance_noise.reset_index()
feature_importance_noise

In [None]:
fig, ax = plt.subplots(figsize=(10,6))

# Highlight the noise feature by name. Its rank is not fixed, so a hard-coded
# row position would colour whichever feature happens to land there.
clrs = ['red' if feature == 'randomNoise' else 'green'
        for feature in feature_importance_noise['index']]

sns.barplot(x='importance', y='index', data=feature_importance_noise, palette=clrs)
ax.set(xlabel='Importance', ylabel='Features', title='Feature Importance')
plt.show()

### ---- Save Best Model ----

In [None]:
# scikit-learn no longer ships joblib under sklearn.externals; the standalone
# package (installed as a scikit-learn dependency) is the supported import.
# The old location is kept as a fallback for legacy environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted random forest to disk
joblib.dump(rf,'Employee Turnover/EmployeeTurnoverModel-v1.pkl')