# Importing Packages

In [1]:
#Linear Regression
import pandas as pd
from sklearn.model_selection import train_test_split,StratifiedShuffleSplit

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report,confusion_matrix

import pydotplus
from IPython.display import Image
from sklearn import tree

from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

from imblearn.over_sampling import SMOTE



# Data Acquisition

In [2]:
data = pd.read_csv("C:\\Users\\A\\Downloads\\WA_Fn-UseC_-HR-Employee-Attrition.csv")

# Data Analysis

In [3]:
data.shape

(1470, 35)

In [4]:
sum(data.isnull().sum())

0

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
Age                         1470 non-null int64
Attrition                   1470 non-null object
BusinessTravel              1470 non-null object
DailyRate                   1470 non-null int64
Department                  1470 non-null object
DistanceFromHome            1470 non-null int64
Education                   1470 non-null int64
EducationField              1470 non-null object
EmployeeCount               1470 non-null int64
EmployeeNumber              1470 non-null int64
EnvironmentSatisfaction     1470 non-null int64
Gender                      1470 non-null object
HourlyRate                  1470 non-null int64
JobInvolvement              1470 non-null int64
JobLevel                    1470 non-null int64
JobRole                     1470 non-null object
JobSatisfaction             1470 non-null int64
MaritalStatus               1470 non-null object
MonthlyIncome         

In [6]:
data.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7


In [7]:
y = data.Attrition
X = data.drop(['Attrition'],axis=1)

### Pre-Processing

In [8]:
#We may have to encode the data to proceed further
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
X= X.apply(LabelEncoder().fit_transform)

# Data Split

In [9]:
# OverSampling the data
X_resampled_sm, y_resampled_sm = SMOTE().fit_sample(X,y)

In [10]:
X_resampled_sm = pd.DataFrame(X_resampled_sm)
y_resampled_sm = pd.DataFrame(y_resampled_sm)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled_sm,y_resampled_sm,test_size=0.3,random_state=42)

In [12]:
y_train.rename(columns={0:'Attrition'},inplace=True)
y_test.rename(columns={0:'Attrition'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


# Model Building

In [13]:
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()
model3 = RandomForestClassifier()
model4 = MultinomialNB()

In [14]:
model1.fit(X_train,y_train)
model2.fit(X_train,y_train)
model3.fit(X_train,y_train)
model4.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
  This is separate from the ipykernel package so we can avoid doing imports until
  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
pred1=model1.predict(X_test)
pred2=model2.predict(X_test)
pred3=model3.predict(X_test)
pred4=model4.predict(X_test)

In [16]:
pred1_proba=model1.predict_proba(X_test)
pred2_proba=model2.predict_proba(X_test)
pred3_proba=model3.predict_proba(X_test)
pred4_proba=model4.predict_proba(X_test)

### Soft Voting

In [17]:
finalpred=(pred1_proba+pred2_proba+pred3_proba+pred4_proba)/4

In [18]:
finalpred

array([[0.94112602, 0.05887398],
       [0.40406826, 0.59593174],
       [0.95672851, 0.04327149],
       ...,
       [0.11718093, 0.88281907],
       [0.07937322, 0.92062678],
       [0.93742818, 0.06257182]])

In [19]:
ensemble_model = VotingClassifier(estimators=[('lr', model1), ('dt', model2),('rf',model3),('nb',model4)], voting='hard')
ensemble_model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('dt',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,...
                                        

In [20]:
ensemble_prediction = ensemble_model.predict(X_test)

In [21]:
print(classification_report(y_test,ensemble_prediction))

              precision    recall  f1-score   support

          No       0.78      0.94      0.85       369
         Yes       0.93      0.74      0.82       371

    accuracy                           0.84       740
   macro avg       0.85      0.84      0.84       740
weighted avg       0.85      0.84      0.84       740



### Hard Voting

In [22]:
ensemble_model = VotingClassifier(estimators=[('lr', model1), ('dt', model2),('rf',model3),('nb',model4)], voting='hard')
ensemble_model.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('dt',
                              DecisionTreeClassifier(class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,...
                                        

In [23]:
ensemble_prediction = ensemble_model.predict(X_test)

In [24]:
print(classification_report(y_test,ensemble_prediction))

              precision    recall  f1-score   support

          No       0.79      0.93      0.85       369
         Yes       0.92      0.75      0.83       371

    accuracy                           0.84       740
   macro avg       0.85      0.84      0.84       740
weighted avg       0.85      0.84      0.84       740



### XGBoost Classifier

In [25]:
from xgboost import XGBClassifier

In [26]:
# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

In [27]:
xgb_prediction = model.predict(X_test)

In [28]:
print(classification_report(xgb_prediction,y_test))

              precision    recall  f1-score   support

          No       0.98      0.89      0.93       404
         Yes       0.88      0.97      0.93       336

    accuracy                           0.93       740
   macro avg       0.93      0.93      0.93       740
weighted avg       0.93      0.93      0.93       740

