#Using Decision Trees, Random Forest and Support Vector Classfiers on Employee dataset

### Import all required libraries

In [None]:
from pandas import read_csv, get_dummies, Series, DataFrame
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from sklearn import metrics
from sklearn import ensemble

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV  #method 2

from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline    #method 3

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV   #method 3

### Load and Read Dataset

In [None]:
data = read_csv('/content/drive/MyDrive/ML_Stats/Emloyees.csv')

## Data Exploration & Preparation

In [None]:
data.head()

Unnamed: 0,Age,PastEmployee,BusinessTravel,Department,DistanceFromHome,EducationField,EnvironmentSatisfaction,Gender,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,OverTime
0,46,No,Travel_Rarely,Human Resources,5,Medical,2,Male,Human Resources,Divorced,5021,8,Yes
1,37,Yes,Travel_Rarely,Human Resources,6,Human Resources,3,Male,Human Resources,Divorced,2073,4,Yes
2,59,No,Non-Travel,Human Resources,2,Human Resources,3,Female,Manager,Married,18844,9,No
3,54,No,Non-Travel,Human Resources,26,Human Resources,4,Female,Manager,Single,17328,2,Yes
4,26,No,Travel_Rarely,Human Resources,25,Life Sciences,3,Female,Human Resources,Married,2942,1,No


In [None]:
data.describe()

Unnamed: 0,Age,DistanceFromHome,EnvironmentSatisfaction,MonthlyIncome,NumCompaniesWorked
count,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,9.192517,2.721769,6502.931293,2.693197
std,9.135373,8.106864,1.093082,4707.956783,2.498009
min,18.0,1.0,1.0,1009.0,0.0
25%,30.0,2.0,2.0,2911.0,1.0
50%,36.0,7.0,3.0,4919.0,2.0
75%,43.0,14.0,4.0,8379.0,4.0
max,60.0,29.0,4.0,19999.0,9.0


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1470 non-null   int64 
 1   PastEmployee             1470 non-null   object
 2   BusinessTravel           1470 non-null   object
 3   Department               1470 non-null   object
 4   DistanceFromHome         1470 non-null   int64 
 5   EducationField           1470 non-null   object
 6   EnvironmentSatisfaction  1470 non-null   int64 
 7   Gender                   1470 non-null   object
 8   JobRole                  1470 non-null   object
 9   MaritalStatus            1470 non-null   object
 10  MonthlyIncome            1470 non-null   int64 
 11  NumCompaniesWorked       1470 non-null   int64 
 12  OverTime                 1470 non-null   object
dtypes: int64(5), object(8)
memory usage: 149.4+ KB


### 1. Data Encoding

In [None]:
#Binary encosing (for 2 entities)
data['PastEmployee'] = data['PastEmployee'].map({'Yes':0, 'No': 1})
data['Gender'] = data['Gender'].map({'Male': 1, 'Female': 0})
data['OverTime'] = data['OverTime'].map({'Yes': 1, 'No': 0})

#One-hot encoding - get dummies (for 3 or more entities)
data2 = get_dummies(data, columns = ['BusinessTravel', 'Department', 'EducationField', 'JobRole', 'MaritalStatus'])

data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 32 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   Age                                1470 non-null   int64
 1   PastEmployee                       1470 non-null   int64
 2   DistanceFromHome                   1470 non-null   int64
 3   EnvironmentSatisfaction            1470 non-null   int64
 4   Gender                             1470 non-null   int64
 5   MonthlyIncome                      1470 non-null   int64
 6   NumCompaniesWorked                 1470 non-null   int64
 7   OverTime                           1470 non-null   int64
 8   BusinessTravel_Non-Travel          1470 non-null   bool 
 9   BusinessTravel_Travel_Frequently   1470 non-null   bool 
 10  BusinessTravel_Travel_Rarely       1470 non-null   bool 
 11  Department_Human Resources         1470 non-null   bool 
 12  Department_Research 

###2. Dividing (X & Y) - seperatingg output

In [None]:
#Dropping output feature
X = data2.drop('PastEmployee', axis = 1)
Y = data2['PastEmployee']

print('Rows and Columns of X: ', X.shape)
print('Rows and columns of Y: ', Y.shape)

Rows and Columns of X:  (1470, 31)
Rows and columns of Y:  (1470,)


### 3. Data Scaling

In [None]:
#importing standard scalar form sklearn preprocessing library
X_scaled = StandardScaler().fit_transform(X)

###4. Data Splitting (to x_train, x_test, y_train, y_test)

In [None]:
#importing test_train_split from sklearn.modelselection library
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size = 0.2, random_state = 100)  #setting Test set as 20%


###5. Data Balancing  - SMOTE

In [None]:
#importing SMOTE from imblearn library
X_train, Y_train = SMOTE(random_state = 10).fit_resample(X_train, Y_train)

In [None]:
print(X_train.shape)
print(Y_train.shape)

(1974, 31)
(1974,)


## 1. Decision Tree Classifier

#### a. Using Method 2 (Gridsearch CV) first - toget the Max depth

In [None]:
#Import library - from sklearn.model_selection import GridSearchCV

DT_classifier2 = DecisionTreeClassifier(criterion = 'entropy')      #building the DT classifier
depth = {'max_depth': [5, 2, 3, 4, 5, 10, 15, 18, 20, 22, 25, 27, 29, 31]}

grid_search_DT = GridSearchCV(estimator = DT_classifier2, param_grid = depth, scoring = 'precision', cv =  10)  #building
grid_search_DT.fit(X_scaled, Y)    #training, testing, evaluation & ranking

DT_best_parameters = grid_search_DT.best_params_
print(DT_best_parameters)
DT_best_result = grid_search_DT.best_score_
print('Best Result is:', round(DT_best_result,4))

{'max_depth': 22}
Best Result is: 0.8798


####b. Now using Method 1, as we have the max_depth (hyperparameter)

In [None]:
#import library - from sklearn.tree import DecisionTreeClassifier
#import library - from sklearn import metrics  - for Accuracy & Confusion Matrix

DT_classifier1 = DecisionTreeClassifier(criterion = 'entropy', max_depth = 10)     # 1. building Cclassifier using the correct max_depth
DT_classifier1.fit(X_train, Y_train)   # 2. training
Y_pred1 = DT_classifier1.predict(X_test)    # 3. Testing

# 4. Evaluation        #Confusion Matrix
Accuracy_DT = metrics.accuracy_score(Y_test, Y_pred1)      #Calculating Accuracy
print('Decision Tree Accuracy is: ', round(Accuracy_DT,4))

conf_matrix_DT = metrics.confusion_matrix(Y_test, Y_pred1)   #Calculating Confusion Matrix
print('Decision Tree Confusion Matrix is: ')
print(conf_matrix_DT)

recall_DT = metrics.recall_score(Y_test, Y_pred1)         #Calculating Recall
print('Decision Tree Recall is: ', round(recall_DT,4))

precision_DT = metrics.precision_score(Y_test, Y_pred1)    #Calculating Precision
print('Decision Tree Precision is: ', round(precision_DT,4))

f1_DT = metrics.f1_score(Y_test, Y_pred1)               #Calculating f1
print('Decision Tree f1 is: ', round(f1_DT,4))

#finding out the important features
imp_features_DT = Series(DT_classifier1.feature_importances_, index = list(X)).sort_values(ascending = False)
print('')
print(imp_features_DT)

Decision Tree Accuracy is:  0.7653
Decision Tree Confusion Matrix is: 
[[ 20  28]
 [ 41 205]]
Decision Tree Recall is:  0.8333
Decision Tree Precision is:  0.8798
Decision Tree f1 is:  0.8559

OverTime                             0.250251
MonthlyIncome                        0.125461
EnvironmentSatisfaction              0.120381
Age                                  0.100706
DistanceFromHome                     0.057879
MaritalStatus_Single                 0.055305
NumCompaniesWorked                   0.054853
Department_Research & Development    0.035329
BusinessTravel_Travel_Frequently     0.033615
MaritalStatus_Married                0.024413
JobRole_Laboratory Technician        0.023372
JobRole_Sales Executive              0.022908
Gender                               0.017954
EducationField_Medical               0.011655
EducationField_Life Sciences         0.009522
MaritalStatus_Divorced               0.009114
JobRole_Manufacturing Director       0.008450
EducationField_Technical 

## 2. Random Forest Classifier

####a. Using method 3 first - Pipeline - To get n_estimators (HYperparameter)

In [None]:
#Using method 3 - pipeline & Gridsearch CV
#building the classifier
RF_classifier2 = Pipeline([('balancing', SMOTE(random_state = 101)), ('classification', RandomForestClassifier(criterion = 'entropy', max_features = 'sqrt', random_state = 1))])
no_trees = {'classification of n_estimators is': [10,20,30,40,50,100]}

grid_search_RF = GridSearchCV(estimator = RF_classifier2, param_grid = no_trees, scoring = 'precision', cv = 5)
grid_search_RF.fit(X_scaled, Y)

RF_best_parameters = grid_search_RF.best_params_
print(RF_best_parameters)
RF_best_result = grid_search_RF.best_score_
print('Best Result is:', round(RF_best_result,4))


ValueError: Invalid parameter 'classification of n_estimators is' for estimator Pipeline(steps=[('balancing', SMOTE(random_state=101)),
                ('classification',
                 RandomForestClassifier(criterion='entropy', random_state=1))]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].

####b. Use Method 1 with the best number of trees - as we have the Hyperparameter - n_estimators

In [None]:
# Random Forest Classifier (method 1)
#from sklearn.ensemble import RandomForestClassifier

RF_classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', max_features = 'sqrt', random_state = 10)  # 1.building model classifier
RF_classifier.fit(X_train, Y_train)  # 2.training
Y_pred_RF = RF_classifier.predict(X_test)  # 3.testing

# 4.evaluation          & Confusion Matrix
Accuracy_RF = metrics.accuracy_score(Y_test, Y_pred_RF)      #Calculating Accuracy
print('Random Forest Accuracy is: ', round(Accuracy_RF,4))

conf_matrix_RF = metrics.confusion_matrix(Y_test, Y_pred_RF)   #Calculating Confusion Matrix
print('Random Forest Confusion Matrix is: ')
print(conf_matrix_RF)

recall_RF = metrics.recall_score(Y_test, Y_pred_RF)         #Calculating Recall
print('Random Forest Recall is: ', round(recall_RF,4))

precision_RF = metrics.precision_score(Y_test, Y_pred_RF)    #Calculating Precision
print('Random Forest Precision is: ', round(precision_RF,4))

f1_RF = metrics.f1_score(Y_test, Y_pred_RF)               #Calculating f1
print('Random Forest f1 is: ', round(f1_RF,4))

#finding out the important features
imp_features_RF = Series(RF_classifier.feature_importances_, index = list(X)).sort_values(ascending = False)
print('')
print(imp_features_RF)

Random Forest Accuracy is:  0.8605
Random Forest Confusion Matrix is: 
[[ 19  29]
 [ 12 234]]
Random Forest Recall is:  0.9512
Random Forest Precision is:  0.8897
Random Forest f1 is:  0.9194

OverTime                             0.140478
MonthlyIncome                        0.108444
Age                                  0.097778
EnvironmentSatisfaction              0.089860
DistanceFromHome                     0.077378
NumCompaniesWorked                   0.070032
MaritalStatus_Single                 0.049197
Gender                               0.046607
MaritalStatus_Married                0.035391
BusinessTravel_Travel_Frequently     0.028169
MaritalStatus_Divorced               0.025896
EducationField_Life Sciences         0.022930
BusinessTravel_Travel_Rarely         0.021230
EducationField_Medical               0.020960
Department_Sales                     0.015845
JobRole_Laboratory Technician        0.015050
Department_Research & Development    0.014633
JobRole_Sales Executive  

## 3. Support Vector Classifier (SVM)

#### a. Using method 3 first - Pipeline - To get n_estimators (HYperparameter)

In [None]:
#Using method 3 - pipeline & Gridsearch CV
#building the classifier
SV_classifier2 = Pipeline([('balancing', SMOTE(random_state = 101)), ('classification', SVC())])   #Building model/classifier
kernels_c = {'classification__kernel': ['linear','poly','rbf','sigmoid'], 'classification__C': [.001,.01,.1,1,10,100]}

grid_search1 = GridSearchCV(estimator=SV_classifier2, param_grid=kernels_c, scoring='recall', cv=5)   # training
grid_search1.fit(X_scaled, Y)   # testing

SV_best_parameters = grid_search1.best_params_
print(SV_best_parameters)
SV_best_result = grid_search1.best_score_
print('Best Result is:', round(SV_best_result,4))


{'classification__C': 0.001, 'classification__kernel': 'poly'}
Best Result is: 1.0


#### b. Use Method 1 with the best number of trees - as we have the Hyperparameter - c & kernals

In [None]:
#building SVM using method 1
#from sklearn.svm import SVC - importing library
SV_classifier = SVC(kernel = 'poly')        # 1.building model/classifier
SV_classifier.fit(X_train, Y_train)       # 2.training
Y_pred_SV = SV_classifier.predict(X_test)  # 3.testing

# 4.evaluation          & Confusion Matrix
Accuracy_SV = metrics.accuracy_score(Y_test, Y_pred_SV)      #Calculating Accuracy
print('Support Vector Accuracy is: ', round(Accuracy_SV,4))

conf_matrix_SV = metrics.confusion_matrix(Y_test, Y_pred_SV)   #Calculating Confusion Matrix
print('Support Vector Confusion Matrix is: ')
print(conf_matrix_SV)

recall_SV = metrics.recall_score(Y_test, Y_pred_SV)         #Calculating Recall
print('Support Vector Recall is: ', round(recall_SV,4))

precision_SV = metrics.precision_score(Y_test, Y_pred_SV)    #Calculating Precision
print('Support Vector Precision is: ', round(precision_SV,4))

f1_SV = metrics.f1_score(Y_test, Y_pred_SV)               #Calculating f1
print('Support Vector f1 is: ', round(f1_SV,4))

#finding out the important features
#imp_features_SV = Series(SV_classifier.feature_importances_, index = list(X)).sort_values(ascending = False)
#print('')
#print(imp_features_SV)

Support Vector Accuracy is:  0.7483
Support Vector Confusion Matrix is: 
[[ 27  21]
 [ 53 193]]
Support Vector Recall is:  0.7846
Support Vector Precision is:  0.9019
Support Vector f1 is:  0.8391
