
# Importing the packages

In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import preprocessing

# Loading the dataset and Checking For Null Values

In [2]:
attrition = pd.read_csv("general_data.csv")

In [3]:
attrition.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

# Filling the Null Values

In [5]:
attrition["NumCompaniesWorked"]= attrition["NumCompaniesWorked"].fillna(method = "pad")

In [6]:
attrition["NumCompaniesWorked"].isna().sum()

0

In [7]:
attrition["TotalWorkingYears"]= attrition["TotalWorkingYears"].fillna(method = "pad")

In [8]:
attrition["TotalWorkingYears"].isna().sum()

0

In [9]:
attrition.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


# Converting String to Int

In [10]:
label_encoder = preprocessing.LabelEncoder()

In [11]:
attrition["Attrition"]=label_encoder.fit_transform(attrition["Attrition"])
attrition["BusinessTravel"]=label_encoder.fit_transform(attrition["BusinessTravel"])
attrition["Department"]=label_encoder.fit_transform(attrition["Department"])
attrition["EducationField"]=label_encoder.fit_transform(attrition["EducationField"])
attrition["Gender"]=label_encoder.fit_transform(attrition["Gender"])
attrition["Over18"]=label_encoder.fit_transform(attrition["Over18"])
attrition["JobRole"]=label_encoder.fit_transform(attrition["JobRole"])
attrition["MaritalStatus"]=label_encoder.fit_transform(attrition["MaritalStatus"])

# Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
rf_model = RandomForestClassifier(n_estimators = 1000, max_features = 2,oob_score=True)

In [15]:
attrition.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [16]:
features = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
            'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

In [17]:
rf_model.fit(X= attrition[features],y= attrition["Attrition"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
print("OOB ACCURACY",rf_model.oob_score_)

OOB ACCURACY 0.9997732426303855


# Checking for Important Values

In [19]:
for feature , imp in zip(features,rf_model.feature_importances_):
    print(feature,imp)

Age 0.09202990949744587
BusinessTravel 0.02739420104028073
Department 0.025241963882408364
DistanceFromHome 0.06586348779495763
Education 0.040033467185707064
EducationField 0.03991488148312497
EmployeeCount 0.0
EmployeeID 0.038327118023763075
Gender 0.017024155639446637
JobLevel 0.035950345305835255
JobRole 0.05298330249399505
MaritalStatus 0.03829836967946722
MonthlyIncome 0.08885863466877843
NumCompaniesWorked 0.05366506215053584
Over18 0.0
PercentSalaryHike 0.06273260207033633
StandardHours 0.0
StockOptionLevel 0.03314900954573482
TotalWorkingYears 0.08363558283093234
TrainingTimesLastYear 0.04352899856939019
YearsAtCompany 0.06651470126619662
YearsSinceLastPromotion 0.04184837328915114
YearsWithCurrManager 0.05300583358251253


## Inference : This shows that the important features are Age,DistanceFromHome,MonthlyIncome,PercentSalaryHike,TotalWorkingYears,YearsAtCompany

# Decision Tree

In [67]:
tree_model=tree.DecisionTreeClassifier(max_depth= 12)

In [68]:
predictors = pd.DataFrame([attrition["Age"],attrition["DistanceFromHome"],attrition["MonthlyIncome"],attrition["PercentSalaryHike"],attrition["TotalWorkingYears"],attrition["YearsAtCompany"]]).T

In [69]:
tree_model.fit(X=predictors,y=attrition["Attrition"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=12,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [70]:
tree_model.score(X=predictors,y=attrition["Attrition"])

0.9478458049886621

In [72]:
with open("Dtree2","w") as f:
    f = tree.export_graphviz(tree_model,feature_names=["Age","DistanceFromHome","MonthlyIncome","PercentSalaryHike","TotalWorkingYears","YearsAtCompany"],out_file=f);