In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore")

## Bank Loan Dataset

### Random Forest Classifier

In [6]:
df = pd.read_excel("Bank_Personal_Loan_Modelling.xlsx",sheet_name=1)
df.head(2)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0


In [7]:
df.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [8]:
df1 = df.drop(["ID","ZIP Code"], axis=1)
df1.head(2)

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0


In [9]:
df2 = df1.dropna()

In [10]:
df3 = df2.drop_duplicates()

In [11]:
df3["CCAvg"] = np.round(df3["CCAvg"])

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
rf_model = RandomForestClassifier(n_estimators=1000,max_features=2,oob_score=True)

In [14]:
features = ['Age', 'Experience', 'Income','Family', 'CCAvg',
       'Education', 'Mortgage','Securities Account',
       'CD Account', 'Online', 'CreditCard']

In [15]:
rf_model.fit(X=df3[features],y=df3["Personal Loan"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [16]:
print("OOB Accuracy:",rf_model.oob_score_)

OOB Accuracy: 0.9859635051132946


In [18]:
for features,imp in zip(features,rf_model.feature_importances_):
    print(features,imp)

Age 0.05054492946305008
Experience 0.0501102995567064
Income 0.3643964756153079
Family 0.1001538427330361
CCAvg 0.1392498712524992
Education 0.16529151520666097
Mortgage 0.04807505754192634
Securities Account 0.006437168213841238
CD Account 0.05541251383090458
Online 0.009352711144139483
CreditCard 0.01097561544192755


### As per above result, Income, CCAvg, Family and Education can be considered as the most important features for Decision Tree. 

### Decision Tree Classifier

In [20]:
from sklearn import tree

In [21]:
tree_model = tree.DecisionTreeClassifier()

In [22]:
tree_model = tree.DecisionTreeClassifier(max_depth=8,max_leaf_nodes=10)

In [23]:
predictors = pd.DataFrame([df3["Education"],df3["CCAvg"],df3["Income"],df3["Family"]]).T

In [24]:
tree_model.fit(X=predictors,y=df3["Personal Loan"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
                       max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [25]:
with open("Dtree.dot",'w') as f:
    f = tree.export_graphviz(tree_model,feature_names=["Education","CCAvg","Income","Family"],out_file=f)

### Rules

## Attrition Dataset

### Random Forest Classifier

In [32]:
df4 = pd.read_csv("general_data.csv")

In [33]:
df4.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [34]:
df5 = df4.dropna()

In [92]:
df6 = df5.drop(["EmployeeCount","EmployeeID","Over18"],axis=1)

In [39]:
df6.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,...,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,Female,1,Healthcare Representative,...,131160,1.0,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,Female,1,Research Scientist,...,41890,0.0,23,8,1,6.0,3,5,1,4


In [43]:
from sklearn import preprocessing

In [44]:
le = preprocessing.LabelEncoder()

In [45]:
df6["Attrition"] = le.fit_transform(df6["Attrition"])

In [46]:
df6["BusinessTravel"] = le.fit_transform(df6["BusinessTravel"])

In [47]:
df6["Department"] = le.fit_transform(df6["Department"])

In [48]:
df6["EducationField"] = le.fit_transform(df6["EducationField"])

In [49]:
df6["Gender"] = le.fit_transform(df6["Gender"])

In [50]:
df6["MaritalStatus"] = le.fit_transform(df6["MaritalStatus"])

In [51]:
df6["JobRole"] = le.fit_transform(df6["JobRole"])

In [53]:
df7 = df6.drop(["StandardHours"],axis=1)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [57]:
rf_model = RandomForestClassifier(n_estimators=1000,max_features=2,oob_score=True)

In [58]:
features = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'PercentSalaryHike','StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

In [60]:
rf_model.fit(X=df7[features],y=df7["Attrition"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [61]:
print("OOB Accuracy:",rf_model.oob_score_)

OOB Accuracy: 1.0


In [62]:
for features,imp in zip(features,rf_model.feature_importances_):
    print(features,imp)

Age 0.09732199493699017
BusinessTravel 0.027927924435656868
Department 0.025832909892841833
DistanceFromHome 0.06991944096112329
Education 0.04017886870487844
EducationField 0.041347140664795445
Gender 0.018762206066075086
JobLevel 0.03752797460403193
JobRole 0.055663724350019414
MaritalStatus 0.03990699274501416
MonthlyIncome 0.09359254803444136
NumCompaniesWorked 0.05604484417408818
PercentSalaryHike 0.06535378400776413
StockOptionLevel 0.034571378282607805
TotalWorkingYears 0.0860070479949808
TrainingTimesLastYear 0.04477582196761492
YearsAtCompany 0.06906253754614655
YearsSinceLastPromotion 0.04291917167040378
YearsWithCurrManager 0.05328368896052586


### As per above result, Age, Monthly Income, Total Working Years can be considered as the most important features for Decision Tree.

### Decision Tree Classifier

In [63]:
from sklearn import tree

In [64]:
tree_model = tree.DecisionTreeClassifier()

In [65]:
tree_model = tree.DecisionTreeClassifier(max_depth=6,max_leaf_nodes=10)

In [73]:
predictors = pd.DataFrame([df7["Age"],df7["MonthlyIncome"],df7["TotalWorkingYears"]]).T

In [74]:
tree_model.fit(X=predictors,y = df7["Attrition"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
                       max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [75]:
with open("Dtree1.dot",'w') as f:
    f = tree.export_graphviz(tree_model,feature_names=["Age","MonthlyIncome","TotalWorkingYears"],out_file=f)

### Rules

## Titanic Dataset

### Decision Tree Classifier

In [76]:
df8 = pd.read_csv("titanic_train.csv")

In [77]:
df9 = df8.dropna()

In [78]:
from sklearn import preprocessing

In [79]:
le = preprocessing.LabelEncoder()

In [80]:
df9["Sex"] = le.fit_transform(df9["Sex"])

In [82]:
from sklearn import tree

In [83]:
tree_model = tree.DecisionTreeClassifier(max_depth=6,max_leaf_nodes=10)

In [85]:
predictors = pd.DataFrame([df9["Fare"],df9["Age"],df9["Sex"]]).T

In [86]:
tree_model.fit(X=predictors,y = df9["Survived"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
                       max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [88]:
with open("Dtree2.dot",'w') as f:
    f = tree.export_graphviz(tree_model,feature_names=["Fare","Age","Sex"],out_file=f)

### Rules