<ins>Instructions</ins>
1. Titanic dataset: Build a decision tree and predict outcome with the test dataset   
   {DV: Survived / IDV: Age, Gender, Fare}<br><br>

2. Attrition dataset: Build a random forest to identify relevant features, then build a decision tree using those features  
   {DV: Attrition / IDV: Output of Random Forest}<br><br>

3. Bank loan dataset: Same as above  
   {DV: Personal Loan / IDV: Output of Random Forest}

In [1]:
# Add collapse buttons
from IPython.core.display import display, HTML
with open('collapse.html') as f:
    display(HTML(f.read()))

In [2]:
# Import libraries
import pandas as pd
import os
from sklearn import tree
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

## 1.1 Titanic dataset: Decision Tree

In [3]:
# Data: https://www.kaggle.com/c/titanic/data
df = pd.read_csv('titanic_train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Handle missing values

In [4]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
df.shape

(891, 12)

In [6]:
mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(value=mean)
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Handle categorical values

In [7]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
encoder = preprocessing.LabelEncoder()
df['Sex'] = encoder.fit_transform(df['Sex'])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


### Train model

In [9]:
model = tree.DecisionTreeClassifier(max_depth=6)
model.fit(X=df[['Age', 'Sex', 'Fare']], y=df['Survived'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [10]:
model.score(X=df[['Age', 'Sex', 'Fare']], y=df['Survived'])

0.8249158249158249

In [11]:
with open("DT_titanic.dot", "w") as f:
    tree.export_graphviz(
        model,
        feature_names=['Age', 'Sex', 'Fare'],
        out_file=f)

    print("Decision Tree exported in", f.name)

Decision Tree exported in DT_titanic.dot


Use http://webgraphviz.com/ to visualize it

* Female & Fare > 24.808 & Fare <= 25.698 &rarr; 3 died
* Female & Fare > 25.698 & Fare <= 26.125 & Age <= 25.5 &rarr; 5 survived
* Female & Fare > 26.125 & Fare <= 27.825 &rarr; 10 survived
* Female & Fare > 27.825 & Fare <= 28.856 &rarr; 4 died
* Female & Fare > 28.856 & Fare <= 48.2 & Age > 21.5 & Age <= 38.5 &rarr; 7 survived
* Female & Fare > 28.856 & Fare <= 36.688 & Age > 38.5 &rarr; 3 died
* Female & Fare > 48.2 & Age <= 8.0 &rarr; 1 died
* Female & Fare > 48.2 & Age > 8.0 & Age <= 24.5 &rarr; 24 survived
* Female & Fare > 48.2 & Age > 24.5 & Age <= 25.5 &rarr; 1 died
* Female & Fare > 48.2 & Age > 29.85 &rarr; 49 survived
* Male & Fare <= 20.825 & Age <= 6.5 &rarr; 8 survived
* Male & Fare > 20.825 & Age <= 0.96 &rarr; 2 survived
* Male & Fare > 39.344 & Fare <= 64.379 & Age > 0.96 & Age <= 6.5 &rarr; 3 died
* Male & Fare > 64.379 & Age > 0.96 & Age <= 6.5 &rarr; 1 survived
* Male & Fare <= 26.269 & Age > 6.5 & Age <= 10.0 &rarr; 2 survived
* Male & Fare <= 26.269 & Age > 10.0 & Age <= 11.5 &rarr; 1 died
* Male & Fare <= 26.269 & Age > 11.5 & Age <= 13.5 &rarr; 1 survived
* Male & Fare <= 7.91 & Age > 32.5 &rarr; 36 died
* Male & Fare > 26.269 & Fare <= 26.469 & Age > 6.5 &rarr; 4 survived
* Male & Fare > 387.665 & Age > 6.5 &rarr; 2 survived

## 1.2 Titanic dataset: Predict outcome

In [12]:
df_test = pd.read_csv('titanic_test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


### Handle missing values

In [13]:
df_test['Age']  = df_test['Age'].fillna(value=mean)
df_test['Fare'] = df_test['Fare'].fillna(value=df['Fare'].mean())
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

### Handle categorical values

In [14]:
df_test['Sex'] = encoder.fit_transform(df_test['Sex'])
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S


### Predict outcome

In [15]:
predict = model.predict(X=df_test[['Age', 'Sex', 'Fare']])
predict[:5]

array([0, 1, 0, 0, 1])

In [16]:
df_predict = pd.DataFrame({
    "PassengerId": df_test['PassengerId'],
    "Survived": predict})
df_predict.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [17]:
df_predict.to_csv("titanic_output.csv", index=False)
print("Predictions saved in titanic_output.csv")

Predictions saved in titanic_output.csv


---

## 2-1 Attrition dataset: Random Forest

In [18]:
df = pd.read_csv('../Day 10/general_data.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


### Handle missing values

df.isna().sum()

In [19]:
df.shape

(4410, 24)

In [20]:
df.dropna(inplace=True)
df.isna().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

### Handle categorical data

In [21]:
df.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeID                   int64
Gender                      object
JobLevel                     int64
JobRole                     object
MaritalStatus               object
MonthlyIncome                int64
NumCompaniesWorked         float64
Over18                      object
PercentSalaryHike            int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears          float64
TrainingTimesLastYear        int64
YearsAtCompany               int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object

In [22]:
df.drop(['EmployeeID', 'EmployeeCount', 'Over18', 'StandardHours'], axis=1, inplace=True)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,Female,1,Healthcare Representative,Married,131160,1.0,11,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,Female,1,Research Scientist,Single,41890,0.0,23,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,Male,4,Sales Executive,Married,193280,1.0,15,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,Male,3,Human Resources,Married,83210,3.0,11,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,Male,1,Sales Executive,Single,23420,4.0,12,2,9.0,2,6,0,4


In [23]:
encoder = preprocessing.LabelEncoder()

for col in df.select_dtypes('object').columns:
    df[col] = encoder.fit_transform(df[col])

df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,0,2,2,6,2,1,0,1,0,1,131160,1.0,11,0,1.0,6,1,0,0
1,31,1,1,1,10,1,1,0,1,6,2,41890,0.0,23,1,6.0,3,5,1,4
2,32,0,1,1,17,4,4,1,4,7,1,193280,1.0,15,3,5.0,2,5,0,3
3,38,0,0,1,2,5,1,1,3,1,1,83210,3.0,11,3,13.0,5,8,7,5
4,32,0,2,1,10,1,3,1,1,7,2,23420,4.0,12,2,9.0,2,6,0,4


### Train model

In [24]:
X=df.drop('Attrition', axis=1)
y=df['Attrition']

In [25]:
model = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)
model.fit(X=X, y=y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
print("OOB Accuracy:", model.oob_score_)

OOB Accuracy: 1.0


In [27]:
df_rank = pd.DataFrame(data=[], columns=['Feature', 'Importance'])

for feature, imp in zip(X.columns, model.feature_importances_):
    df_rank.loc[len(df_rank)] = [feature, imp]

df_rank.sort_values(by=['Importance'], ascending=False)

Unnamed: 0,Feature,Importance
0,Age,0.097472
10,MonthlyIncome,0.094938
14,TotalWorkingYears,0.086064
3,DistanceFromHome,0.070769
16,YearsAtCompany,0.068081
12,PercentSalaryHike,0.065728
11,NumCompaniesWorked,0.055431
8,JobRole,0.054951
18,YearsWithCurrManager,0.053305
15,TrainingTimesLastYear,0.04509


## 2-2 Attrition dataset: Decision Tree

In [28]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,0,2,2,6,2,1,0,1,0,1,131160,1.0,11,0,1.0,6,1,0,0
1,31,1,1,1,10,1,1,0,1,6,2,41890,0.0,23,1,6.0,3,5,1,4
2,32,0,1,1,17,4,4,1,4,7,1,193280,1.0,15,3,5.0,2,5,0,3
3,38,0,0,1,2,5,1,1,3,1,1,83210,3.0,11,3,13.0,5,8,7,5
4,32,0,2,1,10,1,3,1,1,7,2,23420,4.0,12,2,9.0,2,6,0,4


In [29]:
features = [
    'Age',
    'MonthlyIncome',
    'TotalWorkingYears',
    'DistanceFromHome',
    'YearsAtCompany',
    'PercentSalaryHike']

model = tree.DecisionTreeClassifier(max_depth=12)
model.fit(X=df[features], y=df['Attrition'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=12,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [30]:
model.score(X=df[features], y=df['Attrition'])

0.9415791875855773

In [31]:
with open("DT_Attrition.dot", "w") as f:
    tree.export_graphviz(model, feature_names=features, out_file=f)

    print("Decision Tree exported in", f.name)

Decision Tree exported in DT_Attrition.dot


---

## 3-1 Bank loan dataset: Random Forest

In [32]:
df = pd.read_excel('../Day 21/Bank_Personal_Loan_Modelling.xlsx', sheet_name=1)
df.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


### Handle missing values

In [33]:
df.isna().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

### Handle categorical values

In [34]:
df.dtypes

ID                      int64
Age                     int64
Experience              int64
Income                  int64
ZIP Code                int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage                int64
Personal Loan           int64
Securities Account      int64
CD Account              int64
Online                  int64
CreditCard              int64
dtype: object

### Train model

In [35]:
X = df.drop(['Personal Loan', 'ID', 'ZIP Code'], axis=1)
y = df['Personal Loan']
X.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,1,0,0,0
1,45,19,34,3,1.5,1,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,1


In [36]:
model = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)
model.fit(X=X, y=y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [37]:
print("OOB Accuracy:", model.oob_score_)

OOB Accuracy: 0.9872


In [38]:
df_rank = pd.DataFrame(data=[], columns=['Feature', 'Importance'])

for feature, imp in zip(X.columns, model.feature_importances_):
    df_rank.loc[len(df_rank)] = [feature, imp]

df_rank.sort_values(by=['Importance'], ascending=False)

Unnamed: 0,Feature,Importance
2,Income,0.341199
4,CCAvg,0.191675
5,Education,0.158814
3,Family,0.095146
8,CD Account,0.055024
0,Age,0.045161
6,Mortgage,0.044443
1,Experience,0.044366
10,CreditCard,0.010012
9,Online,0.008407


## 3-2 Bank loan dataset: Decision Tree

In [39]:
features = [
    'Income',
    'CCAvg',
    'Education',
    'Family',
    'CD Account']

model = tree.DecisionTreeClassifier(max_depth=10)
model.fit(X=df[features], y=df['Personal Loan'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [40]:
model.score(X=df[features], y=df['Personal Loan'])

0.9972

In [41]:
with open("DT_Bankloan.dot", "w") as f:
    tree.export_graphviz(model, feature_names=features, out_file=f)

    print("Decision Tree exported in", f.name)

Decision Tree exported in DT_Bankloan.dot
