In [0]:
#!pip install tensorflow==2.0.0

In [0]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV
import numpy as np

In [0]:
from google.colab import drive
drive.mount('/content/drive')

The following function will process the data before it is fed into the model for training or predicting. Sex will be encoded with one-hot encoding scheme as sex is not ordinal. There is no need to normalize other features. When selecting a feature to split, information gain will not be influenced by the magnitute of values. When preprocessing data, I am replacing all missing data with 0. It is not the ideal approach, however, because it affects the split of feature with missing data. Will re-approach this issue later. 

In [0]:
def processData(data):
  copy = data.drop(['Sex'], axis=1)
  copy.fillna(0, inplace=True) #Replacing all missing data with 0
  copy['Cabin'][copy['Cabin'] != 0] = 1
  xEncoder = preprocessing.OneHotEncoder(handle_unknown='ignore')
  x1 = data[['Sex']]
  xEncoder.fit(x1)
  x1 = xEncoder.transform(x1).toarray()
  df = pd.DataFrame(data=x1, columns=['Male', 'Female'])
  output = pd.concat([copy, df], axis=1)
  #preprocessing.normalize(output, axis=1, copy=False) There is no need to normalize
  #print(output.head())
  return output

In [0]:
df = pd.read_csv('/content/drive/My Drive/Dataset/Titanic/train.csv', header=0)
train, test = train_test_split(df, test_size=0.2)
train.reset_index(inplace=True)
test.reset_index(inplace=True)
trainX = train.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Embarked'], axis=1)
testX = test.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Embarked'], axis=1)
trainY = train[['Survived']]
testY = test[['Survived']]
trainX = processData(trainX)
testX = processData(testX)

In [16]:
DT = DecisionTreeClassifier(criterion='entropy', random_state=42)
DT.fit(trainX, trainY)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best')

In [17]:
DT.score(testX, testY)

0.7206703910614525

In [18]:
RF = RandomForestClassifier(criterion='entropy')
RF.fit(trainX, trainY)

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [19]:
RF.score(testX, testY)

0.776536312849162

Train a single decision tree model with some hyperparameter tuning using grid search

In [20]:
#tree.plot_tree(DT)
hyperPara = {'splitter':['best', 'random'], 'criterion':['gini', 'entropy'], 'min_samples_leaf':range(1, 21), 'min_samples_split':range(2, 41), 'min_impurity_decrease':np.linspace(0, 1, 11)}
#gridSearch = GridSearchCV(DT, hyperPara)
gridSearch = GridSearchCV(DecisionTreeClassifier(random_state=42), hyperPara)
gridSearch.fit(trainX, trainY)
print(gridSearch.best_params_)
gridSearch.best_estimator_.score(testX, testY)



{'criterion': 'entropy', 'min_impurity_decrease': 0.0, 'min_samples_leaf': 3, 'min_samples_split': 10, 'splitter': 'random'}


0.7988826815642458