# Compulsory Task 1

In [363]:
# import packages 
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

import matplotlib.pyplot as plt
%matplotlib inline

# Load the Titanic passenger data from the CSV file into a DataFrame
titanic_data = pd.read_csv(filepath_or_buffer="titanic.csv")

### Bringing in code from the previous task to clean the Titanic dataset

In [364]:
titanic_data.isna().sum()  # number of missing values (NaNs) in each column

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [365]:
# Replace the missing ages with the mean of the Age column
mean_age = titanic_data['Age'].mean()
titanic_data['Age'] = titanic_data['Age'].fillna(mean_age)

In [366]:
titanic_data.isna().sum()  # re-count the missing values after filling Age

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [367]:
# Drop the columns that will not be used as features for the ML models.
# The result is reassigned (instead of inplace=True) and missing labels are
# ignored, so re-running this cell is a no-op rather than a KeyError.
unused_columns = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
titanic_data = titanic_data.drop(columns=unused_columns, errors='ignore')

In [368]:
# Show the remaining column labels, then the dtype and non-null count of each
remaining_columns = titanic_data.columns
print(remaining_columns)
titanic_data.info()

Index(['Survived', 'Pclass', 'Sex', 'Age', 'Fare'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
Fare        891 non-null float64
dtypes: float64(2), int64(2), object(1)
memory usage: 34.9+ KB


In [369]:
# Survived is the dependent variable; Pclass, Sex, Age and Fare are the
# independent variables for the decision tree models.
titanic_data['Survived'] = titanic_data['Survived'].astype(str) # converting to str so that it can be categorical
print(titanic_data['Survived'].unique()) # testing that there are no strange outliers
print(titanic_data['Sex'].unique()) # inspecting unique values in Sex column

# Encode Sex as integers for the tree-based classifiers.
# The encoded Series is assigned back to the column: calling
# replace(..., inplace=True) on a selected column is a chained in-place update,
# which does not modify the DataFrame when pandas Copy-on-Write is active.
# Values that are not in the mapping are passed through unchanged so the cell
# can be re-run; astype then fails loudly if anything non-integer remains.
sex_codes = {'male': 1, 'female': 0}
titanic_data['Sex'] = titanic_data['Sex'].map(lambda value: sex_codes.get(value, value)).astype('int64')

['0' '1']
['male' 'female']


In [370]:
feature_positions = [1, 2, 3, 4]  # column positions of the independent variables
target_position = 0               # column position of the dependent variable

# Independent variables as a 2-D numpy array with one column per feature
X = titanic_data.iloc[:, feature_positions].values.reshape(-1, 4)
# Dependent categorical variable as a numpy column vector
y = titanic_data.iloc[:, target_position].values.reshape(-1, 1)

# Print the first five rows of each array to check they look correct
print(X[:5])
print(y[:5])

[[ 3.      1.     22.      7.25  ]
 [ 1.      0.     38.     71.2833]
 [ 3.      0.     26.      7.925 ]
 [ 1.      0.     35.     53.1   ]
 [ 3.      1.     35.      8.05  ]]
[['0']
 ['1']
 ['1']
 ['1']
 ['0']]


### Splitting between 2 different data sets (training and test) with 75/25 relationship

In [371]:
# Hold out 25% of the rows as the test set; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.25)

In [372]:
# Inspect the shapes of the four splits, in the order:
# X_train, X_test, y_train, y_test
for split_array in (X_train, X_test, y_train, y_test):
    print(split_array.shape)

(668, 4)
(223, 4)
(668, 1)
(223, 1)


### Creating a bagged decision tree and testing it against the base decision tree classifier

In [373]:
# Baseline model: a single decision tree limited to a depth of 7
base = DecisionTreeClassifier(max_depth = 7)

# Bagging ensemble of 10 such trees, to test for improved accuracy.
# The estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases; the
# positional form is accepted under both names.
ensemble = BaggingClassifier(base, n_estimators = 10, random_state = 7)

# y_train is a column vector; ravel() flattens it to the 1-D shape fit() expects
base.fit(X_train, y_train.ravel()) # fitting base model
ensemble.fit(X_train, y_train.ravel()) # fitting ensemble (bagging) model

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=None, oob_score=False,
         random_state=7, verbose=0, warm_start=False)

In [374]:
# Compare test-set accuracy of the single tree and the bagging ensemble
for label, model in (("base", base), ("ensemble", ensemble)):
    print("Accuracy " + label + ":", model.score(X_test, y_test))

Accuracy base: 0.7982062780269058
Accuracy ensemble: 0.8071748878923767


The bagging ensemble (10 estimators, `random_state=7`, each tree limited to the same max depth of 7 as the base model) scores about 0.9 percentage points higher on the test set than the single base decision tree (0.807 vs 0.798, roughly a 1.1% relative increase).

### Creating a random forest decision tree classifier and testing vs base decision tree classifier

In [375]:
# Random forest of 100 trees, each limited to a max depth of 7, with a fixed seed
ensemble_rf = RandomForestClassifier(
    max_depth = 7,
    n_estimators = 100,
    random_state = 7,
)
ensemble_rf.fit(X_train, y_train.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=7, verbose=0, warm_start=False)

In [376]:
# Compare test-set accuracy of the single tree and the random forest
for label, model in (("base", base), ("ensemble", ensemble_rf)):
    print("Accuracy " + label + ":", model.score(X_test, y_test))

Accuracy base: 0.7982062780269058
Accuracy ensemble: 0.8161434977578476


The random forest (100 estimators, `random_state=7`, same max depth of 7 as the base model) scores about 1.8 percentage points higher on the test set than the base decision tree (0.816 vs 0.798, roughly a 2.2% relative increase).

### Creating an adaboost tree classifier and testing vs base decision tree classifier

In [377]:
# AdaBoost ensemble of 100 boosting rounds built on the depth-7 base tree
ensemble_adb = AdaBoostClassifier(
    base,
    n_estimators = 100,
    random_state = 2,
)
ensemble_adb.fit(X_train, y_train.ravel())

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=100, random_state=2)

In [378]:
# Compare test-set accuracy of the single tree and the AdaBoost ensemble
for label, model in (("base", base), ("ensemble", ensemble_adb)):
    print("Accuracy " + label + ":", model.score(X_test, y_test))

Accuracy base: 0.7982062780269058
Accuracy ensemble: 0.7623318385650224


Accuracy is lower than the base model (0.762 vs 0.798), even with 100 estimators and `random_state=2`. This might indicate that the AdaBoost classifier is overfitting the training data.

### picking a model above and tuning between different n_estimators and max_depth levels

In [379]:
# creating function to determine different accuracies for different n_estimators and max depth levels 
def rf_classifier(X, Y, x, y, n, d, random_state=None):
    """Fit a random forest on (X, Y) and return its accuracy on (x, y).

    Parameters
    ----------
    X : training features, passed to ``fit`` unchanged.
    Y : training labels; must support ``.ravel()`` and is flattened before fitting.
    x : evaluation features, passed to ``score`` unchanged.
    y : evaluation labels, passed to ``score`` unchanged.
    n : number of trees; converted with ``int()``.
    d : maximum depth of each tree; converted with ``int()``.
    random_state : optional seed forwarded to ``RandomForestClassifier``.
        The default ``None`` keeps the original unseeded behaviour, so results
        can differ between runs; pass an integer for repeatable scores.

    Returns
    -------
    The value returned by ``rf.score(x, y)``. The line "accuracy score is:"
    is printed before returning.
    """
    rf = RandomForestClassifier(
        n_estimators = int(n),
        max_depth = int(d),
        random_state = random_state,
    )
    rf.fit(X, Y.ravel())

    # Local is not called `accuracy_score`, to avoid shadowing the
    # sklearn.metrics function of that name imported by this notebook.
    test_accuracy = rf.score(x, y)
    print("accuracy score is:")
    return test_accuracy

##### Calling the function a few times below to see what accuracy I get when tinkering with different max depth levels and n_estimators values

In [380]:
rf_classifier(X_train, y_train, X_test, y_test, n=5, d=1) # 5 estimators, max depth of 1

accuracy score is:


0.7533632286995515

In [381]:
rf_classifier(X_train, y_train, X_test, y_test, n=10, d=2) # 10 estimators, max depth of 2

accuracy score is:


0.7533632286995515

In [382]:
rf_classifier(X_train, y_train, X_test, y_test, n=100, d=3) # 100 estimators, max depth of 3

accuracy score is:


0.7982062780269058

In [383]:
rf_classifier(X_train, y_train, X_test, y_test, n=50, d=4) # 50 estimators, max depth of 4

accuracy score is:


0.7982062780269058

In [384]:
rf_classifier(X_train, y_train, X_test, y_test, n=25, d=5) # 25 estimators, max depth of 5

accuracy score is:


0.8026905829596412

In [385]:
rf_classifier(X_train, y_train, X_test, y_test, n=100, d=5) # 100 estimators, max depth of 5

accuracy score is:


0.8026905829596412

In [386]:
rf_classifier(X_train, y_train, X_test, y_test, n=100, d=4) # 100 estimators, max depth of 4

accuracy score is:


0.8071748878923767

#### Based on all the models above, the initial random forest model with 100 n_estimators and a max depth of 7 had the best accuracy. However, the values tend to change each time I rerun the notebook (the `rf_classifier` calls above do not fix a `random_state`), so on some runs those calls might show higher accuracies than the initial random forest model.