# Titanic: Machine Learning from Disaster
### [https://www.kaggle.com/c/titanic/overview](http://)

**The purpose of this notebook is to demonstrate the implementation of different ensembling algorithms on sklearn**

### Initially the classifiers below were used for training:

* Random Forest Classifier 
* Extra Trees Classifier 
* K Neighbors Classifier 
* Support Vector Classification 
* Ridge Classifier 


1. Bagging Classifier - Then, above classifiers were used as a base estimator in bagging ensemble

2. Voting Classifier - in the next part, combination of the above models were used as estimators in the voting classifier (hard voting)

3. XGboost - Finally, XGboost ensemble was used seperately for training


In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['train.csv', 'gender_submission.csv', 'test.csv']


In [2]:
import pandas as pd
import numpy as np
import sklearn
import re
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier,BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import cross_val_score

In [3]:
# load the data
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
PassengerId = test['PassengerId']

### Preprocessing

In [4]:
# check if there are any missing values
train['Has_Cabin'] = ~train["Cabin"].isnull()
test['Has_Cabin'] = ~test["Cabin"].isnull()

In [5]:
full_data = [train, test]

In [6]:
# Create new feature FamilySize as a combination of SibSp and Parch
for dataset in full_data:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# Create new feature IsAlone from FamilySize
for dataset in full_data:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
# Remove all NULLS in the Embarked column
for dataset in full_data:
    dataset['Embarked'] = dataset['Embarked'].fillna('S')
# Remove all NULLS in the Fare column and create a new feature CategoricalFare
for dataset in full_data:
    dataset['Fare'] = dataset['Fare'].fillna(train['Fare'].median())
train['CategoricalFare'] = pd.qcut(train['Fare'], 4)

In [7]:
# Create a New feature CategoricalAge
for dataset in full_data:
    age_avg = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    age_null_count = dataset['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    dataset['Age'][np.isnan(dataset['Age'])] = age_null_random_list
    dataset['Age'] = dataset['Age'].astype(int)
train['CategoricalAge'] = pd.cut(train['Age'], 5)

In [8]:
# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""
# Create a new feature Title, containing the titles of passenger names
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
# Group all non-common titles into one single grouping "Rare"
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')# Define function to extract titles from passenger names
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""
# Create a new feature Title, containing the titles of passenger names
for dataset in full_data:
    dataset['Title'] = dataset['Name'].apply(get_title)
# Group all non-common titles into one single grouping "Rare"
for dataset in full_data:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')

In [9]:
for dataset in full_data:
    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    
    # Mapping titles
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
    
    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
    
    # Mapping Fare
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
    
    # Mapping Age
    dataset.loc[ dataset['Age'] <= 16, 'Age']  = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4 ;

In [10]:
drop_elements = ['Name', 'Ticket', 'Cabin', 'SibSp']
train = train.drop(drop_elements, axis = 1)
test  = test.drop(drop_elements, axis = 1)
train = train.drop(['CategoricalAge', 'CategoricalFare'], axis = 1)

X_train = train.drop(labels = ['Survived'], axis=1).set_index('PassengerId')
y_train = train.set_index('PassengerId')['Survived']
X_test = test.set_index('PassengerId')

In [11]:
X_train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Parch,Fare,Embarked,Has_Cabin,FamilySize,IsAlone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,1,1,0,0,0,False,2,0,1
2,1,0,2,0,3,1,True,2,0,3
3,3,0,1,0,1,0,False,1,1,2
4,1,0,2,0,3,0,True,2,0,3
5,3,1,2,0,1,0,False,1,1,1


In [12]:
X_test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Parch,Fare,Embarked,Has_Cabin,FamilySize,IsAlone,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,1,2,0,0,2,False,1,1,1
893,3,0,2,0,0,0,False,2,0,3
894,2,1,3,0,1,2,False,1,1,1
895,3,1,1,0,1,0,False,1,1,1
896,3,0,1,1,1,0,False,3,0,3


In [13]:
print(X_train.shape)
print(X_test.shape)

(891, 10)
(418, 10)


# 1. Random Forest Classifier

### Random Forest Classifier - Simple

Result on Kaggle test: % **77.51**

In [14]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
print("Accuracy on train data: % {:.2f}".format(rf.score(X_train, y_train)*100))
# cv score for Random Forest Classifier
cv_scores = cross_val_score(rf, X_train, y_train, cv=10, n_jobs=-1)

print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]"  
                   .format(rf.__class__.__name__, 
                   cv_scores.mean(), cv_scores.std()))

submission_df = pd.read_csv("../input/gender_submission.csv",index_col=0)
submission_df['Survived'] = rf.predict(X_test)

Accuracy on train data: % 89.79
Mean of: 0.810, std: (+/-) 0.037 [RandomForestClassifier]


### Random Forest Classifier as base estimator for Bagging Classifier
Result on Kaggle test: % **77.51**

In [15]:
rf = RandomForestClassifier()
rf_bag = BaggingClassifier(rf, max_samples=0.4, max_features=10)
rf_bag.fit(X_train, y_train)
print("Accuracy on train data: % {:.2f}".format(rf_bag.score(X_train, y_train)*100))

## cv_score for rf_bag
bagging_scores = cross_val_score(rf_bag, X_train, y_train, cv=10, n_jobs=-1)
print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n"
                       .format(rf_bag.__class__.__name__, 
                        bagging_scores.mean(), bagging_scores.std()))

submission_df = pd.read_csv("../input/gender_submission.csv",index_col=0)
submission_df['Survived'] = rf_bag.predict(X_test)

Accuracy on train data: % 87.43
Mean of: 0.814, std: (+/-) 0.040 [Bagging BaggingClassifier]



# 2. Extra Trees Classifier

### Extra Trees Classifier - Simple

Result on Kaggle test: % **75.59**

In [16]:
et = ExtraTreesClassifier()
et.fit(X_train, y_train)
print("Accuracy on train data: % {:.2f}".format(et.score(X_train, y_train)*100))
# cv score for Random Forest Classifier
cv_scores = cross_val_score(et, X_train, y_train, cv=10, n_jobs=-1)

print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]"  
                   .format(et.__class__.__name__, 
                   cv_scores.mean(), cv_scores.std()))

submission_df = pd.read_csv("../input/gender_submission.csv",index_col=0)
submission_df['Survived'] = et.predict(X_test)


Accuracy on train data: % 89.90
Mean of: 0.805, std: (+/-) 0.037 [ExtraTreesClassifier]


### Extra Trees Classifier as base estimator for Bagging Classifier
Result on Kaggle test: % **75.59**

In [17]:
et = ExtraTreesClassifier()
et_bag = BaggingClassifier(et, max_samples=0.4, max_features=10)
et_bag.fit(X_train, y_train)
print("Accuracy on train data: % {:.2f}".format(et_bag.score(X_train, y_train)*100))

## cv_score for rf_bag
bagging_scores = cross_val_score(et_bag, X_train, y_train, cv=10, n_jobs=-1)
print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n"
                       .format(et_bag.__class__.__name__, 
                        bagging_scores.mean(), bagging_scores.std()))

submission_df = pd.read_csv("../input/gender_submission.csv",index_col=0)
submission_df['Survived'] = et_bag.predict(X_test)

Accuracy on train data: % 87.99
Mean of: 0.810, std: (+/-) 0.030 [Bagging BaggingClassifier]



# 3. K Neighbors Classifier

### K Neighbors Classifier - Simple

Result on Kaggle test: % ****

In [18]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print("Accuracy on train data: % {:.2f}".format(knn.score(X_train, y_train)*100))
# cv score 
cv_scores = cross_val_score(knn, X_train, y_train, cv=10, n_jobs=-1)

print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]"  
                   .format(knn.__class__.__name__, 
                   cv_scores.mean(), cv_scores.std()))

submission_df = pd.read_csv("../input/gender_submission.csv",index_col=0)
submission_df['Survived'] = knn.predict(X_test)


Accuracy on train data: % 84.85
Mean of: 0.786, std: (+/-) 0.048 [KNeighborsClassifier]


### K Neighbors Classifier as base estimator for Bagging Classifier
Result on Kaggle test: % **76.07**

In [19]:
knn = KNeighborsClassifier()
knn_bag = BaggingClassifier(knn, max_samples=0.4, max_features=10)
knn_bag.fit(X_train, y_train)
print("Accuracy on train data: % {:.2f}".format(knn_bag.score(X_train, y_train)*100))

## cv_score for rf_bag
bagging_scores = cross_val_score(knn_bag, X_train, y_train, cv=10, n_jobs=-1)
print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n"
                       .format(knn_bag.__class__.__name__, 
                        bagging_scores.mean(), bagging_scores.std()))

submission_df = pd.read_csv("../input/gender_submission.csv",index_col=0)
submission_df['Survived'] = knn_bag.predict(X_test)

Accuracy on train data: % 83.73
Mean of: 0.818, std: (+/-) 0.024 [Bagging BaggingClassifier]



# 4. Support Vector Classification

### Support Vector Classification - Simple

Result on Kaggle test: % ****

In [20]:
svc = SVC()
svc.fit(X_train, y_train)
print("Accuracy on train data: % {:.2f}".format(svc.score(X_train, y_train)*100))
# cv score 
cv_scores = cross_val_score(svc, X_train, y_train, cv=10, n_jobs=-1)

print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]"  
                   .format(svc.__class__.__name__, 
                   cv_scores.mean(), cv_scores.std()))

submission_df = pd.read_csv("../input/gender_submission.csv",index_col=0)
submission_df['Survived'] = svc.predict(X_test)


Accuracy on train data: % 83.50
Mean of: 0.829, std: (+/-) 0.031 [SVC]


### Support Vector Classification as base estimator for Bagging Classifier
Result on Kaggle test: % ****

In [21]:
svc = SVC()
svc_bag = BaggingClassifier(knn, max_samples=0.4, max_features=10)
svc_bag.fit(X_train, y_train)
print("Accuracy on train data: % {:.2f}".format(svc_bag.score(X_train, y_train)*100))

## cv_score for rf_bag
bagging_scores = cross_val_score(svc_bag, X_train, y_train, cv=10, n_jobs=-1)
print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n"
                       .format(svc_bag.__class__.__name__, 
                        bagging_scores.mean(), bagging_scores.std()))

submission_df = pd.read_csv("../input/gender_submission.csv",index_col=0)
submission_df['Survived'] = svc_bag.predict(X_test)

Accuracy on train data: % 84.40
Mean of: 0.813, std: (+/-) 0.035 [Bagging BaggingClassifier]



# 5. Ridge Classifier

### Ridge Classifier - Simple

Result on Kaggle test: % ****

In [22]:
rg = RidgeClassifier()

rg.fit(X_train, y_train)
print("Accuracy on train data: % {:.2f}".format(rg.score(X_train, y_train)*100))
# cv score 
cv_scores = cross_val_score(rg, X_train, y_train, cv=10, n_jobs=-1)

print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]"  
                   .format(rg.__class__.__name__, 
                   cv_scores.mean(), cv_scores.std()))

submission_df = pd.read_csv("../input/gender_submission.csv",index_col=0)
submission_df['Survived'] = rg.predict(X_test)


Accuracy on train data: % 81.03
Mean of: 0.806, std: (+/-) 0.026 [RidgeClassifier]


### Ridge Classifier as base estimator for Bagging Classifier
Result on Kaggle test: % ****

In [23]:
rg = RidgeClassifier()
rg_bag = BaggingClassifier(rg, max_samples=0.4, max_features=10)
rg_bag.fit(X_train, y_train)
print("Accuracy on train data: % {:.2f}".format(rg_bag.score(X_train, y_train)*100))

## cv_score for rf_bag
bagging_scores = cross_val_score(rg_bag, X_train, y_train, cv=10, n_jobs=-1)
print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [Bagging {0}]\n"
                       .format(rg_bag.__class__.__name__, 
                        bagging_scores.mean(), bagging_scores.std()))

submission_df = pd.read_csv("../input/gender_submission.csv",index_col=0)
submission_df['Survived'] = rg_bag.predict(X_test)

Accuracy on train data: % 81.03
Mean of: 0.807, std: (+/-) 0.029 [Bagging BaggingClassifier]



# Voting Classifier
Soft Voting/Majority Rule classifier for unfitted estimators.
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html

### 1. Voting Ensemble of simple models

In [24]:
rf = RandomForestClassifier()
et = ExtraTreesClassifier()
knn = KNeighborsClassifier()
svc = SVC()
rg = RidgeClassifier()

In [25]:
eclf = VotingClassifier(estimators=[('Random Forests', rf), ('Extra Trees', et), ('KNeighbors', knn), ('SVC', svc), ('Ridge Classifier', rg)], voting='hard')
for clf, label in zip([rf, et, knn, svc, rg, eclf], ['Random Forest', 'Extra Trees', 'KNeighbors', 'SVC', 'Ridge Classifier', 'Ensemble']):
    scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.80 (+/- 0.04) [Random Forest]
Accuracy: 0.81 (+/- 0.04) [Extra Trees]
Accuracy: 0.79 (+/- 0.05) [KNeighbors]
Accuracy: 0.83 (+/- 0.03) [SVC]
Accuracy: 0.81 (+/- 0.03) [Ridge Classifier]
Accuracy: 0.82 (+/- 0.04) [Ensemble]


### 2. Voting Ensemble of bagging models

Kaggle test result: % **77.51** 

In [26]:
clf = [rf_bag, et_bag, knn_bag, svc_bag, rg_bag]
eclf = VotingClassifier(estimators=[('Random Forests Bag', rf_bag), ('Extra Trees Bag', et_bag), ('KNeighbors Bag', knn_bag), ('SVC Bag', svc_bag), ('Ridge Classifier Bag', rg_bag)], voting='hard')
for clf, label in zip([rf_bag, et_bag, knn_bag, svc_bag, rg_bag ,eclf], ['Random Forest Bag', 'Extra Trees Bag', 'KNeighbors Bag', 'SVC Bag', 'Ridge Classifier Bag', 'Ensemble Bag']):
    scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))


Accuracy: 0.81 (+/- 0.04) [Random Forest Bag]
Accuracy: 0.81 (+/- 0.04) [Extra Trees Bag]
Accuracy: 0.82 (+/- 0.03) [KNeighbors Bag]
Accuracy: 0.82 (+/- 0.03) [SVC Bag]
Accuracy: 0.80 (+/- 0.03) [Ridge Classifier Bag]
Accuracy: 0.82 (+/- 0.03) [Ensemble Bag]


In [27]:
# predict and create submission
eclf = VotingClassifier(estimators=[('Random Forests Bag', rf_bag), ('Extra Trees Bag', et_bag), ('KNeighbors Bag', knn_bag), ('SVC Bag', svc_bag), ('Ridge Classifier Bag', rg_bag)], voting='hard')
eclf.fit(X_train, y_train)

submission_df = pd.read_csv("../input/gender_submission.csv",index_col=0)
submission_df['Survived'] = eclf.predict(X_test)

# XGboost

Kaggle test result: % **78.46** **

In [28]:
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}


dtrain = xgb.DMatrix(X_train, label=y_train)
xg = xgb.train(param, dtrain, 10)
dtest = xgb.DMatrix(X_test)
xgbpreds = xg.predict(dtest)
xgbpreds = xgbpreds > 0.5
xgbpreds = xgbpreds.astype(int)

print("Mean of: {1:.3f}, std: (+/-) {2:.3f} [{0}]"  
                   .format(xg.__class__.__name__, 
                   cv_scores.mean(), cv_scores.std()))

submission_df = pd.read_csv("../input/gender_submission.csv",index_col=0)
submission_df['Survived'] = xgbpreds

Mean of: 0.806, std: (+/-) 0.026 [Booster]


In [29]:
# import the modules we'll need
from IPython.display import HTML
import pandas as pd
import numpy as np
import base64

# function that takes in a dataframe and creates a text link to  
# download it (will only work for files < 2MB or so)
def create_download_link(df, title = "Download CSV file", filename = "data.csv"):  
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

# create a random sample dataframe
df = pd.DataFrame(np.random.randn(50, 4), columns=list('ABCD'))

# create a link to download the dataframe
create_download_link(submission_df, filename = "titanic.csv")

# ↓ ↓ ↓  Yay, download link! ↓ ↓ ↓ 