# **Titanic - Machine Learning from Disaster**


# Table of Contents

* [1: Modules](#Modules)
* [2: Preparing Data](#PreparingData)
    * [2.1: Read Data](#ReadData)
    * [2.2: Missing Data Status](#MissedDataStatus)
    * [2.3: Data Labeling](#DataLabeling)
    * [2.4: Data Visualization](#DataVisualization)
    * [2.5: Imputer Design](#ImputerDesign)
    * [2.6: Encoder Design](#EncoderDesign)
    * [2.7: Drop Useless Features](#DropUselessFeatures)
* [3: Dimensionality Reduction](#DimensionalityReduction)
    * [3.1: Sensitivity Analysis](#SensitivityAnalysis)
        * [3.1.1: Decide on Effective Features](#DecideonEffectiveFeatures)
    * [3.2: tSNE Algorithm](#tSNEAlgorithm)
    * [3.3: UMAP Algorithm](#UMAPAlgorithm)
    
* [4: Machine Learning Design](#MachineLearning)
    * [4.1: Split Data](#Scaling)
    * [4.2: Cross Validation Prediction](#CrossValidationPrediction)
    * [4.3: Tuning Hyper Parameters](#TuningHyperParameters)
    * [4.4: Pipeline Design](#Pipeline)
    * [4.5: Prediction](#Prediction)
    


<a class="anchor" id="Modules"></a>
### **Modules** 

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
import math
from sklearn import svm, ensemble
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.metrics import RocCurveDisplay, PrecisionRecallDisplay
from sklearn.metrics import classification_report
from matplotlib.pyplot import figure
from sklearn.impute import SimpleImputer


### **Preparing Data** 
<a class="anchor" id="PreparingData"></a>


<a class="anchor" id="ReadData"></a>
#### **Read Data** 

In [None]:
# Read both Kaggle Titanic CSV files: train.csv -> mdata, test.csv -> tdata.
mdata, tdata = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/titanic/train.csv',
        '/kaggle/input/titanic/test.csv',
    )
)
# Preview the first rows of the training frame.
mdata.head()

In [None]:
# Show pandas' summary statistics for the training frame.
mdata.describe()

<a class="anchor" id="MissedDataStatus"></a>
#### **Missing Data Status** 

In [None]:
# Heatmap of the null mask of the training frame: each row is an observation,
# each column a feature, and highlighted cells are missing values.
fig, ax = plt.subplots(dpi=100)
sns.heatmap(mdata.isnull(), cmap="Greens", ax=ax)
ax.set_xlabel('Feature')
ax.set_ylabel('Observation')


<a class="anchor" id="DataLabeling"></a>
#### **Data Labeling**

In [None]:
# Upper bounds (inclusive) of the 'LowFare' and 'MidFar' fare bands.
# NOTE(review): these look like the 25th and 50th percentiles of the training
# fares - confirm before reusing them on a different data set.
_LOW_FARE_MAX = 7.910400
_MID_FARE_MAX = 14.454200

# Alternative honorifics folded into the main ones. 'Mme' (Madame) is the
# French equivalent of 'Mrs', not of 'Miss'.
_TITLE_ALIASES = {'Ms': 'Miss', 'Mlle': 'Miss', 'Mme': 'Mrs'}
_KNOWN_TITLES = ('Mr', 'Miss', 'Mrs', 'Master')


def _extract_title(name):
    """Return the honorific between the first ',' and the next '.' in *name*.

    The result is one of 'Mr', 'Miss', 'Mrs', 'Master' or 'Others'.
    """
    raw_title = name.partition(',')[2].partition('.')[0].strip()
    title = _TITLE_ALIASES.get(raw_title, raw_title)
    return title if title in _KNOWN_TITLES else 'Others'


def _age_group(age):
    """Map a numeric age to its group label; missing ages are returned as-is."""
    if pd.isna(age):
        return age
    if age <= 10:
        return 'KIDs'
    if age <= 20:
        return 'TEENs'
    if age <= 40:
        return 'YOUNGs'
    return 'OLDs'


def _fare_band(fare):
    """Map a fare to its band label; missing fares are returned as-is."""
    if pd.isna(fare):
        # A missing fare must stay missing: without this guard every comparison
        # below is False and the passenger would be labelled 'HighFare'.
        return fare
    if fare <= _LOW_FARE_MAX:
        return 'LowFare'
    if fare <= _MID_FARE_MAX:
        return 'MidFar'
    return 'HighFare'


def data_cleaner(data):
    """Add the engineered label columns to *data* and return it.

    The frame is modified in place; the returned object is the same frame.

    Reads the columns "Name", "Age", "SibSp", "Parch" and "Fare" and writes:

    * "Title"        - honorific taken from "Name", reduced to
                       'Mr' / 'Miss' / 'Mrs' / 'Master' / 'Others'.
    * "Age_group"    - 'KIDs' (<=10), 'TEENs' (<=20), 'YOUNGs' (<=40) or
                       'OLDs' (>40); missing ages stay missing.
    * "FamSize"      - "SibSp" + "Parch".
    * "FamSizeLabel" - 'Small' when "FamSize" <= 2, otherwise 'Large'.
    * "FareInter"    - 'LowFare' / 'MidFar' / 'HighFare' band of "Fare";
                       missing fares stay missing.
    """
    data["Title"] = data["Name"].apply(_extract_title)
    data["Age_group"] = data["Age"].apply(_age_group)

    data["FamSize"] = data["SibSp"] + data["Parch"]
    data["FamSizeLabel"] = data["FamSize"].apply(
        lambda size: 'Small' if int(size) <= 2 else 'Large')

    data["FareInter"] = data["Fare"].apply(_fare_band)

    return data

# Add the engineered columns to both frames; data_cleaner modifies each frame
# it receives, so the return value is not needed here.
for frame in (mdata, tdata):
    data_cleaner(frame)
mdata.head()

<a class="anchor" id="DataVisualization"></a>
#### **Data Visualization**

In [None]:
# Columns to plot, the column used for colouring, and the subplot grid shape.
xcol = ["Age_group", "Title", "Sex", "Embarked", "Pclass", "FamSizeLabel"]
huecol = "Survived"
r = 3
c = 2


def mybar(data, xcol, huecol, r, c):
    """Draw one stacked histogram per column of *xcol* on an r x c grid.

    Parameters
    ----------
    data : DataFrame holding every column named in *xcol* and *huecol*.
    xcol : iterable of column names, one subplot each.
    huecol : column whose values split (colour) every histogram.
    r, c : number of subplot rows and columns.

    Panels are filled column by column (top to bottom, then left to right),
    which for the 3 x 2 grid is the same placement as before.

    Raises
    ------
    ValueError
        If *xcol* names more columns than the grid has panels.
    """
    columns = list(xcol)
    if len(columns) > r * c:
        raise ValueError(
            f"{len(columns)} columns do not fit on a {r}x{c} subplot grid")

    sns.set_theme(style="ticks")
    # squeeze=False keeps `ax` two-dimensional even when r == 1 or c == 1.
    f, ax = plt.subplots(r, c, figsize=(18, 7), squeeze=False)
    sns.despine(f)
    for i, col in enumerate(columns):
        # Derive the panel position from the grid shape rather than from a
        # hard-coded row count.
        row, column = i % r, i // r
        sns.histplot(data, x=col, hue=huecol, multiple="stack", palette="light:m_r",
                     edgecolor=".3", linewidth=.5, log_scale=False, ax=ax[row, column])

<a class="anchor" id="ImputerDesign"></a>
#### **Imputer Design**

In [None]:
# Imputers
# Fill missing "Age_group" and "Embarked" values with the most frequent value
# of the training frame. The imputer is fitted on mdata only and then applied
# to both mdata and tdata, so the two frames are filled with the same value.
Imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
for feature in ("Age_group", "Embarked"):
    Imputer.fit(mdata[[feature]])
    mdata[[feature]] = Imputer.transform(mdata[[feature]])
    tdata[[feature]] = Imputer.transform(tdata[[feature]])
mybar(mdata, xcol, huecol, r, c)

<a class="anchor" id="EncoderDesign"></a>
#### **Encoder Design**

In [None]:
# Encoding
from sklearn.preprocessing import OneHotEncoder

# For each categorical source column: category value -> name of the indicator
# column that encodes it. OneHotEncoder emits its output columns in the order
# of `categories_` (sorted), so the names are looked up per category instead
# of being assigned by position; a positional list such as ["S", "Q", "C"]
# attaches the "S" name to the 'C' indicator.
# NOTE(review): the category values listed here are the ones expected in the
# Titanic frames - an unexpected value in mdata raises KeyError below.
one_hot_layout = {
    "Embarked": {"S": "S", "Q": "Q", "C": "C"},
    "Sex": {"female": "F", "male": "M"},
    "Age_group": {"KIDs": "Age1", "OLDs": "Age2", "TEENs": "Age3", "YOUNGs": "Age4"},
    "FamSizeLabel": {"Small": "Small", "Large": "Large"},
    "Pclass": {1: "C1", 2: "C2", 3: "C3"},
    "FareInter": {"HighFare": "F1", "LowFare": "F2", "MidFar": "F3"},
}

enc = OneHotEncoder(handle_unknown='ignore')
for source, names_by_category in one_hot_layout.items():
    # Fit on the training frame only; the same fitted encoder is then applied
    # to the test frame so both frames get identical indicator columns.
    enc.fit(mdata[[source]])
    indicator_names = [names_by_category[category] for category in enc.categories_[0]]
    for frame in (mdata, tdata):
        # handle_unknown='ignore': values not seen during fit are encoded as
        # an all-zero row instead of raising.
        frame[indicator_names] = pd.DataFrame(
            enc.transform(frame[[source]]).toarray(),
            index=frame.index,  # align on the frame's own index
            columns=indicator_names,
        )
mdata.head()

<a class="anchor" id="DropUselessFeatures"></a>
#### **Drop Useless Features**
Columns such as "Name" and "Cabin" carry no information that helps the model, so they are dropped. The categorical columns that were one-hot encoded above are dropped as well, because only their binary indicator columns are used from here on.

In [None]:

# Raw columns, intermediate engineered columns and the categorical sources of
# the one-hot indicators; none of them is used as a model input.
col = [
    "Age", "PassengerId", "Sex", "Name", "Cabin", "Ticket", "Embarked",
    "Title", "FamSize", "SibSp", "Parch", "Age_group", "Pclass",
    "FamSizeLabel", "Fare", "FareInter",
]
# NOTE(review): tdata_drp only keeps columns of tdata that are not listed
# above, i.e. it has features only if tdata carries the indicator columns too.
mdata_drp = mdata.drop(columns=col)
tdata_drp = tdata.drop(columns=col)
mdata_drp

<a class="anchor" id="DimensionalityReduction"></a>
## **Dimensionality Reduction**

<a class="anchor" id="SensitivityAnalysis"></a>
#### **Sensitivity Analysis**

In [None]:
# Annotated heatmap of the pairwise correlations between the remaining columns
# (the indicator features and "Survived").
myheatmap = plt.figure(figsize=(15, 8))
correlations = mdata_drp.corr()
sns.heatmap(correlations, annot=True)
plt.title("A", loc='left')

<a class="anchor" id="DecideonEffectiveFeatures"></a>
#### **Decide on Effective Features**

In [None]:
# Indicator columns used as model inputs, grouped by the feature they encode.
# NOTE(review): "Q" and "Age3" are not listed - presumably a deliberate choice
# based on the correlation heatmap; confirm.
var_col = [
    "S", "C",                  # embarkation indicators
    "F", "M",                  # sex indicators
    "Age1", "Age2", "Age4",    # age-group indicators
    "Small", "Large",          # family-size indicators
    "C1", "C2", "C3",          # passenger-class indicators
    "F1", "F2", "F3",          # fare-band indicators
]
# Target column.
fit_col = "Survived"
mdata_drp[var_col].head()


<a class="anchor" id="UMAPAlgorithm"></a>
#### **UMAP Algorithm**
Uniform Manifold Approximation and Projection (UMAP) is a dimension reduction technique that can be used for visualisation similarly to t-SNE, but also for general non-linear dimension reduction.

In [None]:
import umap

# Fit UMAP on the selected indicator features and plot the first two embedding
# dimensions, colouring every observation by its value in `fit_col`.
trans = umap.UMAP(n_neighbors=50, random_state=3).fit(mdata_drp[var_col])
palette = sns.color_palette()
point_colours = [palette[label] for label in mdata_drp[fit_col]]
plt.figure(figsize=(10, 5))
plt.scatter(
    trans.embedding_[:, 0],
    trans.embedding_[:, 1],
    s=10,
    c=point_colours,
)
plt.xlabel("Dim1")
plt.ylabel("Dim2")

<a class="anchor" id="MachineLearning"></a>
## **Machine Learning**

<a class="anchor" id="Scaling"></a>
#### **Split Data**

In [None]:
# Target as a flat 1-D array and the selected indicator columns as features.
y = np.reshape(mdata_drp[fit_col].values, (-1,))
features = mdata_drp[var_col]

# Keep 85 % of the rows for training; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    y,
    train_size=0.85,
    random_state=0,
)
# No scaling is applied in this cell. If a scaler is added later, fit it on
# x_train only and reuse the fitted scaler to transform x_test.

<a class="anchor" id="CrossValidationPrediction"></a>
#### **Cross Validation Prediction**

In [None]:
# Candidate classifiers, all with their default hyper-parameters.
estimators = [
    svm.SVC(),
    ensemble.RandomForestClassifier(),
    KNeighborsClassifier(),
    SGDClassifier(),
]


def run_compare(estis, cv=3):
    """Print cross-validated metrics on the training split for each estimator.

    Parameters
    ----------
    estis : iterable of unfitted scikit-learn classifiers.
    cv : number of cross-validation folds passed to ``cross_val_predict``.

    Reads the module-level ``x_train`` / ``y_train`` and prints, per estimator,
    the accuracy, the confusion matrix and the classification report of the
    out-of-fold predictions. Returns nothing.
    """
    for esti in estis:
        y_pred = cross_val_predict(esti, x_train, y_train, cv=cv)
        print(f"\nClassification Report for {esti}:")
        # scikit-learn metrics take (y_true, y_pred). With the arguments
        # swapped the confusion matrix is transposed and precision/recall
        # are exchanged in the report.
        print("Accuracy= ", accuracy_score(y_train, y_pred))
        print(confusion_matrix(y_train, y_pred))
        print(classification_report(y_train, y_pred))


run_compare(estimators, cv=3)

<a class="anchor" id="TuningHyperParameters"></a>
#### **Tuning Hyper Parameters**

<a class="anchor" id="Pipeline"></a>
#### **Pipeline Design**

<a class="anchor" id="Prediction"></a>
#### **Prediction**