# Siking of the Titanic

In [1]:
import numpy as np 
import tensorflow as tf 
import matplotlib.pyplot as plt 
import pandas as pd 
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('data_titanic.csv')

In [3]:
data

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,passenger_class,passenger_sex,passenger_survived
0,1,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.2500,,S,Lower,M,N
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,C,Upper,F,Y
2,3,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.9250,,S,Lower,F,Y
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1000,C123,S,Upper,F,Y
4,5,"Allen, Mr. William Henry",35.0,0,0,373450,8.0500,,S,Lower,M,N
5,6,"Moran, Mr. James",,0,0,330877,8.4583,,Q,Lower,M,N
6,7,"McCarthy, Mr. Timothy J",54.0,0,0,17463,51.8625,E46,S,Upper,M,N
7,8,"Palsson, Master. Gosta Leonard",2.0,3,1,349909,21.0750,,S,Lower,M,N
8,9,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",27.0,0,2,347742,11.1333,,S,Lower,F,Y
9,10,"Nasser, Mrs. Nicholas (Adele Achem)",14.0,1,0,237736,30.0708,,C,Middle,F,Y


## Prepare data

Identify null columns

In [4]:
data.columns[data.isna().any()].tolist()

['Age', 'Cabin', 'Embarked']

Specific characteristics are eliminated

In [5]:
data =data.drop(columns=["PassengerId","Name","Ticket"])

In [6]:
data['Cabin'].value_counts()

G6             4
B96 B98        4
C23 C25 C27    4
E101           3
F2             3
D              3
C22 C26        3
F33            3
E121           2
D20            2
B77            2
B28            2
D35            2
D33            2
C83            2
C52            2
B22            2
B58 B60        2
C68            2
D17            2
B5             2
D26            2
B51 B53 B55    2
B35            2
C93            2
E8             2
B20            2
B49            2
C125           2
C65            2
              ..
B101           1
D47            1
B73            1
D56            1
C111           1
D19            1
E36            1
C30            1
F38            1
D6             1
A10            1
E34            1
D10 D12        1
C110           1
B94            1
D45            1
C106           1
B82 B84        1
F E69          1
C7             1
B19            1
C62 C64        1
E46            1
B3             1
E17            1
A36            1
C90            1
B79           

In [7]:
print("percent of NaN in Cabine",data['Cabin'].isnull().sum().sum()/len(data))

percent of NaN in Cabine 0.7710437710437711


Cabin could be a good feature, but we don't have enough data to take this feature

In [8]:
data =data.drop(columns="Cabin")

We are going to generate different ages for NaN values

In [9]:
data=data.fillna({'Age':int(np.random.normal(data["Age"].mean(), data["Age"].std()))})
data["Age"]=data["Age"].astype('int64')

With SibSp and Parch columns we are going to create a new column Alone and then we are going to remove unnecessary columns

In [10]:
data['Travelers']=data["SibSp"]+data["Parch"]
data['Alone']=np.where(data['Travelers']>0, 0, 1)
data.drop('SibSp', axis=1, inplace=True)
data.drop('Parch', axis=1, inplace=True)
data.drop('Travelers', axis=1, inplace=True)

We are going to replace NaN Embarked values by mode

In [11]:
data["Embarked"].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [12]:
data["Embarked"] = data["Embarked"].fillna("S")

We are going to convert categorical sex column to codes M:1, F:0 and passenger_survived N:1, Y:0

In [13]:
data["passenger_sex"]=data["passenger_sex"].astype('category').cat.codes
data["passenger_survived"]=data["passenger_survived"].astype('category').cat.codes

We can make feature engineering to detect if a person is adult or kid, It's an important feature because the kids had priority to be saved

In [14]:
data['Adult']=np.where(data['Age']<=16, 0, 1)

At least, we are going to create one hot encoding to categorical columns: Embarked and passenger_class

In [15]:
data = pd.get_dummies(data, columns=["passenger_class"])
data = pd.get_dummies(data, columns=["Embarked"])

## Split data

In [16]:
X = data.loc[:, data.columns != 'passenger_survived']
Y = data['passenger_survived']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [18]:
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.20, random_state=42)

## Metrics of evaluation

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import precision_score

We are going to create a logger to save configuration of models

In [20]:
def logExperiment(config,y_validation,Y_hat):
    try:
        df = pd.read_csv('log_models.csv', index_col=0)
    except:
        df = pd.DataFrame([], columns = ['Config' , 'Accuracy', 'Error' , 'Precision','Recall','F1'])
    
    df.append({'Config' : config , 
                  'Accuracy' : accuracy_score(y_validation, Y_hat), 
                  'Error':mean_squared_error(y_validation, Y_hat), 
                  'Precision':precision_score(y_validation, Y_hat),
                  'Recall':recall_score(y_validation, Y_hat),
                  'F1':f1_score(y_validation, Y_hat)},ignore_index=True).to_csv("log_models.csv", sep=',', encoding='utf-8')

## Decision Tree

In [21]:
from sklearn import tree
import graphviz

In [22]:
def trainDecisionTree(criterion="gini",max_depth=None, max_features=None,max_leaf_nodes=None):
    treeClassifier = tree.DecisionTreeClassifier(criterion=criterion,max_depth=max_depth,max_features=max_features,max_leaf_nodes = max_leaf_nodes)
    treeClassifier.fit(X_train, y_train)
    Y_hat = treeClassifier.predict(X_validation)
    logExperiment("decisionTree_criterio="+str(criterion)+"_max_depth="+str(max_depth)+"_max_features="+str(max_features)+"_max_leaf_nodes="+str(max_leaf_nodes),y_validation,Y_hat)
    return treeClassifier

In [54]:
treeClassifier = trainDecisionTree()

ValueError: could not convert string to float: 'passenger_class_Upper'

## Naive Bayes

In [24]:
TBayes = {}

In [25]:
copyX = X_train
bins = [0,14,30,50,80,100]
labels=[0,1,2,3,4]
copyX['Age'] = pd.cut(copyX['Age'], bins=bins, labels=labels, include_lowest=True)
binsFare = [0,100,200,300,400,500,600,700]
labelsFare =[0,1,2,3,4,5,6]
copyX['Fare'] = pd.cut(copyX['Fare'], bins=binsFare, labels=labelsFare, include_lowest=True)
index = y_train == 1
yesX = copyX[index]
noX = copyX[-index]
Pyes = len(yesX)/len(copyX)
Pno = len(noX)/len(copyX)

In [26]:
def generateTuple(key):
    TBayes[key] = copyX[key].value_counts()/len(copyX)
    TBayes["Y"+key] = yesX[key].value_counts()/len(yesX)
    TBayes["N"+key] = noX[key].value_counts()/len(noX)

In [27]:
generateTuple("Fare")
generateTuple("Age")
generateTuple("passenger_sex")
generateTuple("Alone")
generateTuple("Adult")

In [28]:
def generateClassTuple(dataset,key):
    nClassLower = sum(dataset["passenger_class_Lower"])
    nClassMiddle = sum(dataset["passenger_class_Middle"])
    nClassUpper = sum(dataset["passenger_class_Upper"])
    n = nClassLower + nClassUpper + nClassMiddle
    probX = {"passenger_class_Lower": (nClassLower/n),"passenger_class_Middle": (nClassMiddle/n),"passenger_class_Upper": (nClassUpper/n)}
    TBayes[key] = probX

In [29]:
def generateEmbarkedTuple(dataset,key):
    nClassLower = sum(dataset["Embarked_C"])
    nClassMiddle = sum(dataset["Embarked_Q"])
    nClassUpper = sum(dataset["Embarked_S"])
    n = nClassLower + nClassUpper + nClassMiddle
    probX = {"Embarked_C": (nClassLower/n),"Embarked_Q": (nClassMiddle/n),"Embarked_S": (nClassUpper/n)}
    TBayes[key] = probX

In [30]:
generateClassTuple(copyX,"passenger_class")
generateClassTuple(yesX,"Ypassenger_class")
generateClassTuple(noX,"Npassenger_class")

In [31]:
generateEmbarkedTuple(copyX,"Embarked")
generateEmbarkedTuple(yesX,"YEmbarked")
generateEmbarkedTuple(noX,"NEmbarked")

In [32]:
def bayesPredictionFunction(fare,age,passenger_sex,Alone,Adult,passenger_class,embarked):
    pYX = (Pyes*TBayes["YFare"][fare]*TBayes["YAge"][age]*TBayes["Ypassenger_sex"][passenger_sex]*TBayes["YAlone"][Alone]*TBayes["YAdult"][Adult]*TBayes["YEmbarked"][embarked]*TBayes["Ypassenger_class"][passenger_class])
    pNX = (Pyes*TBayes["NFare"][fare]*TBayes["NAge"][age]*TBayes["Npassenger_sex"][passenger_sex]*TBayes["NAlone"][Alone]*TBayes["NAdult"][Adult]*TBayes["NEmbarked"][embarked]*TBayes["Npassenger_class"][passenger_class])
    return pYX/(pYX+pNX)

In [85]:
def makeBayesPredictions(dataset):
    temp = dataset
    bins = [0,14,30,50,80,100]
    labels=[0,1,2,3,4]
    temp['Age'] = pd.cut(temp['Age'], bins=bins, labels=labels, include_lowest=True)
    binsFare = [0,100,200,300,400,500,600,700]
    labelsFare =[0,1,2,3,4,5,6]
    temp['Fare'] = pd.cut(temp['Fare'], bins=binsFare, labels=labelsFare, include_lowest=True)
    temp['passenger_class'] = dataset.iloc[:,5:8].idxmax(axis=1)
    temp['Embarked'] = dataset.iloc[:,8:11].idxmax(axis=1)
    predictions = []
    for index, row in temp.iterrows():
        predictions.append(bayesPredictionFunction(row['Fare'], row['Age'], row['passenger_sex'], row['Alone'], row['Adult'], row['passenger_class'], row['Embarked']))
    predictions = np.where(np.array(predictions) < 0.5, 0, 1)
    logExperiment("bayesPrediction",y_validation,predictions)
    return predictions.astype("int8")

In [86]:
makeBayesPredictions(X_validation)

array([1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1], dtype=int8)

## SVM

In [35]:
from sklearn import svm

In [56]:
def trainingSVM(kernel="linear"):
    clf = svm.SVC(kernel='linear')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_validation.iloc[:,:11])
    logExperiment("SVM_kernel="+kernel,y_validation,y_pred)
    return clf

In [57]:
SVMClassifier = trainingSVM()

## Ensamble models

In [93]:
def ensamble(data_to_predic):
    Y_hat = pd.DataFrame(np.column_stack((makeBayesPredictions(data_to_predic),
                       treeClassifier.predict(data_to_predic.iloc[:,:11]),
                       SVMClassifier.predict(data_to_predic.iloc[:,:11])))).mode(axis=1)
    return accuracy_score(y_validation, Y_hat)

In [94]:
ensamble(X_validation)

0.6853146853146853