# Titanic Prediction with Cross Validation

In [1]:
#Data Analysis
import pandas as pd
import numpy as np

#Machine learning models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

## Aqcuering the dataset 

In [2]:
train = pd.read_csv("../input/titanic/train.csv", index_col = 'PassengerId')

Lets take a look at the stucture of our dataset

In [3]:
print(train.shape)
print('-'*40)
print(train.info())
print('-'*50)
print(train.describe())
print('-'*50)
print(train.describe(include=['O']))
print('-'*50)
print(train.head(5))

(891, 11)
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
None
--------------------------------------------------
         Survived      Pclass         Age       SibSp       Parch        Fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381

**What can we observe from our dataset ?**

* We have 891 observations and 11 variables
* We have 6 quantitatives and 5 qualitatives
* Variables _Age_,_Cabin_ and _Embarked_ seemed to be having missing values.
* Ticket Variable seems to be less usefull and may be droped


**Missing values detection and imputation**

In [4]:
(train.isnull().sum().sort_values(ascending=False)*100)/train.shape[0]

Cabin       77.104377
Age         19.865320
Embarked     0.224467
Survived     0.000000
Pclass       0.000000
Name         0.000000
Sex          0.000000
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.000000
dtype: float64

* _Cabin_ variable contains **77.10%** of missing values so its can be said to be useless for us, so it will droped.
* _Age_ variable contains **19.87%** of missing values and so can be imputed using a chosen method
* Lastly the _Embarked_ variable contain less than **1%** of missing values so can easily be imputed using a choosen methode.

In [5]:
#deleting of un needed columns
train.drop(['Cabin','Ticket','Name'], axis=1 ,inplace=True)

In [6]:
#Imputing missing values for Age variable with the mean

train.Age = train.Age.fillna(train.Age.median())

In [7]:
#Imputing missing values for Embarked variable with the most frequent

train = train.apply(lambda x:x.fillna(x.value_counts().index[0]))

Now we can passe to the the preprocessing stage

### Transforming the qualitatives variables into quantitative


In [8]:
train['Sex'] = train['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
train['Embarked'] = train['Embarked'].map( {'Q': 0, 'C': 1,'S':2} ).astype(int)
train.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,0,22.0,1,0,7.25,2
2,1,1,1,38.0,1,0,71.2833,1
3,1,3,1,26.0,0,0,7.925,2
4,1,1,1,35.0,1,0,53.1,2
5,0,3,0,35.0,0,0,8.05,2


In [9]:
X = train.drop('Survived',axis=1)
y = train['Survived']

## K-Fold Cross Validation Methode

**Decision Tree**

In [10]:
#Decision Tree
Dt = DecisionTreeClassifier()
scores_Dt = cross_val_score(Dt, X, y, cv = 8)
print(scores_Dt.mean(), scores_Dt.std())

0.7857343951093951 0.049070364795654756


**Logistique Regression**

In [11]:
#Logistique regression
glm = LogisticRegression(max_iter=300)
scores_glm = cross_val_score(glm, X, y, cv = 8)
print(scores_glm.mean(), scores_glm.std())

0.7957388191763193 0.030352610644971884


**Gradient Boosting Classifier**

In [12]:
#
grd = GradientBoostingClassifier(n_estimators=300)
scores_grd = cross_val_score(grd, X, y, cv = 8)
print(scores_grd.mean(), scores_grd.std())

0.8283059845559846 0.03268820370907079


**Random Forest Classifier**

In [13]:
#Random Forest Classifier
rand = RandomForestClassifier(n_estimators=300)
scores_rand = cross_val_score(rand, X, y, cv = 6)
print(scores_rand.mean(), scores_rand.std())

0.8159426204728218 0.02729052769943232


**Neural Network**

In [14]:
#Neural Network
mlp = MLPClassifier(max_iter=300)
scores_mlp = cross_val_score(mlp, X, y, cv = 8)
print(scores_mlp.mean(), scores_mlp.std())



0.8114241473616474 0.02821914773460174


KNN

In [15]:
#KNN
knn = KNeighborsClassifier()
scores_knn = cross_val_score(knn, X, y, cv = 8)
print(scores_knn.mean(), scores_knn.std())

0.6937238577863578 0.04246311865876194


**Gradient Boosting Classifier (82%) outperforms the other four models on all performance measures.**

## **Thanks and see you Next time!!!!** :-)