# Titanic Competition
You should build an end-to-end machine learning pipeline to predict survivors of the Titanic disaster and participate in the corresponding Kaggle competition. In particular, you should do the following:
- Read the Titanic competition page on [Kaggle](https://www.kaggle.com/competitions/titanic/overview).
- Load the `titanic` dataset using [Pandas](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html). You can find this dataset in the datasets folder.
- Split the dataset into training and test sets using [Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).
- Build an end-to-end machine learning pipeline, including all necessary steps, to have a running solution with some performance.
- Collaborate with your groupmates to finalize your pipeline by
    - reading the discussion forum to learn from other community members;
    - discussing the bottlenecks of your current solution;
    - running experiments on your pipeline;
    - improving the performance of your pipeline.
- Test the best pipeline on the test set and report various [evaluation metrics](https://scikit-learn.org/0.15/modules/model_evaluation.html).  
- Present your pipeline.
- Submit your predictions to Kaggle.

### Import libraries

In [169]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.metrics

### Import datasets

In [170]:
titanic_db = pd.read_csv('/Users/adolfomytr/Documents/Alemania/Master/GISMA/Materias/teaching-main/datasets/titanic.csv')
titanic_db.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Explore dataset

In [171]:
titanic_db.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [172]:
titanic_db.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [173]:
#Drop useless columns and null entries
titanic_db = titanic_db.drop(['Name', 'Ticket', 'Cabin'], axis=1)
titanic_db = titanic_db.set_index('PassengerId')
titanic_db = titanic_db.dropna()
titanic_db.head()



Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.25,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.925,S
4,1,1,female,35.0,1,0,53.1,S
5,0,3,male,35.0,0,0,8.05,S


### Divide into training and test set

In [174]:
titanic_train, titanic_test = sklearn.model_selection.train_test_split(titanic_db)

x_train = titanic_train.drop(['Survived'], axis=1)
x_test = titanic_test.drop(['Survived'], axis=1)
y_train = titanic_train['Survived']
y_test = titanic_test['Survived']

### Encode and standarize categorical and numerical variables

In [175]:
ct = ColumnTransformer(transformers=[
                                    ('cat', OneHotEncoder(), ['Sex', 'Embarked']),
                                    ('num', StandardScaler(), ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'])
                                    ])

x_train = ct.fit_transform(titanic_train)
x_test = ct.transform(titanic_test)

print('x_train', x_train.shape)
print('y_train', y_train.shape)
print('x_test', x_test.shape)
print('y_test', y_test.shape)


x_train (534, 10)
y_train (534,)
x_test (178, 10)
y_test (178,)


### Train the model

In [176]:
svm_grid = {'C':[0.01, 0.1, 1, 10], 'kernel':['linear', 'poly', 'rbf']}
knn_grid = {'n_neighbors': [2, 3, 4, 5, 6, 7], 'p':[1, 2]}
dt_grid = {'criterion':['gini', 'entropy', 'log_loss'], 'max_depth': [3, 4, 5, 6, 7, 8, 9]}
lr_grid = {'max_iter':[1000, 2000, 3000, 4000, 5000, 6000]}

In [177]:
grid_search_svm = GridSearchCV(SVC(), svm_grid, cv=5, error_score='raise')
grid_search_knn = GridSearchCV(KNeighborsClassifier(), knn_grid, cv=5, error_score='raise')
grid_search_dt = GridSearchCV(DecisionTreeClassifier(), dt_grid, cv=5, error_score='raise')
grid_search_lr = GridSearchCV(LogisticRegression(), lr_grid, cv=5, error_score='raise')

In [178]:
grid_search_svm.fit(x_train, y_train)
grid_search_knn.fit(x_train, y_train)
grid_search_dt.fit(x_train, y_train)
grid_search_lr.fit(x_train, y_train)

In [179]:
print('best_score SVM', grid_search_svm.best_score_)
print('best_params SVM', grid_search_svm.best_params_)
print('best_estimator SVM', grid_search_svm.best_estimator_)
print('')
print('best_score KNN', grid_search_knn.best_score_)
print('best_params KNN', grid_search_knn.best_params_)
print('best_estimator KNN', grid_search_knn.best_estimator_)
print('')
print('best_score DT', grid_search_dt.best_score_)
print('best_params DT', grid_search_dt.best_params_)
print('best_estimator DT', grid_search_dt.best_estimator_)
print('')
print('best_score LR', grid_search_lr.best_score_)
print('best_params LR', grid_search_lr.best_params_)
print('best_estimator LR', grid_search_lr.best_estimator_)

best_score SVM 0.8259037206841826
best_params SVM {'C': 1, 'kernel': 'rbf'}
best_estimator SVM SVC(C=1)

best_score KNN 0.8033327455475225
best_params KNN {'n_neighbors': 7, 'p': 1}
best_estimator KNN KNeighborsClassifier(n_neighbors=7, p=1)

best_score DT 0.8258508199612061
best_params DT {'criterion': 'entropy', 'max_depth': 3}
best_estimator DT DecisionTreeClassifier(criterion='entropy', max_depth=3)

best_score LR 0.7865455827896314
best_params LR {'max_iter': 1000}
best_estimator LR LogisticRegression(max_iter=1000)


### Test the model

In [180]:
y_predicted = grid_search_dt.predict(x_test)
accuracy = sklearn.metrics.accuracy_score(y_test, y_predicted)
print('Accuracy:', accuracy)

Accuracy: 0.7921348314606742
