## Import the libraries

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

## Read the data

In [2]:
df = pd.read_csv("titanic.csv")

In [3]:
df

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.2500
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.9250
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1000
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.0500
...,...,...,...,...,...,...,...,...
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0000
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0000
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.4500
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0000


## EDA

In [4]:
#shape
df.shape

(887, 8)

In [5]:
df.dtypes

Survived                     int64
Pclass                       int64
Name                        object
Sex                         object
Age                        float64
Siblings/Spouses Aboard      int64
Parents/Children Aboard      int64
Fare                       float64
dtype: object

In [6]:
df.describe()

Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
count,887.0,887.0,887.0,887.0,887.0,887.0
mean,0.385569,2.305524,29.471443,0.525366,0.383315,32.30542
std,0.487004,0.836662,14.121908,1.104669,0.807466,49.78204
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.25,0.0,0.0,7.925
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.1375
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
df.isnull().sum(axis = 0)

Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64

In [8]:
df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived')

Unnamed: 0,Pclass,Survived
2,3,0.244353
1,2,0.472826
0,1,0.62963


In [9]:
df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived')

Unnamed: 0,Sex,Survived
1,male,0.190227
0,female,0.742038


## Features transformation and selection

In [10]:
#The feature "sex" is string so we need to encode it 
df = pd.get_dummies(df, columns=["Sex"])

## Split the dataset in train, test and validation

I will use gridsearchCV with CV > 1 (cross validation) so i only take 10% in a validation set to try the best model

In [11]:
train, val = train_test_split(df, test_size=0.1)

I remove also the Name of the passenger and "Sex_male" as the information is already in "Sex_female"

In [12]:
X_train = train.drop(["Survived","Name", "Sex_male"], axis=1)
Y_train = train["Survived"]

In [13]:
X_val = val.drop(["Survived","Name", "Sex_male"], axis=1)
Y_val = val["Survived"]

## Logistic regression

In [14]:
param_grid_lr = [
    {'penalty': ['l1'], 'solver': ['liblinear'], 'C': [0.001, 0.01, 0.1, 1, 10]},
    {'penalty': ['l2'], 'solver': ['lbfgs', 'newton-cg', 'liblinear'], 'C': [0.001, 0.01, 0.1, 1, 10]}
 ]

In [15]:
cv_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring='accuracy')

In [16]:
cv_lr.fit(X_train, Y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.001, 0.01, 0.1, 1, 10], 'penalty': ['l1'],
                          'solver': ['liblinear']},
                         {'C': [0.001, 0.01, 0.1, 1, 10], 'penalty': ['l2'],
                          'solver': ['lbfgs', 'newton-cg', 'liblinear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=Fa

In [17]:
cv_lr.best_score_

0.8032783018867924

In [18]:
lr = cv_lr.best_estimator_

In [19]:
lr.fit(X_train, Y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
y_pred = lr.predict(X_val)

In [21]:
print(classification_report(Y_val, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.91      0.86        57
           1       0.80      0.62      0.70        32

    accuracy                           0.81        89
   macro avg       0.81      0.77      0.78        89
weighted avg       0.81      0.81      0.80        89



## SCV

In [22]:
param_grid_scv = {'C':[0.1, 1, 10],'gamma':[1, 0.1], 'kernel':['linear','rbf']}

In [23]:
cv_svc = GridSearchCV(SVC(), param_grid_scv, cv=3, verbose=1)

In [24]:
cv_svc.fit(X_train, Y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  1.5min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10], 'gamma': [1, 0.1],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [25]:
cv_svc.best_score_

0.7869674185463659

In [26]:
cv_svc.best_params_

{'C': 0.1, 'gamma': 1, 'kernel': 'linear'}

## K Nearest Neibors

In [27]:
k_range = list(range(1, 10))
param_grid_knn = dict(n_neighbors=k_range)

In [28]:
cv_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=10, scoring='accuracy')

In [29]:
cv_knn.fit(X_train,Y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [31]:
cv_knn.best_score_

0.7104272151898734

In [33]:
cv_knn.best_params_

{'n_neighbors': 9}

## Decision Tree

In [34]:
parameters={'min_samples_split' : range(10,500,20),'max_depth': range(1,20,1)}

In [35]:
cv_tree = GridSearchCV(DecisionTreeClassifier(), parameters, cv=5, verbose=1,scoring='accuracy')

In [36]:
cv_tree.fit(X_train,Y_train)

Fitting 5 folds for each of 475 candidates, totalling 2375 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 2375 out of 2375 | elapsed:   10.8s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=None,
             param_grid={'max_depth': range(1, 20),
                         '

In [37]:
cv_tree.best_score_

0.8270833333333334

In [38]:
cv_tree.best_params_


{'max_depth': 5, 'min_samples_split': 10}

In [39]:
dt = cv_tree.best_estimator_

In [40]:
dt.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=5, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [41]:
y_pred_dt = dt.predict(X_val)

In [42]:
print(classification_report(Y_val, y_pred_dt))

              precision    recall  f1-score   support

           0       0.84      0.91      0.87        57
           1       0.81      0.69      0.75        32

    accuracy                           0.83        89
   macro avg       0.83      0.80      0.81        89
weighted avg       0.83      0.83      0.83        89



## Random Forest

In [43]:
param_grid_rf = { 
    'n_estimators': [300, 400, 500, 600, 700, 800],
    'max_features': ['auto', 'sqrt', 'log2']
}

In [44]:
cv_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, verbose=1,scoring='accuracy')

In [45]:
cv_rf.fit(X_train,Y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  1.3min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [46]:
cv_rf.best_score_

0.8120440251572326

In [47]:
rf = cv_rf.best_estimator_

In [48]:
rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='log2',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [49]:
y_pred_rf = rf.predict(X_val)

In [50]:
print(classification_report(Y_val, y_pred_rf))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87        57
           1       0.76      0.78      0.77        32

    accuracy                           0.83        89
   macro avg       0.82      0.82      0.82        89
weighted avg       0.83      0.83      0.83        89



## Conclusion

The logistic regression, the decion tree and the random forest have accuracy above 80%.
The best model is the decision tree.