In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [32]:
X_train = pd.read_csv('titanic_X_train.csv')
X_test = pd.read_csv('titanic_X_test.csv')
y_train = pd.read_csv('titanic_y_train.csv')
y_test = pd.read_csv('titanic_y_test.csv')

In [67]:
X_train.head(10)

Unnamed: 0,cabin,CabinReduced,sex,cabin_map,CabinReduced_map,sex_map
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
5,0,0,0,0,0,0
6,E36,E,female,1,1,0
7,0,0,0,0,0,0
8,C68,C,male,2,2,1
9,E24,E,male,3,1,1


# RandomForestClassifier

This is meta estimator that fits a number of decision tree classifiers on various sub-samples and averages to improve the predictive accuracy and control over-fitting.

In [57]:
y_train.shape

(916, 1)

In [94]:
from sklearn.ensemble import RandomForestClassifier

#Making model
rf_clf = RandomForestClassifier(n_estimators=200, random_state=42)

#Training model
rf_clf.fit(X_train[['CabinReduced_map', 'sex_map', 'cabin_map']], np.ravel(y_train))

#Prediction
pred_train = rf_clf.predict_proba(X_train[['CabinReduced_map', 'sex_map', 'cabin_map']])
pred_test = rf_clf.predict_proba(X_test[['CabinReduced_map', 'sex_map', 'cabin_map']])

In [78]:
y_scores_train = pred_train[:, 1]
y_scores_test = pred_test[:, 1]

### Scores
As an input to roc_auc_score we take an array of the likelihood of belonging each sample to positive class (survived)

In [79]:
from sklearn.metrics import roc_auc_score

print("Train roc_auc_score score: {} ".format(roc_auc_score(y_train, y_scores_train)))
print("Test roc_auc_score score: {} ".format(roc_auc_score(y_test, y_scores_test)))

Train roc_auc_score score: 0.7221078902480138 
Test roc_auc_score score: 0.4895332116586472 


## 1 classes multiclasses classifier

In [81]:
#Making model
rf_clf1 = RandomForestClassifier(n_estimators=200, random_state=42)

#Training model
rf_clf1.fit(X_train[['CabinReduced_map']], np.ravel(y_train))

#Prediction
pred_train1 = rf_clf1.predict_proba(X_train[['CabinReduced_map']])
pred_test1 = rf_clf1.predict_proba(X_test[['CabinReduced_map']])

In [82]:
y_scores_train1 = pred_train1[:, 1]
y_scores_test1 = pred_test1[:, 1]

### Scores

In [83]:
print("Train roc_auc_score score: {} ".format(roc_auc_score(y_train, y_scores_train1)))
print("Test roc_auc_score score: {} ".format(roc_auc_score(y_test, y_scores_test1)))

Train roc_auc_score score: 0.6372679745797251 
Test roc_auc_score score: 0.5880897074276865 


We can see that our classifier trained on data with fewer features has a worse result for training data than the second model and a better result for the test data, so we can say that our model with more features overfitted to train data cause it was too detailed.

## LogisticRegression
In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the ‘multi_class’ option is set to ‘ovr’, and uses the cross-entropy loss if the ‘multi_class’ option is set to ‘multinomial’. This class implements regularized logistic regression using the ‘liblinear’ library, ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ solvers. Note that regularization is applied by default. It can handle both dense and sparse input.

In [86]:
from sklearn.linear_model import LogisticRegression

#Making model
lr_clf = LogisticRegression(random_state=42)

#Training model
lr_clf.fit(X_train[['CabinReduced_map', 'sex_map', 'cabin_map']], np.ravel(y_train))

#Prediction
y_pred_train = lr_clf.predict(X_train[['CabinReduced_map', 'sex_map', 'cabin_map']])
y_pred_test = lr_clf.predict(X_test[['CabinReduced_map', 'sex_map', 'cabin_map']])

### Scores for 3 classes multiclasses classifier

In [87]:
print("Train roc_auc_score score: {} ".format(roc_auc_score(y_train, y_pred_train)))
print("Test roc_auc_score score: {} ".format(roc_auc_score(y_test, y_pred_test)))

Train roc_auc_score score: 0.6286360502971233 
Test roc_auc_score score: 0.5403600464576075 


## 3 classes multiclasses classifier

In [90]:
#Making model
lr_clf1 = LogisticRegression(random_state=42)

#Training model
lr_clf1.fit(X_train[['cabin_map']], np.ravel(y_train))

#Prediction
y_pred_train1 = lr_clf1.predict(X_train[['cabin_map']])
y_pred_test1 = lr_clf1.predict(X_test[['cabin_map']])

In [91]:
print("Train roc_auc_score score: {} ".format(roc_auc_score(y_train, y_pred_train1)))
print("Test roc_auc_score score: {} ".format(roc_auc_score(y_test, y_pred_test1)))

Train roc_auc_score score: 0.578824488399358 
Test roc_auc_score score: 0.5340965654554504 


# GradientBoostingClassifier

GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage n_classes_ regression trees are fit on the negative gradient of the binomial or multinomial deviance loss function. Binary classification is a special case where only a single regression tree is induced.

In [96]:
from sklearn.ensemble import GradientBoostingClassifier

#Making model
gb_clf = GradientBoostingClassifier(n_estimators = 200, random_state=42)

#Training model
gb_clf.fit(X_train[['CabinReduced_map', 'sex_map', 'cabin_map']], np.ravel(y_train))

#Prediction
pred_train_gb = gb_clf.predict_proba(X_train[['CabinReduced_map', 'sex_map', 'cabin_map']])
pred_test_gb = gb_clf.predict_proba(X_test[['CabinReduced_map', 'sex_map', 'cabin_map']])

In [97]:
y_scores_train_gb = pred_train_gb[:, 1]
y_scores_test_gb = pred_test_gb[:, 1]

In [98]:
print("Train roc_auc_score score: {} ".format(roc_auc_score(y_train, y_scores_train_gb)))
print("Test roc_auc_score score: {} ".format(roc_auc_score(y_test, y_scores_test_gb)))

Train roc_auc_score score: 0.7296303191623185 
Test roc_auc_score score: 0.4765637962502074 


In [100]:
#Making model
gb_clf1 = GradientBoostingClassifier(n_estimators = 200, random_state=42)

#Training model
gb_clf1.fit(X_train[['cabin_map']], np.ravel(y_train))

#Prediction
pred_train_gb1 = gb_clf1.predict_proba(X_train[['cabin_map']])
pred_test_gb1 = gb_clf1.predict_proba(X_test[['cabin_map']])

In [101]:
y_scores_train_gb1 = pred_train_gb1[:, 1]
y_scores_test_gb1 = pred_test_gb1[:, 1]

In [102]:
print("Train roc_auc_score score: {} ".format(roc_auc_score(y_train, y_scores_train_gb1)))
print("Test roc_auc_score score: {} ".format(roc_auc_score(y_test, y_scores_test_gb1)))

Train roc_auc_score score: 0.7038452442650913 
Test roc_auc_score score: 0.563395276809911 


# AdaBoostClassifier
An AdaBoost classifier is a meta-estimator that begins by fitting a classifier on the original dataset and then fits additional copies of the classifier on the same dataset but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases.

In [106]:
from sklearn.ensemble import AdaBoostClassifier

#Making model
ada_clf = AdaBoostClassifier(n_estimators = 200, random_state=42)

#Training model
ada_clf.fit(X_train[['CabinReduced_map', 'sex_map', 'cabin_map']], np.ravel(y_train))

#Prediction
pred_train_ada = ada_clf.predict_proba(X_train[['CabinReduced_map', 'sex_map', 'cabin_map']])
pred_test_ada = ada_clf.predict_proba(X_test[['CabinReduced_map', 'sex_map', 'cabin_map']])

In [107]:
y_scores_train_ada = pred_train_ada[:, 1]
y_scores_test_ada = pred_test_ada[:, 1]

In [108]:
print("Train roc_auc_score score: {} ".format(roc_auc_score(y_train, y_scores_train_ada)))
print("Test roc_auc_score score: {} ".format(roc_auc_score(y_test, y_scores_test_ada)))

Train roc_auc_score score: 0.6909992502729709 
Test roc_auc_score score: 0.4918422653614291 


In [109]:
#Making model
ada_clf1 = AdaBoostClassifier(n_estimators = 200, random_state=42)

#Training model
ada_clf1.fit(X_train[['cabin_map']], np.ravel(y_train))

#Prediction
pred_train_ada1 = ada_clf1.predict_proba(X_train[['cabin_map']])
pred_test_ada1 = ada_clf1.predict_proba(X_test[['cabin_map']])

In [110]:
y_scores_train_ada1 = pred_train_ada1[:, 1]
y_scores_test_ada1 = pred_test_ada1[:, 1]

In [111]:
print("Train roc_auc_score score: {} ".format(roc_auc_score(y_train, y_scores_train_ada1)))
print("Test roc_auc_score score: {} ".format(roc_auc_score(y_test, y_scores_test_ada1)))

Train roc_auc_score score: 0.6549419087345715 
Test roc_auc_score score: 0.6276616337591948 


# Models comarison
The roc auc score shows us that the best model in our case is AdaBoostClassifier with a small number of features which had the best predictions for the new data. The most overfitted model in our models examination is RandomForestClassifier.

# Difference between predict and predict_proba

predict() method that can be executed on a trained model in order to predict the actual label (or class) over a new set of data.

predict_proba() method that returns the class probabilities for each data point.