Predicting the survival probability of the passengers of the Titanic with logistic regression

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
# Plot styling. seaborn's theme call resets matplotlib rcParams (including the
# base font size), so the theme is applied first and the font override second;
# in the opposite order the font setting is silently discarded.
sns.set(style="whitegrid", color_codes=True)
plt.rc("font", size=14)

In [None]:
# Load the passenger data and drop rows with missing values only in the
# columns the analysis actually uses. A blanket dropna() would also discard
# every passenger that has a gap in an unrelated column.
required_cols = ['Survived', 'Pclass', 'Sex', 'Age', 'Fare']
data = pd.read_excel('titanic.xlsx')
data = data.dropna(subset=required_cols)
print(data.shape)
data.head()

In [None]:
# Restrict the frame to the target column plus the predictors of interest.
cols = ['Survived', 'Pclass', 'Sex', 'Age', 'Fare']
data = data.loc[:, cols]
data.head()

In [None]:
# Number of rows for each distinct value of the 'Survived' column.
data.loc[:, 'Survived'].value_counts()

In [None]:
# Bar chart of how many rows fall into each 'Survived' value.
fig, ax = plt.subplots()
sns.countplot(data=data, x='Survived', palette='hls', ax=ax)
plt.show()

In [None]:
# Share of each outcome among the rows whose 'Survived' value is 0 or 1.
survived_flags = data['Survived']
count_no_sur = len(survived_flags[survived_flags == 0])
count_sur = len(survived_flags[survived_flags == 1])
total_count = count_no_sur + count_sur

pct_of_no_sur = count_no_sur / total_count
pct_of_sur = count_sur / total_count

print('Percentage of not survived people: ', pct_of_no_sur*100)
print('Percentage of survived people: ', pct_of_sur*100)

In [None]:
# Mean of 'Survived' for every (Sex, Pclass) pair, with one column per class.
data.groupby(['Sex', 'Pclass'])['Survived'].mean().unstack()

In [None]:
# Bin ages into two bands, (0, 18] and (18, 80], then tabulate the mean of
# 'Survived' per sex and age band across passenger classes.
age = pd.cut(data['Age'], bins=[0, 18, 80])
data.pivot_table(values='Survived', index=['Sex', age], columns='Pclass')

In [None]:
# Split fares into two equal-sized quantile bins and tabulate the mean of
# 'Survived' per sex, age band and fare bin.
fare = pd.qcut(data['Fare'], q=2)
data.pivot_table(values='Survived', index=['Sex', age, fare])

In [None]:
# Sum of 'Survived' per sex and passenger class. The aggregation is named by
# the string 'sum' rather than the builtin `sum`: recent pandas versions
# deprecate passing the builtin and emit a FutureWarning for it.
data.pivot_table(index='Sex', columns='Pclass', aggfunc={'Survived': 'sum'})

In [None]:
# One-hot encode the categorical 'Sex' column into 'Sex_<value>' columns.
cat_vars = ['Sex']
sex_dummies = pd.get_dummies(data['Sex'], prefix='Sex')

# Drop any dummy columns left over from an earlier run of this cell before
# joining, so that re-running it does not fail on overlapping column names.
data = data.drop(columns=sex_dummies.columns, errors='ignore').join(sex_dummies)

# Modelling frame: every column except the original categorical one(s).
data_final = data.drop(columns=cat_vars)

In [41]:
# Feature matrix (x) and target vector (y) taken from the encoded frame.
cols = ['Pclass', 'Age', 'Fare', 'Sex_female', 'Sex_male']
x = data_final.loc[:, cols]
y = data_final.loc[:, 'Survived']

In [42]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every metric computed from it, repeatable across runs.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.3, random_state=42
)

In [None]:
# Fit a logistic regression on the training split, keep the test-set
# predictions for later evaluation, and report accuracy on the test split.
logreg = LogisticRegression().fit(x_train, y_train)
y_pred = logreg.predict(x_test)
test_accuracy = logreg.score(x_test, y_test)
print(f'Accuracy of logistic regression classifier on test set: {test_accuracy:.2f}')

In [None]:
# Confusion matrix of true versus predicted labels on the test split.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Confusion matrix: \n', cm)

In [None]:
# Per-class precision, recall, F-score and support on the test split; the
# scorer returns them as a 4-tuple in exactly this order.
prfs = precision_recall_fscore_support(y_test, y_pred)
for metric_name, metric_values in zip(('Precision', 'Recall', 'F-Score', 'Support'), prfs):
    print(f'{metric_name}: \n', metric_values)