# **Titanic Kaggle Competition Predictions**

# Problem description

The sinking of the Titanic is one of the most infamous shipwrecks in history.

On April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.

While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.

In this challenge, the goal is to build a predictive model that answers the question: “what sorts of people were more likely to survive?” using passenger data (i.e., name, age, gender, socio-economic class, etc.).

**Dataset features:**

Survived: Outcome of survival (0 = No; 1 = Yes)

Pclass: Socio-economic class (1 = Upper class; 2 = Middle class; 3 = Lower class)

Name: Name of passenger

Sex: Sex of the passenger

Age: Age of the passenger (Some entries contain NaN)

SibSp: Number of siblings and spouses of the passenger aboard

Parch: Number of parents and children of the passenger aboard

Ticket: Ticket number of the passenger

Fare: Fare paid by the passenger

Cabin: Cabin number of the passenger (Some entries contain NaN)

Embarked: Port of embarkation of the passenger (C = Cherbourg; Q = Queenstown; S = Southampton)


# Importing the libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import math
import xgboost
import matplotlib.pyplot as plt
import warnings
import seaborn as sn
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(current_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing the data

In [None]:
# Load the training and test CSV files into DataFrames.
train_csv_path = '/kaggle/input/titanic/train.csv'
test_csv_path = '/kaggle/input/titanic/test.csv'

training_set = pd.read_csv(train_csv_path)
test_set = pd.read_csv(test_csv_path)

# Data exploration

In [None]:
# Preview the first rows of the training data.
training_set.head()

In [None]:
# Preview the first rows of the test data.
test_set.head()

# Detecting and dealing with unknown data

In [None]:
# Count the missing (NaN) values in every training column to see which ones need filling

print(training_set.isnull().sum())

In [None]:
# Imputation values are computed from the training data only, so that exactly the
# same replacement values are applied to the training and the test set.
age_median = training_set['Age'].median()
fare_median = training_set['Fare'].median()
embarked_mode = training_set['Embarked'].mode()[0]

# Assign the result back instead of calling fillna(..., inplace=True) on a column
# selection: the chained in-place form operates on a temporary object and does not
# update the DataFrame when pandas Copy-on-Write is active.
training_set['Age'] = training_set['Age'].fillna(age_median)
training_set['Embarked'] = training_set['Embarked'].fillna(embarked_mode)

# The test set is never checked for missing values in this notebook, yet its
# columns are later passed to the fitted models. Fill the model input columns
# defensively; fillna is a no-op for columns that have no NaN.
test_set['Age'] = test_set['Age'].fillna(age_median)
test_set['Fare'] = test_set['Fare'].fillna(fare_median)
test_set['Embarked'] = test_set['Embarked'].fillna(embarked_mode)

# Removing columns

Name and Ticket columns - too many distinct values to be used directly as features
Cabin column - too many NA values

In [None]:
# Remove the columns that are not used as features from both data sets, in place.
unused_columns = ['Name', 'Ticket', 'Cabin']
for frame in (training_set, test_set):
    frame.drop(columns=unused_columns, inplace=True)

# Detecting and removing outliers

Reducing the noise of the data

In [None]:
def box_plot(column):
    """Draw a box plot of one training-set column, grouped by the 'Survived' value.

    column: name of the column in the global ``training_set`` DataFrame to plot.
    """
    training_set.boxplot(column=[column], by="Survived", grid=True)
    
# Small circles or unfilled dots are drawn on the chart to indicate where suspected outliers lie.
# Filled circles are used for known outliers.

In [None]:
# Box plot of passenger class, grouped by survival outcome.
box_plot("Pclass")

In [None]:
# Box plot of the number of siblings/spouses aboard, grouped by survival outcome.
box_plot("SibSp")

In [None]:
# Frequency of each SibSp value (not displayed: it is not the last expression of the cell).
training_set["SibSp"].value_counts()

# Treat SibSp == 8 as an outlier and remove those rows from the training set.
indexes = training_set.index[training_set["SibSp"].eq(8)]
training_set.drop(index=indexes, inplace=True)

In [None]:
# Box plot of the number of parents/children aboard, grouped by survival outcome.
box_plot("Parch")

In [None]:
# Frequency of each Parch value, used to spot rare values.
training_set["Parch"].value_counts()

In [None]:
# Treat Parch == 6 as an outlier and remove those rows from the training set.
indexes = training_set.index[training_set["Parch"].eq(6)]
# Row lookup kept from the original cell (its result is not displayed).
training_set.loc[indexes]
training_set.drop(index=indexes, inplace=True)

In [None]:
# Box plot of the fare paid, grouped by survival outcome.
box_plot("Fare")

In [None]:
# Frequency of each Fare value.
training_set["Fare"].value_counts()

In [None]:
# Treat fares above 100 as outliers and remove those rows from the training set.
indexes = training_set.index[training_set["Fare"].gt(100)]
# Inspection expressions kept from the original cell (their results are not displayed).
training_set.loc[indexes]
indexes
training_set.drop(index=indexes, inplace=True)

# Correlation matrix

Used to detect high correlation between features in order to reduce dimensionality if possible

In [None]:
# Correlation can only be computed on numeric columns. 'Sex' and 'Embarked' still
# hold text at this point (they are one-hot encoded further down), and
# pandas >= 2.0 raises on non-numeric columns instead of silently skipping them,
# so the numeric columns are selected explicitly before calling corr().
correlation_matrix = training_set.select_dtypes(include='number').corr()

# Annotated heatmap of the pairwise correlations.
sn.heatmap(correlation_matrix, annot=True)

In [None]:
# Preview the first rows of the test data after the column removal above.
test_set.head()

1. Survival rate per passenger class (Pclass)

In [None]:
# Preview the first rows of the training data after cleaning.
training_set.head()

In [None]:
# Distinct passenger classes, in ascending order.
classes = np.sort(training_set.Pclass.unique())

# 'Survived' values of the passengers in each class (boolean row mask + column label).
survived_by_class = [
    training_set.loc[training_set.Pclass == passenger_class, "Survived"]
    for passenger_class in classes
]
# Share of survivors per class, rounded to two decimals.
survival_rate_classes = [
    round(sum(survived) / len(survived), 2) for survived in survived_by_class
]

print(classes, 'Survival rate classes: ', survival_rate_classes, sep='\n')


plt.bar(classes, survival_rate_classes)
plt.title("Survival Rate - Passenger Class")
plt.xlabel("Class")
plt.ylabel("Survival Rate")
plt.show()

2. Survival rate per Sex

In [None]:
# Distinct values of the 'Sex' column, in order of first appearance.
sex = training_set.Sex.unique()

# 'Survived' values of the passengers of each sex.
survived_by_sex = [
    training_set.loc[training_set.Sex == sex_value, "Survived"] for sex_value in sex
]
# Share of survivors per sex, rounded to two decimals.
survival_rate_sex = [
    round(sum(survived) / len(survived), 2) for survived in survived_by_sex
]

print('Survival rates: ', survival_rate_sex, sep='\n')

plt.bar(sex, survival_rate_sex)
plt.title("Survival Rate - Passenger Sexes")
plt.xlabel("Sex")
plt.ylabel("Survival Rate")
plt.show()

3. Survival rate per Age bucket

In [None]:
# Split the (rounded) ages into five equal-width buckets labelled "1".."5".
# retbins=True also returns the bucket edges so the age range behind each label
# can be reported.
max_age = training_set.Age.max()
age_labels = ["1", "2", "3", "4", "5"]
AgeGroup, age_bin_edges = pd.cut(
    training_set.Age.round(),
    bins=len(age_labels),
    labels=age_labels,
    ordered=True,
    retbins=True,
)
# Bucket labels that actually occur in the data.
age_categories = AgeGroup.unique()

print('Age categories: ')
# pd.cut builds right-closed intervals by default: (lower, upper].
for label, lower, upper in zip(age_labels, age_bin_edges[:-1], age_bin_edges[1:]):
    print(f'{label}: ({lower:.1f}, {upper:.1f}]')

survival_rate_age = []

for label in age_labels:
    cat = training_set.loc[AgeGroup == label, "Survived"]
    # An empty bucket has no survival rate; record NaN instead of dividing by zero.
    rate = round(sum(cat) / len(cat), 2) if len(cat) else float('nan')
    survival_rate_age.append(rate)

print('Survival rates: ')
print(survival_rate_age)

# Same chart layout as the other survival-rate sections.
plt.bar(age_labels, survival_rate_age)
plt.title("Survival Rate - Passenger Age Groups")
plt.xlabel("Age Group")
plt.ylabel("Survival Rate")
plt.show()

4. Survival rate by # of siblings / spouses aboard the Titanic

In [None]:
# Distinct numbers of siblings/spouses aboard (SibSp column), in ascending order.
# This section previously read the Parch column, so the chart did not show what
# its title and axis label claim.
sibsp_categories = training_set.SibSp.unique()
sibsp_categories.sort()

survival_rate_sibsp = []

for i in sibsp_categories:
    # 'Survived' values of the passengers with exactly i siblings/spouses aboard.
    cat = training_set.loc[training_set.SibSp == i, "Survived"]
    survival_rate_sibsp.append(round(sum(cat)/len(cat),2))

plt.bar(sibsp_categories, survival_rate_sibsp)
plt.title("Survival Rate - Number of Siblings/Spouses aboard")
plt.xlabel("Number of Siblings/Spouses")
plt.ylabel("Survival Rate")
plt.show()

5. Survival rate by # of parents / children aboard the Titanic

In [None]:
# Distinct numbers of parents/children aboard (Parch column), in ascending order.
# This section previously read the SibSp column, so the chart did not show what
# its title and axis label claim.
parch_categories = training_set.Parch.unique()
parch_categories.sort()

survival_rate_parch = []

for i in parch_categories:
    # 'Survived' values of the passengers with exactly i parents/children aboard.
    cat = training_set.loc[training_set.Parch == i, "Survived"]
    survival_rate_parch.append(round(sum(cat)/len(cat),2))

plt.bar(parch_categories, survival_rate_parch)
plt.title("Survival Rate - Number of Parents/Children aboard")
plt.xlabel("Number of Parents/Children")
plt.ylabel("Survival Rate")
plt.show()

6. Fare

In [None]:
# Fare - feature scaling (applied together with Age, SibSp and Parch in the "Standard scaling" section)

7. Port of Embarkation

# Categorical data

In [None]:
# One-hot encode the categorical columns of both data sets; drop_first removes the
# first level of each column so the remaining dummy columns are not redundant.
categorical_columns = ['Pclass', 'Sex', 'Embarked']
training_set, test_set = (
    pd.get_dummies(frame, columns=categorical_columns, drop_first=True)
    for frame in (training_set, test_set)
)

# Creating the matrix of features and dependent variable vector

In [None]:
# 'Survived' is the target. 'PassengerId' is only a row identifier, so it is kept
# out of the feature matrix; it stays in test_set for the submission file.
target_column = 'Survived'
non_feature_columns = [target_column, 'PassengerId']

# drop() returns a new DataFrame, so scaling x_train later does not write into
# a slice of training_set.
x_train = training_set.drop(columns=non_feature_columns)

# 1-D label vector of shape (n_samples,), the shape estimators expect for y.
y_train = training_set[target_column].values.ravel()

# Use exactly the training feature columns, in the same order, for the test set.
# Raises KeyError if the test set lacks one of them, which surfaces an encoding
# mismatch here rather than at prediction time.
x_test = test_set[x_train.columns].copy()

# Standard scaling

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
features_to_scale = ['Age','SibSp','Parch','Fare']

# Learn the mean and standard deviation from the training data only ...
x_train[features_to_scale] = scaler.fit_transform(x_train[features_to_scale])
# ... and reuse those statistics for the test data. Re-fitting the scaler on the
# test set (fit_transform) would put train and test on different scales, so the
# models would see test features that do not match what they were trained on.
x_test[features_to_scale] = scaler.transform(x_test[features_to_scale])

****

# Fitting the model

Models compared:
- Support Vector Machine
- Random Forest
- Extreme Gradient Boosting
- Logistic Regression

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and a fixed seed.
svm_params = {'kernel': 'linear', 'random_state': 0}
classifier_svm = SVC(**svm_params)

# Show the feature matrix the models are trained on.
print(x_train)
classifier_svm.fit(X=x_train, y=y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default hyperparameters and a fixed seed.
rf_params = {'random_state': 0}
classifier_rf = RandomForestClassifier(**rf_params)
classifier_rf.fit(X=x_train, y=y_train)

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with library-default hyperparameters.
classifier_xgb = XGBClassifier()
classifier_xgb.fit(X=x_train, y=y_train)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default hyperparameters.
classifier_lr = LogisticRegression()
classifier_lr.fit(X=x_train, y=y_train)

# Predicting the test set results

In [None]:
# Predict the test set with every fitted model, in the order SVM, RF, XGB, LR.
fitted_classifiers = (classifier_svm, classifier_rf, classifier_xgb, classifier_lr)
y_pred_svm, y_pred_rf, y_pred_xgb, y_pred_lr = (
    classifier.predict(x_test) for classifier in fitted_classifiers
)

# Confusion matrix and accuracy

In [None]:
# NOTE: this evaluation is disabled. Neither `y_test` nor `y_pred` is defined in
# this notebook (predictions are stored per model, e.g. `y_pred_xgb`), so true
# labels for the evaluated rows must be supplied before re-enabling it.
#from sklearn.metrics import confusion_matrix, accuracy_score
#cm = confusion_matrix(y_test, y_pred)
#print(cm)
#accuracy = accuracy_score(y_test, y_pred)
#print(accuracy)

# Output

In [None]:
# Build the submission from the XGBoost predictions and write it to CSV.
# To submit another model, use y_pred_svm, y_pred_rf or y_pred_lr for 'Survived'.
submission_columns = {
    'PassengerId': test_set.PassengerId,
    'Survived': y_pred_xgb,
}
output = pd.DataFrame(submission_columns)
output.to_csv('my_submission_xgb.csv', index=False)
print("Your submission was successfully saved!")