# Titanic Survival Prediction

Data files are from [https://www.kaggle.com/c/titanic/](https://www.kaggle.com/c/titanic/)

The model produced here scores 0.7461 on Kaggle, which is around 206th position on the leaderboard.

This notebook is used as the basis for testing a Streamlit application (its code is in the same repository as this notebook).

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
# Load the raw Kaggle Titanic training set.
base_data = pd.read_csv("../data/train.csv")

# Work on an independent copy that keeps only the target and the columns used
# for modelling (i.e. PassengerId, Ticket, Name and Cabin stay behind in base_data).
cols = [
    "Survived",
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]
data = base_data.loc[:, cols].copy()

In [3]:
# Transformations and additions
# Assign the filled Series back to the column rather than calling
# fillna(inplace=True) on data["..."]: that form is chained assignment, which
# emits a FutureWarning on pandas 2.x and leaves `data` unchanged once
# Copy-on-Write is active.
data["Age"] = data["Age"].fillna(data["Age"].mean())  # Fill missing age values with the mean
data["Embarked"] = data["Embarked"].fillna("C")  # fill missing Embarked values with C

In [4]:
# Transform labels to target values between 0 and n
from sklearn.preprocessing import LabelEncoder

# Plain column assignment replaces each string column with the encoder's
# integer array. Writing through data.loc[:, col] instead sets the values in
# place on pandas 2.0+, which keeps the column's original object dtype.
encoder = LabelEncoder()
data["Sex"] = encoder.fit_transform(data["Sex"])
# male = 1, female = 0

encoder = LabelEncoder()
data["Embarked"] = encoder.fit_transform(data["Embarked"])
# C = 0, Q = 1, S =2

In [5]:
# Generate a test train split from our input data
from sklearn.model_selection import train_test_split

y = data["Survived"]  # target column
x = data.drop(columns="Survived")  # every remaining column is a feature

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)


In [6]:
# train models
# --> Random Forest
# --> Logistic Regression
# --> Decision Tree

def model(X_train, y_train):
    """Fit a random forest, a logistic regression and a decision tree.

    Each classifier is fitted on the given data and its ``score`` on that
    same data is printed, so the printed numbers describe the fit to the
    training set, not performance on unseen data.

    Parameters
    ----------
    X_train
        Feature matrix passed to each classifier's ``fit``.
    y_train
        Target labels passed to each classifier's ``fit``.

    Returns
    -------
    tuple
        ``(forest, lreg, tree)``, the three fitted classifiers in that order.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier

    forest = RandomForestClassifier(n_estimators=10, random_state=0)
    forest.fit(X_train, y_train)
    print("Forest score {0}".format(forest.score(X_train, y_train)))

    lreg = LogisticRegression()
    lreg.fit(X_train, y_train)
    print("Logistic Regression score {0}".format(lreg.score(X_train, y_train)))

    # Seeded like the forest: an unseeded tree can break ties between equally
    # good splits differently on each run, making its scores non-reproducible.
    tree = DecisionTreeClassifier(random_state=0)
    tree.fit(X_train, y_train)
    print("Decisions Tree score {0}".format(tree.score(X_train, y_train)))

    return forest, lreg, tree

# Fit the three classifiers on the training split and keep them for later cells.
forest, lreg, tree = model(X_train=X_train, y_train=y_train)

Forest score 0.9676966292134831
Logistic Regression score 0.797752808988764
Decisions Tree score 0.9817415730337079


In [7]:
# Run test data through model and get accuracy score
from sklearn.metrics import accuracy_score

target_names = ["Died", "Survived"]

# Score every fitted classifier on the held-out split, in the same order as
# before; y_predict ends up holding the decision tree's predictions.
for report_template, classifier in (
    ("Random Forest {0}", forest),
    ("Logistic Regresion {0}", lreg),
    ("Decision Tree {0}", tree),
):
    y_predict = classifier.predict(X_test)
    print(report_template.format(accuracy_score(y_test, y_predict)))

Random Forest 0.8324022346368715
Logistic Regresion 0.7988826815642458
Decision Tree 0.7821229050279329


In [8]:
# Run a prediction on a set of values
# The passenger is a one-row DataFrame whose columns carry the same names and
# order as the training features. A bare nested list has no column names, so
# scikit-learn 1.0+ warns that feature names are missing, and a column-order
# mistake would go unnoticed.
my_data = pd.DataFrame(
    {
        "Pclass": [1],
        "Sex": [0],  # 0 = Female, 1 = Male
        "Age": [20],
        "SibSp": [0],  # Siblings and Spouses
        "Parch": [0],  # Parents and Children
        "Fare": [0],
        "Embarked": [2],  # 2 = Southampton
    }
)

forest.predict(my_data)

array([1])

In [9]:
# Export model
import pickle

filename = "../data/model.sv"
# The with-block closes (and therefore flushes) the file even if pickling
# raises; the previous bare open() call never closed its handle.
with open(filename, "wb") as model_file:
    pickle.dump(forest, model_file)