# Titanic Survival Predictions
## Titanic passenger survival predictions based on Kaggle's Titanic dataset

In [1]:
import numpy
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the raw train and test splits from CSV files, with paths relative to
# the notebook's working directory. The frames are kept untouched so the
# cleaning step can always start again from the original data.
uncleaned_training_set = pandas.read_csv('./data/train.csv')
uncleaned_test_set = pandas.read_csv('./data/test.csv')

In [3]:
# Preview the raw training data (includes the Survived target column).
uncleaned_training_set

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
# Preview the raw test data (same columns as training, minus Survived).
uncleaned_test_set

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [5]:
# Exploratory check of the indicator encoding used later for the port of
# embarkation: the boolean comparison is cast to 0/1 integers.
(uncleaned_test_set['Embarked'] == 'Q').astype(int)

0      1
1      0
2      1
3      0
4      0
      ..
413    0
414    0
415    0
416    0
417    0
Name: Embarked, Length: 418, dtype: int64

In [6]:
# Pairwise correlations between the numeric columns of the raw test set.
# The numeric columns are selected explicitly because the frame also holds
# string columns (Name, Sex, Ticket, ...); recent pandas versions raise on
# those instead of silently dropping them.
uncleaned_test_set.select_dtypes(include='number').corr()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.026751,-0.034102,0.003818,0.04308,0.008211
Pclass,-0.026751,1.0,-0.492143,0.001087,0.018721,-0.577147
Age,-0.034102,-0.492143,1.0,-0.091587,-0.061249,0.337932
SibSp,0.003818,0.001087,-0.091587,1.0,0.306895,0.171539
Parch,0.04308,0.018721,-0.061249,0.306895,1.0,0.230046
Fare,0.008211,-0.577147,0.337932,0.171539,0.230046,1.0


### Notes on multicollinearity
We leave out the column Embarked_C because it can be derived from columns Embarked_Q and Embarked_S. If Embarked_Q = 0 and Embarked_S = 0, then we know Embarked_C = 1. If we include a column for Embarked_C, then we will have linearly dependent columns, and there would be infinitely many sets of weight values that would give exactly the same prediction probabilities. This is called multicollinearity, and it can make the model weights less interpretable.

#### Example
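If survival were 100% for passengers who embarked at Q or S and 0% for those who embarked at C, the following two sets of model parameters (bias b and weights w<sub>Q</sub>, w<sub>S</sub>, and w<sub>C</sub>) would produce the same output for every passenger, and therefore the same prediction probabilities: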
- w<sub>Q</sub> = 1, w<sub>S</sub> = 1, w<sub>C</sub> = 0, b = 0
    - Embarked_Q = 1 => w<sub>Q</sub>(Embarked_Q) + w<sub>S</sub>(Embarked_S) + w<sub>C</sub>(Embarked_C) + b = 1
    - Embarked_S = 1 => w<sub>Q</sub>(Embarked_Q) + w<sub>S</sub>(Embarked_S) + w<sub>C</sub>(Embarked_C) + b = 1
    - Embarked_C = 1 => w<sub>Q</sub>(Embarked_Q) + w<sub>S</sub>(Embarked_S) + w<sub>C</sub>(Embarked_C) + b = 0
- w<sub>Q</sub> = 0, w<sub>S</sub> = 0, w<sub>C</sub> = -1, b = 1
    - Embarked_Q = 1 => w<sub>Q</sub>(Embarked_Q) + w<sub>S</sub>(Embarked_S) + w<sub>C</sub>(Embarked_C) + b = 1
    - Embarked_S = 1 => w<sub>Q</sub>(Embarked_Q) + w<sub>S</sub>(Embarked_S) + w<sub>C</sub>(Embarked_C) + b = 1
    - Embarked_C = 1 => w<sub>Q</sub>(Embarked_Q) + w<sub>S</sub>(Embarked_S) + w<sub>C</sub>(Embarked_C) + b = 0

In [7]:
def get_numeric_sex(row):
    """Encode a passenger's sex as an integer feature.

    Args:
        row: Any object exposing a ``Sex`` attribute, e.g. a DataFrame row
            passed by ``DataFrame.apply(..., axis=1)``.

    Returns:
        1 if ``row.Sex`` is ``'female'``, 0 if it is ``'male'``.

    Raises:
        ValueError: If ``row.Sex`` is anything else (including a missing
            value), so bad data is caught instead of being encoded silently.
    """
    sex = row.Sex
    if sex == 'female':
        return 1
    if sex == 'male':
        return 0
    # Raise a real exception type: raising a bare string is itself a
    # TypeError and would hide this message.
    raise ValueError('Sex must be male or female')

def get_cleaned_dataset(uncleaned_set):
    """Build the model feature frame from a raw Titanic frame.

    The input frame is not modified; a cleaned copy is returned.

    Args:
        uncleaned_set: Raw DataFrame holding at least the Sex, Embarked,
            PassengerId, Ticket, Cabin, Name, Age and Fare columns. A
            Survived column is optional and is removed when present.

    Returns:
        A new DataFrame in which Sex is encoded as 0/1, the port of
        embarkation is encoded as the Embarked_Q / Embarked_S indicator
        columns, and the identifier, free-text, Age and Fare columns are
        removed.
    """
    features = uncleaned_set.copy()
    features['Sex'] = features.apply(get_numeric_sex, axis=1)

    # Indicator columns for the port of embarkation. Port C is the implicit
    # baseline: Embarked_Q = 0 and Embarked_S = 0 means the passenger
    # embarked at C. (A missing Embarked value compares unequal to both
    # ports, so it is also encoded as 0/0.)
    port_indicators = (('Embarked_Q', 'Q'), ('Embarked_S', 'S'))
    for indicator_name, port_code in port_indicators:
        features[indicator_name] = (uncleaned_set['Embarked'] == port_code).astype(int)

    # todo: handle age and fare next (both have missing values)
    unused_columns = ['PassengerId', 'Embarked', 'Ticket', 'Cabin', 'Name', 'Age', 'Fare']
    # The target must never be part of the features; only the training
    # data carries it.
    if 'Survived' in uncleaned_set.columns:
        unused_columns.append('Survived')

    return features.drop(columns=unused_columns)


# Run both splits through the same cleaning so their feature columns match.
test_set = get_cleaned_dataset(uncleaned_test_set)
training_set = get_cleaned_dataset(uncleaned_training_set)
# The target is taken from the raw frame because the cleaning step removes
# the Survived column from the features.
training_targets = uncleaned_training_set['Survived']

# Display the cleaned training features.
training_set


Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked_Q,Embarked_S
0,3,0,1,0,0,1
1,1,1,1,0,0,0
2,3,1,0,0,0,1
3,1,1,1,0,0,1
4,3,0,0,0,0,1
...,...,...,...,...,...,...
886,2,0,0,0,0,1
887,1,1,0,0,0,1
888,3,1,1,2,0,1
889,1,0,0,0,0,0


In [8]:
# NOTE: this sanity check is disabled. It is wrapped in a string literal, so
# none of it runs and the cell simply evaluates to the string itself.
"""# check to make sure all cells have integer values
check_data_set = training_set

columns = list(check_data_set.columns)
missing_data = False

for column in columns:
    column_data = check_data_set[column].tolist()
    if any((numpy.isnan(elem) or not isinstance(elem, int)) for elem in column_data):
        missing_data = True
        
missing_data"""

'# check to make sure all cells have integer values\ncheck_data_set = training_set\n\ncolumns = list(check_data_set.columns)\nmissing_data = False\n\nfor column in columns:\n    column_data = check_data_set[column].tolist()\n    if any((numpy.isnan(elem) or not isinstance(elem, int)) for elem in column_data):\n        missing_data = True\n        \nmissing_data'

In [9]:
def predict(model, test_set):
    """Return a copy of ``test_set`` with the model's predictions appended.

    Args:
        model: A fitted estimator exposing ``predict`` and ``predict_proba``.
        test_set: Feature DataFrame to score; it is left unmodified.

    Returns:
        A new DataFrame with two columns appended on the right:
        ``Survival_Prediction`` (the output of ``model.predict``) and
        ``Survival_Probability_Prediction`` (column 1 of
        ``model.predict_proba``).
    """
    predicted_labels = model.predict(test_set)
    # Column 1 of predict_proba is presumably the "survived" class
    # probability; this relies on the model's class ordering.
    survival_probabilities = model.predict_proba(test_set)[:, 1]

    scored = test_set.copy()
    appended_columns = (
        ("Survival_Prediction", predicted_labels),
        ("Survival_Probability_Prediction", survival_probabilities),
    )
    for column_name, column_values in appended_columns:
        # Always insert at the current right-hand edge of the frame.
        scored.insert(loc=len(scored.columns), column=column_name, value=column_values)
    return scored

In [10]:
# set up model to predict survival from training features
# Fit a logistic regression to predict survival from the cleaned training
# features. random_state is fixed so reruns are reproducible.
logistic_regression_model = LogisticRegression(random_state=0)
logistic_regression_model.fit(training_set, training_targets)

# Score the test features and display them next to the predicted label and
# the predicted survival probability.
test_set_with_predictions = predict(logistic_regression_model, test_set)
test_set_with_predictions

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked_Q,Embarked_S,Survival_Prediction,Survival_Probability_Prediction
0,3,0,0,0,1,0,0,0.134482
1,3,1,1,0,0,1,1,0.560465
2,2,0,0,0,1,0,0,0.275327
3,3,0,0,0,0,1,0,0.102014
4,3,1,1,1,0,1,1,0.552641
...,...,...,...,...,...,...,...,...
413,3,0,0,0,0,1,0,0.102014
414,1,1,0,0,0,0,1,0.938577
415,3,0,0,0,0,1,0,0.102014
416,3,0,0,0,0,1,0,0.102014


In [11]:
# Learned weights, in the column order of training_set:
#   [Pclass, Sex, SibSp, Parch, Embarked_Q, Embarked_S]
logistic_regression_model.coef_ # weights = [-0.89414006,  2.62642792, -0.20833906, -0.03170161, -0.17377863, -0.48691802]
# These weights show that the passenger's sex is the main determinant of the model's survival prediction
# NOTE(review): the features are not standardized, so comparing raw weight
# magnitudes across columns is only an approximate measure of importance.

array([[-0.89414006,  2.62642792, -0.20833906, -0.03170161, -0.17377863,
        -0.48691802]])

In [12]:
# Fit a random forest on the same features for comparison with the logistic
# regression. random_state is fixed so reruns are reproducible.
random_forest_model = RandomForestClassifier(random_state=0)
random_forest_model.fit(training_set, training_targets)

# Score the test features with the random forest and display the result.
random_forest_predictions = predict(random_forest_model, test_set)
random_forest_predictions

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked_Q,Embarked_S,Survival_Prediction,Survival_Probability_Prediction
0,3,0,0,0,1,0,0,0.071118
1,3,1,1,0,0,1,0,0.396841
2,2,0,0,0,1,0,0,0.037152
3,3,0,0,0,0,1,0,0.124354
4,3,1,1,1,0,1,1,0.577311
...,...,...,...,...,...,...,...,...
413,3,0,0,0,0,1,0,0.124354
414,1,1,0,0,0,0,1,0.952335
415,3,0,0,0,0,1,0,0.124354
416,3,0,0,0,0,1,0,0.124354


In [13]:
# NOTE(review): this is scored on the same data the model was fitted on, so
# it is an optimistic estimate and not a held-out accuracy.
logistic_regression_model.score(training_set, training_targets) # accuracy on training set = .7912

0.7912457912457912

In [14]:
# NOTE(review): this is scored on the same data the model was fitted on, so
# it is an optimistic estimate and not a held-out accuracy.
random_forest_model.score(training_set, training_targets) # accuracy on training set = .8373

0.8372615039281706