# Titanic Survival Predictions
## Titanic passenger survival predictions based on Kaggle's Titanic dataset

In [1]:
import numpy
import pandas
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# Load the labelled Kaggle training data.
uncleaned_data_set = pandas.read_csv('./data/train.csv')

# Hold out 20% of the labelled rows for validation. The split is seeded so that
# the rows previewed and the scores reported further down are reproducible when
# the notebook is re-run; without a seed every run produces a different split.
uncleaned_training_set, uncleaned_validation_set = train_test_split(
    uncleaned_data_set, test_size=0.2, random_state=0
)

In [3]:
# Preview the rows that landed in the training split (raw, before cleaning).
uncleaned_training_set

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
207,208,1,3,"Albimona, Mr. Nassef Cassem",male,26.0,0,0,2699,18.7875,,C
215,216,1,1,"Newell, Miss. Madeleine",female,31.0,1,0,35273,113.2750,D36,C
370,371,1,1,"Harder, Mr. George Achilles",male,25.0,1,0,11765,55.4417,E50,C
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
532,533,0,3,"Elias, Mr. Joseph Jr",male,17.0,1,1,2690,7.2292,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
173,174,0,3,"Sivola, Mr. Antti Wilhelm",male,21.0,0,0,STON/O 2. 3101280,7.9250,,S
48,49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
832,833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C


In [4]:
# Preview the rows that landed in the validation split (raw, before cleaning).
uncleaned_validation_set

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
486,487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35.0,1,0,19943,90.0000,C93,S
217,218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42.0,1,0,243847,27.0000,,S
741,742,0,1,"Cavendish, Mr. Tyrell William",male,36.0,1,0,19877,78.8500,C46,S
205,206,0,3,"Strom, Miss. Telma Matilda",female,2.0,0,1,347054,10.4625,G6,S
335,336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
546,547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19.0,1,0,2908,26.0000,,S
494,495,0,3,"Stanley, Mr. Edward Roland",male,21.0,0,0,A/4 45380,8.0500,,S
278,279,0,3,"Rice, Master. Eric",male,7.0,4,1,382652,29.1250,,Q
787,788,0,3,"Rice, Master. George Hugh",male,8.0,4,1,382652,29.1250,,Q


In [5]:
# Pairwise correlations between the numeric columns of the training split.
# The text columns (Name, Sex, Ticket, Cabin, Embarked) are excluded explicitly:
# pandas 2.0+ no longer drops them silently and raises on non-numeric data.
uncleaned_training_set.select_dtypes(include='number').corr()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,0.007017,-0.034321,0.047203,-0.051045,0.000485,0.023896
Survived,0.007017,1.0,-0.310666,-0.082069,-0.034688,0.080189,0.233435
Pclass,-0.034321,-0.310666,1.0,-0.372397,0.071172,0.005411,-0.543608
Age,0.047203,-0.082069,-0.372397,1.0,-0.307843,-0.215733,0.099074
SibSp,-0.051045,-0.034688,0.071172,-0.307843,1.0,0.427707,0.14808
Parch,0.000485,0.080189,0.005411,-0.215733,0.427707,1.0,0.219366
Fare,0.023896,0.233435,-0.543608,0.099074,0.14808,0.219366,1.0


### Notes on multicollinearity
We leave out the column Embarked_C because it can be derived from the columns Embarked_Q and Embarked_S: if Embarked_Q = 0 and Embarked_S = 0, then we know Embarked_C = 1. If we included a column for Embarked_C, then Embarked_Q + Embarked_S + Embarked_C would equal 1 for every passenger with a known port, which is exactly the constant input that the bias term already multiplies. The columns would therefore be linearly dependent, and there would be infinitely many sets of weight values that give exactly the same prediction probabilities. This is called multicollinearity, and it can make the model weights less interpretable.

#### Example
If survival were 100% for passengers from Q and S, and 0% for passengers from C, the following two sets of model parameters (bias b and weights w<sub>Q</sub>, w<sub>S</sub>, and w<sub>C</sub>) would produce the same linear score for every passenger, and therefore the same prediction probabilities:
- w<sub>Q</sub> = 1, w<sub>S</sub> = 1, w<sub>C</sub> = 0, b = 0
    - Embarked_Q = 1 => w<sub>Q</sub>(Embarked_Q) + w<sub>S</sub>(Embarked_S) + w<sub>C</sub>(Embarked_C) + b = 1
    - Embarked_S = 1 => w<sub>Q</sub>(Embarked_Q) + w<sub>S</sub>(Embarked_S) + w<sub>C</sub>(Embarked_C) + b = 1
    - Embarked_C = 1 => w<sub>Q</sub>(Embarked_Q) + w<sub>S</sub>(Embarked_S) + w<sub>C</sub>(Embarked_C) + b = 0
- w<sub>Q</sub> = 0, w<sub>S</sub> = 0, w<sub>C</sub> = -1, b = 1
    - Embarked_Q = 1 => w<sub>Q</sub>(Embarked_Q) + w<sub>S</sub>(Embarked_S) + w<sub>C</sub>(Embarked_C) + b = 1
    - Embarked_S = 1 => w<sub>Q</sub>(Embarked_Q) + w<sub>S</sub>(Embarked_S) + w<sub>C</sub>(Embarked_C) + b = 1
    - Embarked_C = 1 => w<sub>Q</sub>(Embarked_Q) + w<sub>S</sub>(Embarked_S) + w<sub>C</sub>(Embarked_C) + b = 0

In [6]:
def get_numeric_sex(row):
    """Encode a passenger's sex as an integer feature.

    Args:
        row: A passenger record exposing a ``Sex`` attribute, e.g. a DataFrame
            row handed over by ``DataFrame.apply(..., axis=1)``.

    Returns:
        1 for ``'female'``, 0 for ``'male'``.

    Raises:
        ValueError: If ``row.Sex`` is anything else (including a missing value).
    """
    sex = row.Sex
    if sex == 'female':
        return 1
    if sex == 'male':
        return 0
    # Raising a bare string is itself a TypeError in Python 3 and would hide
    # this message, so raise a real exception and report the offending value.
    raise ValueError(f'Sex must be male or female, got {sex!r}')

def get_cleaned_dataset(uncleaned_set):
    """Return a numeric, model-ready copy of a raw Titanic passenger table.

    The input frame is not modified. The result keeps ``Pclass``, ``SibSp``
    and ``Parch`` as they are, encodes ``Sex`` as 1 (female) / 0 (male), and
    replaces ``Embarked`` with the indicator columns ``Embarked_Q`` and
    ``Embarked_S``.

    Args:
        uncleaned_set: DataFrame with the Kaggle Titanic columns. ``Survived``
            is optional, so both labelled and unlabelled data are accepted.

    Returns:
        A new DataFrame with the same index and only feature columns; the
        label column ``Survived`` is removed when present.

    Raises:
        ValueError: If any ``Sex`` value is not ``'male'`` or ``'female'``.
    """
    sex_codes = {'female': 1, 'male': 0}

    # Validate up front so a bad value fails loudly instead of becoming NaN.
    if (~uncleaned_set['Sex'].isin(list(sex_codes))).any():
        raise ValueError('Sex must be male or female')

    cleaned_set = uncleaned_set.copy()
    # Vectorised lookup; unlike a row-wise apply this also works on empty frames.
    cleaned_set['Sex'] = uncleaned_set['Sex'].map(sex_codes).astype(int)

    # Embarked_C is deliberately omitted: Embarked_Q = 0 and Embarked_S = 0
    # means the passenger embarked at location C.
    # NOTE: a missing Embarked value also compares unequal to both 'Q' and 'S',
    # so such rows are encoded exactly like location C.
    cleaned_set['Embarked_Q'] = (uncleaned_set['Embarked'] == 'Q').astype(int)
    cleaned_set['Embarked_S'] = (uncleaned_set['Embarked'] == 'S').astype(int)

    # todo: handle age and fare next (both have missing values)
    columns_to_drop = ['PassengerId', 'Embarked', 'Ticket', 'Cabin', 'Name', 'Age', 'Fare']
    if 'Survived' in uncleaned_set.columns:
        # The label must never leak into the feature matrix.
        columns_to_drop.append('Survived')

    return cleaned_set.drop(columns=columns_to_drop)


# Build the feature matrix and the label vector from the same training rows,
# so both share the same index.
training_set = get_cleaned_dataset(uncleaned_training_set)
training_targets = uncleaned_training_set['Survived']

# Display the cleaned training features.
training_set


Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked_Q,Embarked_S
207,3,0,0,0,0,0
215,1,1,1,0,0,0
370,1,0,1,0,0,0
31,1,1,1,0,0,0
532,3,0,1,1,0,0
...,...,...,...,...,...,...
173,3,0,0,0,0,1
48,3,0,2,0,0,0
879,1,1,0,1,0,0
832,3,0,0,0,0,0


In [7]:
# NOTE: the integer/missing-value check below is wrapped in a string literal,
# so none of it executes; the notebook only echoes the text back as the cell
# output. Remove the surrounding triple quotes to actually run the check.
"""# check to make sure all cells have integer values
check_data_set = training_set

columns = list(check_data_set.columns)
missing_data = False

for column in columns:
    column_data = check_data_set[column].tolist()
    if any((numpy.isnan(elem) or not isinstance(elem, int)) for elem in column_data):
        missing_data = True
        
missing_data"""

'# check to make sure all cells have integer values\ncheck_data_set = training_set\n\ncolumns = list(check_data_set.columns)\nmissing_data = False\n\nfor column in columns:\n    column_data = check_data_set[column].tolist()\n    if any((numpy.isnan(elem) or not isinstance(elem, int)) for elem in column_data):\n        missing_data = True\n        \nmissing_data'

In [8]:
def predict(model, feature_set):
    """Return a copy of ``feature_set`` extended with the model's predictions.

    Args:
        model: A fitted classifier exposing ``predict`` and ``predict_proba``.
        feature_set: DataFrame of features in the layout the model was fit on.

    Returns:
        A new DataFrame (the input is left untouched) with two columns appended
        on the right: ``Survival_Prediction`` holding the predicted labels and
        ``Survival_Probability_Prediction`` holding column 1 of the
        ``predict_proba`` output.
    """
    predicted_labels = model.predict(feature_set)
    second_class_probability = model.predict_proba(feature_set)[:, 1]

    augmented = feature_set.copy()
    new_columns = (
        ("Survival_Prediction", predicted_labels),
        ("Survival_Probability_Prediction", second_class_probability),
    )
    for column_name, column_values in new_columns:
        # insert() at the current width appends on the right and refuses to
        # overwrite a column that already exists.
        augmented.insert(loc=augmented.shape[1], column=column_name, value=column_values)
    return augmented


In [9]:
# set up model to predict survival from training features
# Fit a logistic-regression classifier that predicts survival from the cleaned
# training features. fit() is the last expression, so the notebook echoes the
# fitted estimator.
logistic_regression_model = LogisticRegression(random_state=0)
logistic_regression_model.fit(training_set, training_targets)

LogisticRegression(random_state=0)

In [10]:
# One coefficient per feature, in the column order of training_set:
# Pclass, Sex, SibSp, Parch, Embarked_Q, Embarked_S.
logistic_regression_model.coef_ # weights from an earlier run = [-0.89414006,  2.62642792, -0.20833906, -0.03170161, -0.17377863, -0.48691802]
# The exact values change whenever the train/validation split changes, but in both the earlier run and
# the output below Sex has by far the largest weight, i.e. the passenger's sex is the main determinant
# of the model's survival prediction.

array([[-0.81816987,  2.56530084, -0.23775671, -0.01118989, -0.01584067,
        -0.5636224 ]])

In [11]:
# Fit a random forest on the same cleaned training features and labels, for
# comparison against the logistic-regression model.
random_forest_model = RandomForestClassifier(random_state=0)
random_forest_model.fit(training_set, training_targets)

RandomForestClassifier(random_state=0)

In [12]:
logistic_regression_model.score(training_set, training_targets) # accuracy on training set (.8020 in an earlier run; varies with the split)

0.7865168539325843

In [13]:
random_forest_model.score(training_set, training_targets) # accuracy on training set (.8399 in an earlier run; varies with the split)

0.8370786516853933

In [14]:
# Clean the held-out rows with the same function used for the training rows,
# and keep their labels separately for scoring.
validation_set = get_cleaned_dataset(uncleaned_validation_set)
validation_targets = uncleaned_validation_set['Survived']

In [15]:
# Logistic-regression predictions for the held-out rows, with the true label
# attached alongside for a row-by-row comparison.
logistic_validation_predictions = predict(logistic_regression_model, validation_set)
logistic_validation_predictions['Survived_Actual'] = validation_targets

logistic_validation_predictions

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked_Q,Embarked_S,Survival_Prediction,Survival_Probability_Prediction,Survived_Actual
486,1,1,1,0,0,1,1,0.871114,1
217,2,0,1,0,0,1,0,0.186544,0
741,1,0,1,0,0,1,0,0.341987,0
205,3,1,0,1,0,1,1,0.622712,0
335,3,0,0,0,0,1,0,0.113746,0
...,...,...,...,...,...,...,...,...,...
546,2,1,1,0,0,1,1,0.748886,1
494,3,0,0,0,0,1,0,0.113746,0
278,3,0,4,1,1,0,0,0.078170,0
787,3,0,4,1,1,0,0,0.078170,0


In [16]:
logistic_regression_model.score(validation_set, validation_targets) # accuracy on validation set (.7654 in an earlier run; varies with the split)

0.7988826815642458

In [17]:
# Random-forest predictions for the held-out rows, with the true label attached
# alongside for a row-by-row comparison.
random_forest_predictions = predict(random_forest_model, validation_set)
random_forest_predictions['Survived_Actual'] = validation_targets

random_forest_predictions

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked_Q,Embarked_S,Survival_Prediction,Survival_Probability_Prediction,Survived_Actual
486,1,1,1,0,0,1,1,1.000000,1
217,2,0,1,0,0,1,0,0.000000,0
741,1,0,1,0,0,1,0,0.422128,0
205,3,1,0,1,0,1,1,0.982220,0
335,3,0,0,0,0,1,0,0.132724,0
...,...,...,...,...,...,...,...,...,...
546,2,1,1,0,0,1,1,0.998947,1
494,3,0,0,0,0,1,0,0.132724,0
278,3,0,4,1,1,0,0,0.103833,0
787,3,0,4,1,1,0,0,0.103833,0


In [18]:
random_forest_model.score(validation_set, validation_targets) # accuracy on validation set (.8212 in an earlier run; varies with the split)

0.7988826815642458

In [19]:
# Load the test file; as the preview shows, it has no Survived column.
uncleaned_test_set = pandas.read_csv('./data/test.csv')
uncleaned_test_set

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S
