In [135]:
import numpy as np
import pandas as pd

In [136]:
# Load the training set; the relative path means "train.csv" must sit in the
# notebook's working directory.
df = pd.read_csv("train.csv")

def clean_data(df):
    """Return a model-ready copy of a raw passenger frame.

    Drops the ``Ticket``, ``Name``, ``SibSp``, ``Parch``, ``Cabin`` and
    ``Embarked`` columns, encodes ``Sex`` as 0 for ``'male'`` and 1 for
    ``'female'`` (any other value is left as it is), and renames that column
    to ``IsFemale``.

    The frame passed in is NOT modified, so the raw data stays available and
    the function can safely be called again on the same object. Callers must
    use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the six dropped columns and a ``Sex`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the remaining columns in their original order,
        with ``Sex`` replaced by ``IsFemale``.
    """
    unused_columns = ['Ticket', 'Name', 'SibSp', 'Parch', 'Cabin', 'Embarked']

    # drop() hands back a new frame, so the caller's object keeps its columns.
    cleaned = df.drop(unused_columns, axis=1)

    # A single dict-based replace covers both encodings in one pass.
    cleaned['Sex'] = cleaned['Sex'].replace({'male': 0, 'female': 1})
    return cleaned.rename(columns={'Sex': 'IsFemale'})

# Run the cleaning step on the training frame, then preview its first 20 rows.
df = clean_data(df)
df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,IsFemale,Age,Fare
0,1,0,3,0,22.0,7.25
1,2,1,1,1,38.0,71.2833
2,3,1,3,1,26.0,7.925
3,4,1,1,1,35.0,53.1
4,5,0,3,0,35.0,8.05
5,6,0,3,0,,8.4583
6,7,0,1,0,54.0,51.8625
7,8,0,3,0,2.0,21.075
8,9,1,3,1,27.0,11.1333
9,10,1,2,1,14.0,30.0708


In [137]:
# Linear regression to fill in missing ages
from sklearn import linear_model

age_prediction_columns = ['Pclass', 'IsFemale', 'Fare']

# Fit only on passengers whose age is known: NaN > 0 evaluates to False, so
# this comparison filters the missing ages out of the training rows.
complete_dataset = df[df['Age'] > 0]
X = complete_dataset[age_prediction_columns]
Y = complete_dataset[['Age']]
age_regressor = linear_model.LinearRegression()
age_regressor.fit(X, Y)

# Estimate every missing age with one predict() call instead of a per-row loop
# (Series.reshape and DataFrame.ix are not available in current pandas).
# The guard avoids calling predict() on zero rows when nothing is missing.
missing_age = df['Age'].isnull()
if missing_age.any():
    # Y was fitted as a one-column frame, so predict() returns shape (n, 1);
    # ravel flattens it to line up with the selected rows.
    df.loc[missing_age, 'Age'] = np.ravel(
        age_regressor.predict(df.loc[missing_age, age_prediction_columns])
    )

df.iloc[0:20]

Unnamed: 0,PassengerId,Survived,Pclass,IsFemale,Age,Fare
0,1,0,3,0,22.0,7.25
1,2,1,1,1,38.0,71.2833
2,3,1,3,1,26.0,7.925
3,4,1,1,1,35.0,53.1
4,5,0,3,0,35.0,8.05
5,6,0,3,0,26.057135,8.4583
6,7,0,1,0,54.0,51.8625
7,8,0,3,0,2.0,21.075
8,9,1,3,1,27.0,11.1333
9,10,1,2,1,14.0,30.0708


In [143]:
# Train a neural network to estimate survival

from sklearn.neural_network import MLPClassifier

survival_prediction_columns = ['Pclass', 'IsFemale', 'Age', 'Fare']

X = df[survival_prediction_columns]
# 1-D label vector. DataFrame.as_matrix() was removed from pandas; .values
# yields the same array and works on old and new versions alike.
y = df['Survived'].values

# NOTE(review): per the scikit-learn docs, learning_rate is only used when
# solver='sgd', so it has no effect with lbfgs; it is kept so the model
# configuration stays exactly as before.
survival_classifier = MLPClassifier(solver='lbfgs', alpha=0.001, hidden_layer_sizes=(100, 100), random_state=404, learning_rate='adaptive')
survival_classifier.fit(X, y)

# This is accuracy on the very rows the model was fitted on, so it is not an
# estimate of performance on unseen passengers.
print(survival_classifier.score(X, y))

0.793490460157


In [150]:
# Make estimations

# clean_data already renames 'Sex' to 'IsFemale', so no second rename is needed.
test_data = clean_data(pd.read_csv("test.csv"))

# Fill a missing fare first: Fare is an input to the age regressor, so it must
# be present before any age is estimated.
fallback_fare = 10  # same placeholder value the notebook has always used
test_data['Fare'] = test_data['Fare'].fillna(fallback_fare)

# Estimate missing ages in one call; the guard avoids predict() on zero rows.
missing_age = test_data['Age'].isnull()
if missing_age.any():
    # ravel flattens the regressor output whether it is (n, 1) or (n,).
    test_data.loc[missing_age, 'Age'] = np.ravel(
        age_regressor.predict(test_data.loc[missing_age, age_prediction_columns])
    )

# Classify all passengers at once; predict() needs a 2-D input, which the
# column-selected frame provides.
test_data['Survived'] = survival_classifier.predict(test_data[survival_prediction_columns])



In [151]:
# Output
# Write the submission from a derived frame rather than deleting columns from
# test_data: the cell can then be re-run without a KeyError and the feature
# columns remain available for inspection.
submission = test_data.drop(['Pclass', 'IsFemale', 'Age', 'Fare'], axis=1)

submission.to_csv('submission.csv', index=False)