In [3]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the data
train_df = pd.read_csv('titanic_train.csv')
test_df = pd.read_csv('titanic_test.csv')

# Feature columns fed to the model
X = ['Pclass','Sex','Age','Fare','Parch']

# Stack the train and test features (train rows first) before imputing missing
# values, in case their distributions are slightly different.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index=True gives every row a unique label; without it train and test
# rows share labels 0..n, and any label-based assignment touches rows in both.
combo_X = pd.concat([train_df[X], test_df[X]], ignore_index=True)

# Fare has very few missing values (one, per the original note), so fill
# them with the median fare of the combined data.
combo_X['Fare'] = combo_X['Fare'].fillna(combo_X['Fare'].median())

In [None]:
# Build an empirical probability distribution over the observed ages.
# value_counts() skips NaN by default, so only known ages contribute.
age_counts = combo_X['Age'].value_counts()

# One row per distinct age: the age, how often it occurs, and its relative frequency.
ages_probabilities = pd.DataFrame({
    'Age': age_counts.index.values,
    'Count': age_counts.values,
}, columns=['Age', 'Count'])
ages_probabilities['Probability'] = ages_probabilities['Count'] / ages_probabilities['Count'].sum()

input_ages_list = ages_probabilities['Age'].values.tolist()
props_ages_list = ages_probabilities['Probability'].values.tolist()

# Draw one replacement age per missing value, weighted by observed frequency.
# The probabilities must be passed as the keyword `p`: the third positional
# argument of np.random.choice is `replace`, not `p`.
n_missing_ages = int(combo_X['Age'].isnull().sum())
newAges = np.random.choice(input_ages_list, size=n_missing_ages, p=props_ages_list)

In [4]:
# Fill the null Age values with the ages sampled from the empirical distribution
age_is_null = pd.isnull(combo_X['Age'])
AgeNulls = combo_X[age_is_null]

# Assign by row position in one step instead of chained `combo_X['Age'].loc[...]`
# indexing: chained assignment may write to a temporary copy, and positional
# access stays correct even if index labels are duplicated.
# As before, at most len(newAges) of the null rows are filled.
null_positions = np.flatnonzero(age_is_null.values)[:len(newAges)]
if len(null_positions):
    age_col = combo_X.columns.get_loc('Age')
    combo_X.iloc[null_positions, age_col] = np.asarray(newAges)[:len(null_positions)]

# NOTE: this is an alias, not a copy -- the encoding below also changes combo_X.
combo_X_imputed = combo_X
# XGBoost doesn't (yet) handle categorical features automatically, so we need to change
# them to columns of integer values.
# See http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing for more
# details and options
le = LabelEncoder()
combo_X_imputed['Sex'] = le.fit_transform(combo_X_imputed['Sex'])

In [None]:
# Prepare the inputs for the model.
# The combined frame holds the train rows first, followed by the test rows,
# so it is split back by position. `.values` replaces DataFrame.as_matrix(),
# which was removed in pandas 1.0.
n_train = train_df.shape[0]
train_X = combo_X_imputed.iloc[:n_train].values
test_X = combo_X_imputed.iloc[n_train:].values
train_y = train_df['Survived']

# Fit a gradient-boosted classifier on the training rows and predict the test rows
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(train_X, train_y)
predictions = gbm.predict(test_X)

# One predicted label per test passenger, keyed by PassengerId
submission = pd.DataFrame({ 'PassengerId': test_df['PassengerId'],
                            'Survived': predictions })
print(submission)
#submission.to_csv("submission.csv", index=False)