# Dependencies

In [None]:
# !pip install -q pandas
# !pip install -q --upgrade pip
# !pip install -q matplotlib
# !pip install -q sagemaker

# Imports

In [None]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import pickle
import sagemaker
from sagemaker.xgboost import XGBoost
from sagemaker.mxnet import MXNet
from sagemaker import get_execution_role
from sagemaker.serializers import CSVSerializer
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri

sns.set()

%matplotlib inline

In [None]:
# Read the training and test CSV files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

## Exploratory Data Analysis

In [None]:
# Preview the first five rows of the training frame.
train_data.head()

In [None]:
# List the training columns together with their dtypes.
train_data.info()

In [None]:
# Descriptive statistics of the training frame.
train_data.describe()

In [None]:
# Count the missing entries in every column.
train_data.isna().sum()

### Auxiliary Functions

In [None]:
#Helpful functions to quickly plot stuff - credits to https://github.com/mngaonkar

def plot_bar_graph(feature, train_data=None):
    """Plot survivor and non-survivor counts for each value of ``feature``.

    Args:
        feature: Name of the column whose values are counted.
        train_data: Frame containing a 0/1 ``Survived`` column and ``feature``.
            When omitted, the notebook-level ``train_data`` is looked up at
            call time, so the plot follows later rebinding of that variable
            instead of the frame that existed when this cell was run.

    Returns:
        None. A grouped bar chart is drawn as a side effect.
    """
    if train_data is None:
        # Resolve the default lazily; a `train_data=train_data` default would
        # be evaluated once, when the function is defined.
        train_data = globals()['train_data']

    survived = train_data[train_data['Survived'] == 1][feature].value_counts()
    dead = train_data[train_data['Survived'] == 0][feature].value_counts()
    df = pd.DataFrame([survived, dead])
    df.index = ['Survived', 'Dead']
    ax = df.plot(kind='bar', stacked=False, figsize=(10, 5),
                 title='Survival counts by {}'.format(feature))
    ax.set_ylabel('Passengers')
    
def plot_feature_count(feature, train_data=None):
    """Print and plot how often each value of ``feature`` occurs.

    Args:
        feature: Name of the column whose values are counted.
        train_data: Frame containing ``feature``. When omitted, the
            notebook-level ``train_data`` is looked up at call time rather
            than being frozen when this cell is run.

    Returns:
        None. The counts are printed and a bar chart is drawn as side effects.
    """
    if train_data is None:
        # Resolve the default lazily; a `train_data=train_data` default would
        # be evaluated once, when the function is defined.
        train_data = globals()['train_data']

    count = train_data[feature].value_counts()
    print(count)
    df = pd.DataFrame([count])
    df.index = [feature]
    df.plot(kind='bar', title='Value counts for {}'.format(feature))

In [None]:
# Survival counts split by passenger sex.
plot_bar_graph(feature='Sex')

In [None]:
# Survival counts split by port of embarkation.
plot_bar_graph(feature='Embarked')

In [None]:
# Survival counts split by ticket class.
plot_bar_graph(feature='Pclass')

## Feature Engineering 

In [None]:
# Put the label in column 0: the CSV files written later carry no header,
# so the label is identified purely by its position.
first_column = train_data.pop('Survived')
train_data.insert(loc=0, column='Survived', value=first_column)
train_data.head()

In [None]:
# Names, passenger ids and ticket numbers are not expected to carry much signal.
train_data.drop(columns=['Name', 'PassengerId', 'Ticket'], inplace=True)

# Cabin is left out as well for now.
train_data.drop(columns=['Cabin'], inplace=True)
train_data.head()

In [None]:
# Discard the rows that have no port of embarkation.
train_data = train_data.dropna(subset=['Embarked'])

# Fill missing ages with the mean age of the remaining training rows;
# mean_age is reused later to fill the test set.
mean_age = train_data['Age'].mean()
train_data = train_data.assign(Age=train_data['Age'].fillna(mean_age))

In [None]:
# Re-count the missing entries per column after the clean-up above.
train_data.isna().sum()

In [None]:
# Convert age to whole numbers (the fractional part is dropped, not rounded).
train_data['Age'] = train_data['Age'].astype('int64')

# Convert fare to whole numbers in the same way.
train_data['Fare'] = train_data['Fare'].astype('int64')

In [None]:
# Replace the Sex and Embarked strings with integer codes. The same encoder
# object is refit for each column, so afterwards it only holds the mapping
# of the last column processed.
label_encoder = LabelEncoder()
for categorical_column in ('Sex', 'Embarked'):
    train_data[categorical_column] = label_encoder.fit_transform(train_data[categorical_column])

In [None]:
# Placeholder S3 bucket and key prefix used for the training job output path.
bucket, prefix = 'your-bucket', 'your_prefix'

In [None]:
# Shuffle once with a fixed seed, then cut into 70% train / 20% validation /
# 10% hold-out. Positional slicing is used instead of np.split because
# np.split on a DataFrame goes through DataFrame.swapaxes, which recent
# pandas releases deprecate.
shuffled_data = train_data.sample(frac=1, random_state=23)
train_end = int(0.7 * len(train_data))
validation_end = int(0.9 * len(train_data))

train_xgboost = shuffled_data.iloc[:train_end]
validation_xgboost = shuffled_data.iloc[train_end:validation_end]
test_xgboost = shuffled_data.iloc[validation_end:]


In [None]:
# Write each split without header or index, since XGBoost does not need them.
for split_frame, csv_name in (
    (train_xgboost, 'train_xgboost.csv'),
    (validation_xgboost, 'validation_xgboost.csv'),
    (test_xgboost, 'test_xgboost.csv'),
):
    split_frame.to_csv(csv_name, header=False, index=False)

In [None]:
# Upload the prepared files to S3 through a SageMaker session and keep the
# returned locations for the training channels.
sagemaker_session = sagemaker.Session()
train_path = sagemaker_session.upload_data('train_xgboost.csv', key_prefix='dataset')
validation_path = sagemaker_session.upload_data('validation_xgboost.csv', key_prefix='dataset')
# NOTE(review): this uploads the raw 'test.csv', not the 'test_xgboost.csv'
# hold-out split written earlier - confirm which one is intended.
test_path = sagemaker_session.upload_data('test.csv', key_prefix='test')
print(validation_path)

In [None]:
# Look up the XGBoost container image for the region of the current boto3 session.
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=boto3.Session().region_name,
    version='latest',
)


In [None]:
# IAM role handed to the training job and the endpoint.
role = get_execution_role()

In [None]:
# Wrap each uploaded S3 location as a CSV input channel.
s3_input_train, s3_input_validation, s3_input_test = (
    sagemaker.inputs.TrainingInput(s3_data=s3_uri, content_type='csv')
    for s3_uri in (train_path, validation_path, test_path)
)

In [None]:
# Training job definition: one ml.m5.xlarge spot instance, artifacts written
# under s3://<bucket>/<prefix>/output. max_run and max_wait bound the job
# (presumably in seconds - confirm against the SageMaker SDK docs).
estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sagemaker_session,
    use_spot_instances=True,
    max_run=300,
    max_wait=360,
)

# Binary classifier that outputs a probability, trained for 25 boosting rounds.
xgb_hyperparameters = {
    'eta': 0.1,
    'objective': 'binary:logistic',
    'num_round': 25,
}
estimator.set_hyperparameters(**xgb_hyperparameters)

In [None]:
# Launch training with the train and validation channels.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
estimator.fit(training_channels)

In [None]:
# Location of the trained model as reported by the estimator.
estimator.model_data

In [None]:
# Deploy a real-time inference endpoint that accepts CSV payloads.
# NOTE(review): nothing in this notebook removes the endpoint afterwards -
# presumably it should be deleted once predictions are done; confirm.
endpoint_settings = dict(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    serializer=CSVSerializer(),
)
xgb_predictor = estimator.deploy(**endpoint_settings)

In [None]:
# Mirror the training pre-processing on the test set: drop names, passenger
# ids, ticket numbers and cabin.
# UPDATE: Cabin seems to impact accuracy a lot.
test_data.drop(columns=['Name', 'PassengerId', 'Ticket', 'Cabin'], inplace=True)
test_data.head()

In [None]:
# Count the missing entries in every test column.
test_data.isna().sum()

In [None]:
# Replace the Sex and Embarked strings with integer codes.
# NOTE(review): the encoder is refit on the test data, so the codes match the
# training codes only if both sets contain the same categories - confirm.
label_encoder = LabelEncoder()
for categorical_column in ('Sex', 'Embarked'):
    test_data[categorical_column] = label_encoder.fit_transform(test_data[categorical_column])

In [None]:
# Check the test frame after label encoding.
test_data.head()

In [None]:
# Fill missing test ages with the mean age computed on the training set.
test_data = test_data.fillna(value={'Age': mean_age})

In [None]:
# The test set has one missing fare; fill it with the training-set mean fare.
mean_fare = train_data['Fare'].mean()
test_data = test_data.fillna(value={'Fare': mean_fare})

In [None]:
# Convert age to whole numbers (the fractional part is dropped, not rounded).
test_data['Age'] = test_data['Age'].astype('int64')

# Convert fare to whole numbers in the same way.
test_data['Fare'] = test_data['Fare'].astype('int64')

In [None]:
# Dimensions of the pre-processed test frame (rows, columns).
test_data.shape


In [None]:
# Keep the name of the deployed endpoint at hand.
endpoint_name = xgb_predictor.endpoint_name

In [None]:
# Final look at the features that will be sent to the endpoint.
test_data.head()

In [None]:
def predict(data, rows=500, predictor=None):
    """Score ``data`` against an endpoint in batches and return the scores.

    Args:
        data: 2-D NumPy array with one feature row per sample.
        rows: Batch sizing hint; the data is split into
            ``int(n_rows / rows + 1)`` roughly equal batches.
        predictor: Object whose ``predict(batch)`` returns UTF-8 encoded,
            comma-separated scores. Defaults to the notebook-level
            ``xgb_predictor``.

    Returns:
        1-D float array with one score per input row, in input order. An
        empty array is returned when ``data`` has no rows.
    """
    if predictor is None:
        predictor = xgb_predictor

    batch_count = int(data.shape[0] / float(rows) + 1)
    responses = [
        predictor.predict(batch).decode("utf-8")
        for batch in np.array_split(data, batch_count)
        # array_split yields empty batches when batch_count exceeds the row
        # count; never send those to the endpoint.
        if batch.shape[0] > 0
    ]
    if not responses:
        return np.array([])

    return np.fromstring(",".join(responses), sep=",")


# Send every column: after the drops above the test frame holds only feature
# columns (Survived is what is being predicted), so slicing off column 0
# would remove a feature the model was trained on.
predictions = predict(test_data.to_numpy())


In [None]:
# Raw scores returned by predict(), before thresholding.
print(predictions)

In [None]:
def transform(val, threshold=0.7):
    """Turn a model score into a 0/1 class label.

    Args:
        val: Score produced by the model.
        threshold: Cut-off; scores strictly greater than it map to 1.
            Defaults to 0.7.

    Returns:
        1 when ``val > threshold``, otherwise 0.
    """
    return 1 if val > threshold else 0
        
# Convert every score into its 0/1 label.
predictions = [transform(score) for score in predictions]
print(predictions)

In [None]:
# Reload the raw test file to recover PassengerId (dropped during
# pre-processing) and pair it with the predicted labels.
test_data = pd.read_csv('test.csv')
submission = test_data[['PassengerId']].assign(Survived=predictions)
submission.to_csv("submission.csv", index=False)