In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# read both splits from disk; the paths are relative to the working directory
data_test, data_train = (
    pd.read_csv(csv_path)
    for csv_path in ('datasets/test.csv', 'datasets/train.csv')
)

In [3]:
# preview the first three rows of the raw training frame
data_train.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
# preview the first three rows of the raw test frame
data_test.head(n=3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q


In [5]:
# select relevant features by column name rather than by position: the train
# frame carries an extra 'Survived' column, so positional indices differ by one
# between the two frames and are easy to get out of sync when edited by hand
feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']
X_train = data_train[feature_cols]
X_test = data_test[feature_cols]

In [6]:
# check the selected training features
X_train.head(n=3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.25,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.925,,S


In [7]:
# check the selected test features
X_test.head(n=3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,34.5,0,0,7.8292,,Q
1,3,female,47.0,1,0,7.0,,S
2,2,male,62.0,0,0,9.6875,,Q


In [8]:
# target labels of the training data (the 'Survived' column)
y_train = data_train['Survived']

In [9]:
# preprocess on independent copies so the selected frames stay unmodified
X_train_prep, X_test_prep = X_train.copy(), X_test.copy()

In [10]:
# list the training columns stored with dtype 'object'
s = X_train.dtypes.eq('object')
object_cols = s.index[s.values].tolist()
print('Columns with type string: ', object_cols)

Columns with type string:  ['Sex', 'Cabin', 'Embarked']


In [11]:
# list every training column that has at least one missing entry
hasnull_cols = X_train.columns[X_train.isnull().any().values].tolist()
print('Columns contain null values: ', hasnull_cols)

Columns contain null values:  ['Age', 'Cabin', 'Embarked']


In [12]:
# share of missing entries per affected column, expressed in percent
X_train[hasnull_cols].isnull().sum().div(len(X_train)).mul(100)

Age         19.865320
Cabin       77.104377
Embarked     0.224467
dtype: float64

In [13]:
# Cabin is missing for most rows, so remove the feature from both frames
X_train_prep = X_train_prep.drop(columns='Cabin')
X_test_prep = X_test_prep.drop(columns='Cabin')

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [15]:
# imputer with library-default settings
imputer = SimpleImputer()
# Dense one-hot encoder that ignores categories unseen during fit. The keyword
# requesting dense output was renamed from `sparse` to `sparse_output` in newer
# scikit-learn releases, so try the new spelling first and fall back to the old
# one when the installed version does not accept it.
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [16]:
# imputation for Age: the imputer is fitted on the training set only and then
# reused for the test set. Assign the flattened array directly instead of
# wrapping it in a new Series: a fresh Series gets a 0..n-1 index and is aligned
# by label on assignment, which would silently produce NaN if the frame's index
# were ever not the default range.
X_train_prep['Age'] = imputer.fit_transform(X_train[['Age']]).ravel()
X_test_prep['Age'] = imputer.transform(X_test[['Age']]).ravel()

In [17]:
# Replace missing Embarked entries with the literal string 'nan' so that every
# value in the column is a string
null_embarked_idx_train = X_train.index[X_train['Embarked'].isnull().values]
null_embarked_idx_test = X_test.index[X_test['Embarked'].isnull().values]
X_train_prep.loc[null_embarked_idx_train, 'Embarked'] = 'nan'
X_test_prep.loc[null_embarked_idx_test, 'Embarked'] = 'nan'

In [18]:
# one-hot encoding for Sex and Embarked; the encoder is fitted on the training
# set only. The encoded frames reuse the source frames' index so that a
# column-wise concat with those frames lines rows up by label even when the
# index is not the default 0..n-1 range.
OH_X_train_cols = pd.DataFrame(
    enc.fit_transform(X_train_prep[['Sex', 'Embarked']]),
    index=X_train_prep.index,
)
OH_X_test_cols = pd.DataFrame(
    enc.transform(X_test_prep[['Sex', 'Embarked']]),
    index=X_test_prep.index,
)

In [19]:
# Sex and Embarked are now represented by their one-hot columns: remove the raw
# columns, then append the encoded ones side by side
X_train_prep = X_train_prep.drop(columns=['Sex', 'Embarked'])
X_test_prep = X_test_prep.drop(columns=['Sex', 'Embarked'])
OH_X_train = pd.concat([X_train_prep, OH_X_train_cols], axis='columns')
OH_X_test = pd.concat([X_test_prep, OH_X_test_cols], axis='columns')

In [20]:
# in the test set, the Fare column has null values; fill them with an arbitrary value.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: the in-place form mutates an intermediate object and is not guaranteed
# to update OH_X_test (pandas warns about it, and under Copy-on-Write it has no
# effect on the frame).
OH_X_test['Fare'] = OH_X_test['Fare'].fillna(0)

In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [22]:
# use XGBoost model
model = XGBRegressor()
model.fit(OH_X_train, y_train)
# The regressor returns unbounded floats, so plain rounding could yield labels
# outside {0, 1} (e.g. -1 or 2). Clip to the valid range first, then round to
# the nearest int and convert from float to int.
# NOTE(review): a classifier would be the more natural fit for a 0/1 target;
# kept as a regressor here to leave the model choice unchanged.
raw_pred = model.predict(OH_X_test)
y_pred = np.round(np.clip(raw_pred, 0, 1)).astype('int64')



  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [23]:
# save the result file: one 'Survived' prediction per test PassengerId
output = pd.DataFrame({'Survived': y_pred}, index=data_test['PassengerId'])
# to_csv does not create missing directories, so make sure the target exists
os.makedirs('output', exist_ok=True)
output.to_csv('output/output.csv')