In [None]:
# library imports
import numpy as np
import pandas as pd
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from copy import copy

In [None]:
# data imports
# single place to change when the Kaggle files live somewhere else
DATA_DIR = 'C:/Users/Andrew Mark/Google Drive/Projects/Kaggle/titanic'

train_original = pd.read_csv(f'{DATA_DIR}/train.csv')
test_original = pd.read_csv(f'{DATA_DIR}/test.csv')
answers = pd.read_csv(f'{DATA_DIR}/gender_submission.csv')

# work on independent copies so the *_original frames stay exactly as loaded;
# a plain `train = train_original` would only create a second name for the
# same object, and every later in-place edit would also change train_original
train = train_original.copy()
test = test_original.copy()

train_original.head()

In [None]:
# attach a Survived column to the test dataset, taken row-by-row (by position,
# not by index) from the answers frame
# NOTE(review): answers is read from gender_submission.csv - presumably Kaggle's
# sample submission rather than ground truth; confirm before treating it as labels
survived_values = answers['Survived'].to_numpy()
test['Survived'] = survived_values

In [None]:
# train dataset info: prints the summary produced by DataFrame.info()
# (the call itself returns None, so the cell shows only the printed text)
train_original.info()

In [None]:
# count the null/empty values in each column of train
train_null_counts = train_original.isna().sum()
train_null_counts

In [None]:
# test dataset info: prints the summary produced by DataFrame.info()
# (the call itself returns None, so the cell shows only the printed text)
test_original.info()

In [None]:
# count the null/empty values in each column of test
test_null_counts = test_original.isna().sum()
test_null_counts

In [None]:
# data cleansing/pre-processing
#
# NOTE: this cell rebinds `train` and `test` to new, fully numeric frames.
# It needs the raw columns, so re-run the cells above before re-running it.

N_BINS = 5  # number of equal-width bins used for both Fare and Age
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}
SEX_CODES = {'female': 0, 'male': 1}
# Cabin is missing in too many rows, so it is dropped; the other columns are
# ignored for now, aka laziness (Fare and Age are replaced by their bins)
COLUMNS_DROP = ['Cabin', 'PassengerId', 'Name', 'Ticket', 'SibSp', 'Parch', 'Fare', 'Age']


def _open_ended_edges(values, n_bins):
    """Return the edges of `n_bins` equal-width bins spanning `values`.

    The two outer edges are widened to -inf / +inf so that data outside the
    range of `values` (e.g. a test row beyond the train range) still lands in
    the first or last bin instead of becoming NaN.
    """
    _, edges = pd.cut(values, n_bins, retbins=True)
    edges[0], edges[-1] = -np.inf, np.inf
    return edges


def _encode(frame, fills, fare_edges, age_edges):
    """Return a cleaned, all-numeric copy of `frame`.

    frame      -- DataFrame with the raw Age, Fare, Embarked, Sex columns
                  plus every column listed in COLUMNS_DROP
    fills      -- {column: value} used to replace missing entries
    fare_edges -- bin edges applied to Fare -> integer column NewFare
    age_edges  -- bin edges applied to Age  -> integer column NewAge

    Bins are stored as integer codes (labels=False). One-hot encoding with
    get_dummies yields one column per bin, which cannot be stored in a single
    column.
    """
    return (
        frame
        .fillna(fills)
        .assign(
            NewFare=lambda d: pd.cut(d['Fare'], fare_edges, labels=False),
            NewAge=lambda d: pd.cut(d['Age'], age_edges, labels=False),
            Embarked=lambda d: d['Embarked'].map(EMBARKED_CODES).astype(int),
            Sex=lambda d: d['Sex'].map(SEX_CODES).astype(int),
        )
        .drop(columns=COLUMNS_DROP)
    )


# fill values are learned from train only and reused for test, so that no
# statistic of the test set leaks into the preprocessing:
#   Embarked -> mode ('S'), Age -> median, Fare -> median
fills = {
    'Embarked': train['Embarked'].mode()[0],
    'Age': train['Age'].median(),
    'Fare': train['Fare'].median(),
}

# bin edges are likewise learned from train only; applying the same edges to
# test guarantees that a given bin code means the same range in both frames
fare_edges = _open_ended_edges(train['Fare'].fillna(fills['Fare']), N_BINS)
age_edges = _open_ended_edges(train['Age'].fillna(fills['Age']), N_BINS)

train = _encode(train, fills, fare_edges, age_edges)
test = _encode(test, fills, fare_edges, age_edges)

# check null counts and columns
print('\n\tTrain Dataset')
print(train.isnull().sum())
print('\n\tTest Dataset')
print(test.isnull().sum())

In [None]:
# final check: preview the first three rows of the processed train frame
# every feature should be an integer by now - RandomForest needs that to work
# (requirements of other algorithms have not been checked)
train.iloc[:3]

In [None]:
# separate features and labels for train and test datasets
feature_columns = ['Pclass', 'Sex', 'Embarked', 'NewFare', 'NewAge']
label_column = 'Survived'

# same column selection for both frames: X_* = features, y_* = labels
X_train, y_train = train[feature_columns], train[label_column]
X_test, y_test = test[feature_columns], test[label_column]

In [None]:
# initialize and train RandomForest
# a fixed seed makes the forest - and with it y_pred and the printed score -
# identical on every Restart & Run All
SEED = 42
rf = RandomForestClassifier(n_estimators = 100, random_state = SEED)
rf.fit(X_train, y_train)

# predict labels of test dataset
y_pred = rf.predict(X_test)

# comparing predicted labels against the labels held in y_test
# NOTE(review): y_test is filled from gender_submission.csv, presumably not the
# true outcomes - confirm before reading this number as real accuracy
print("Prediction Accuracy: ", metrics.accuracy_score(y_test, y_pred))

In [None]:
# build the submission table: PassengerId from the untouched test frame,
# paired positionally with the predicted labels
submit = test_original[['PassengerId']].assign(Survived=y_pred)

# show the first five rows, then write the table without the index column
print(submit.iloc[:5])
submit.to_csv('C:/Users/Andrew Mark/Google Drive/Projects/Kaggle/submission.csv', index = False)