In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for file_name in names_in_folder:
        print(os.path.join(folder, file_name))
import matplotlib.pyplot as plt

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the training and testing CSV files (training file first, then testing)
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/titanic/train.csv", "../input/titanic/test.csv")
)

In [3]:
# Show, for each column of the training data, whether it contains any missing value
train_data.isna().any()

In [4]:
# Decide feature parameters
# Drop PassengerId since it's not informative, and Cabin since it's missing too many values.
# Name is kept at this point: a later cell parses it.
unused_columns = ['PassengerId', 'Cabin']
train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)
# Print every remaining training column followed by its distinct values
for col, column_values in train_data.items():
    print(col)
    print(column_values.unique())

In [5]:
# Pre-process data
# Embarked: {nan, S, C, Q} => {0, 1, 2, 3}
def switch_Embarked(x):
    """Encode an embarkation value as an integer.

    "S" -> 1, "C" -> 2, "Q" -> 3; every other value (a missing value
    included) -> 0.
    """
    for port, code in (("S", 1), ("C", 2), ("Q", 3)):
        if x == port:
            return code
    return 0

# Replace the Embarked values with their integer codes in both data sets
train_data["Embarked"] = train_data["Embarked"].map(switch_Embarked)
test_data["Embarked"] = test_data["Embarked"].map(switch_Embarked)

In [6]:
# Ticket: some tickets contain alphabetical characters
# Parse ticket values so that all values are numeric
def parseTicket2(x):
    """Extract the numeric part of a ticket value.

    The ticket is split on single spaces and its last token is read as the
    ticket number, e.g. "A/5 21171" -> 21171 and "113803" -> 113803.

    Parameters
    ----------
    x : str
        Raw ticket value.

    Returns
    -------
    int
        The ticket number, or 0 when no number can be read: a single
        non-numeric token such as "LINE", a last token that cannot be
        converted to an integer, or a non-string value such as NaN.
    """
    # Missing values reach this function as non-strings (e.g. float NaN)
    if not isinstance(x, str):
        return 0
    tokens = x.split(" ")
    number = tokens[-1]
    # A ticket made of one token only counts when that token is numeric
    if len(tokens) == 1 and not number.isnumeric():
        return 0
    try:
        return int(number)
    except ValueError:
        # e.g. "SC/PARIS LINE": report "no number" instead of aborting the apply
        return 0

def parseTicket1(x):
    """Return the first character of a ticket's prefix, or 0 when it has none.

    The ticket is split on single spaces. When it has several tokens, or its
    single token is purely alphabetic, the first character of the first token
    is returned (e.g. "A/5 21171" -> "A", "LINE" -> "L").

    Parameters
    ----------
    x : str
        Raw ticket value.

    Returns
    -------
    str or int
        A one-character string, or the integer 0 for tickets without a prefix
        (e.g. "113803"), tickets that start with a space, and non-string
        values such as NaN.
    """
    # Missing values reach this function as non-strings (e.g. float NaN)
    if not isinstance(x, str):
        return 0
    tokens = x.split(" ")
    prefix = tokens[0]
    # An empty first token (ticket starting with a space) has no first character
    if prefix and (len(tokens) > 1 or prefix.isalpha()):
        return prefix[0]
    return 0
    
# Split every ticket into a prefix letter (Ticket1) and a ticket number (Ticket2)
train_data["Ticket1"] = train_data["Ticket"].apply(parseTicket1)
train_data["Ticket2"] = train_data["Ticket"].apply(parseTicket2)
test_data["Ticket1"] = test_data["Ticket"].apply(parseTicket1)
test_data["Ticket2"] = test_data["Ticket"].apply(parseTicket2)
# Prefix letters seen in the training data, in order of first appearance.
# parseTicket1 returns the integer 0 for tickets without a prefix; that sentinel
# is filtered out by type rather than by a hard-coded position, because the
# position of 0 in unique() depends on the order of the rows.
TicketList = np.array(
    [prefix for prefix in train_data["Ticket1"].unique() if isinstance(prefix, str)],
    dtype=object,
)
# Replace each prefix letter by its 1-based position in TicketList (0 stays "no prefix")
for i in range(len(TicketList)):
    train_data.loc[train_data["Ticket1"] == TicketList[i], "Ticket1"] = i+1
    test_data.loc[test_data["Ticket1"] == TicketList[i], "Ticket1"] = i+1
# Test data might have other values, treat it as missing values
test_data["Ticket1"] = pd.to_numeric(test_data["Ticket1"],errors='coerce').fillna(0)

In [7]:
# Create groups

In [8]:
# Sorted distinct Ticket1 values present in the training data
unique_Ticket1 = np.sort(train_data["Ticket1"].unique())

In [9]:
# One-hot encode Ticket1: column "Ticket1_<i>" is 1.0 on the rows whose Ticket1
# value equals unique_Ticket1[i] and 0.0 elsewhere.
# Testing for equality gives every value found in the training data its own
# column. Building half-open intervals between consecutive values instead
# leaves the first and last columns empty and the two largest values without
# any indicator.
for i, code in enumerate(unique_Ticket1):
    new_column_name = "Ticket1_" + str(i)
    for frame in (train_data, test_data):
        frame[new_column_name] = (frame["Ticket1"] == code).astype(float)

In [10]:
# Display the sorted distinct Fare values of the training data
np.sort(train_data["Fare"].unique())

In [11]:
# Fare splits {10, 20, 30, 50, 100}
fare_splits = [10, 20, 30, 50, 100]
# The five split points delimit six half-open bins that cover every fare:
#   (-inf, 10), [10, 20), [20, 30), [30, 50), [50, 100), [100, inf)
# Column "Fare_<i>" is 1.0 on the rows whose fare lies in bin i and 0.0 elsewhere.
# The [50, 100) bin and the value 100 itself must be included, otherwise those
# fares end up with no indicator at all.
# A missing fare compares False against every bound, so it gets 0.0 in all columns.
fare_edges = [-np.inf] + fare_splits + [np.inf]
for i in range(len(fare_edges) - 1):
    new_column_name = "Fare_" + str(i)
    lower, upper = fare_edges[i], fare_edges[i + 1]
    for frame in (train_data, test_data):
        in_bin = (frame["Fare"] >= lower) & (frame["Fare"] < upper)
        frame[new_column_name] = in_bin.astype(float)

In [12]:
# Display the sorted distinct Ticket2 values of the training data
np.sort(train_data["Ticket2"].unique())

In [13]:
# Ticket2 splits {10000, 20000, 100000, 200000, 300000}
Ticket2_split = [10000, 20000, 100000, 200000, 300000]
# The five split points delimit six half-open bins that cover every ticket number:
#   (-inf, 10000), [10000, 20000), [20000, 100000), [100000, 200000),
#   [200000, 300000), [300000, inf)
# Column "Ticket2_<i>" is 1.0 on the rows whose number lies in bin i and 0.0 elsewhere.
# The [200000, 300000) bin and the value 300000 itself must be included, otherwise
# those ticket numbers end up with no indicator at all.
Ticket2_edges = [-np.inf] + Ticket2_split + [np.inf]
for i in range(len(Ticket2_edges) - 1):
    new_column_name = "Ticket2_" + str(i)
    lower, upper = Ticket2_edges[i], Ticket2_edges[i + 1]
    for frame in (train_data, test_data):
        in_bin = (frame["Ticket2"] >= lower) & (frame["Ticket2"] < upper)
        frame[new_column_name] = in_bin.astype(float)

In [14]:
def parse_name(x):
    """Return the first token of a name that contains a period.

    The name is split on single spaces, e.g. "Braund, Mr. Owen Harris"
    yields "Mr.".

    Parameters
    ----------
    x : str
        Raw name value.

    Returns
    -------
    str or int
        The first token containing ".", or the integer 0 when the name has a
        single token or none of its tokens contains a period.
    """
    tokens = x.split(" ")
    if len(tokens) > 1:
        for token in tokens:
            if "." in token:
                return token
    # Explicit fallback: without it a multi-token name with no period would
    # yield None, which is neither a token nor the 0 used for single-token names
    return 0
# Reduce every name to the token returned by parse_name
train_data["Name"] = train_data["Name"].apply(parse_name)
test_data["Name"] = test_data["Name"].apply(parse_name)
# Distinct parsed values of the training data, in order of first appearance
nameList = train_data["Name"].unique()
# Replace each parsed value by its 1-based position in nameList, in both data sets
for name_code, parsed_name in enumerate(nameList, start=1):
    for frame in (train_data, test_data):
        frame.loc[frame["Name"] == parsed_name, "Name"] = name_code
# Test data might have other values, treat it as missing values
test_data["Name"] = pd.to_numeric(test_data["Name"],errors='coerce').fillna(0)

In [15]:
# Sorted distinct Name codes present in the training data
unique_name = np.sort(train_data["Name"].unique())

In [16]:
# One-hot encode Name: column "Name_<i>" is 1.0 on the rows whose Name code
# equals unique_name[i] and 0.0 elsewhere.
# Testing for equality gives every code found in the training data its own
# column. Building half-open intervals between consecutive codes instead
# leaves the last column empty and the two largest codes without any indicator.
# Rows whose code is not in unique_name (e.g. the 0 used for unseen test
# values) get 0.0 in every column.
for i, code in enumerate(unique_name):
    new_column_name = "Name_" + str(i)
    for frame in (train_data, test_data):
        frame[new_column_name] = (frame["Name"] == code).astype(float)

In [17]:
# Switching feature "Sex" from (male, female) to (1,0)
def switch_Sex(x):
    """Encode a sex value as an integer.

    "male" -> 1, "female" -> 0, anything else -> -1.
    """
    return 1 if x == "male" else (0 if x == "female" else -1)
    
# Replace the Sex values with their integer codes in both data sets
train_data["Sex"] = train_data["Sex"].map(switch_Sex)
test_data["Sex"] = test_data["Sex"].map(switch_Sex)

In [18]:
# Remove the raw and intermediate columns from both data sets.
# NOTE(review): Embarked is removed here although no indicator columns were
# derived from it in the cells shown - confirm this is intended.
raw_columns = ['Ticket', 'Ticket1', 'Ticket2', 'Name', 'Fare', 'Embarked']
train_data = train_data.drop(columns=raw_columns)
test_data = test_data.drop(columns=raw_columns)

In [19]:
# Just in case any column's missing data, fill nan with mean
# Each data set is filled with the means of its own columns.
# The result is assigned back to the column: calling fillna(inplace=True) on
# train_data[col] acts on an object selected from the frame, which under pandas
# Copy-on-Write leaves the frame itself unchanged.
for col in train_data:
    train_data[col] = train_data[col].fillna(train_data[col].mean())
for col in test_data:
    test_data[col] = test_data[col].fillna(test_data[col].mean())

In [20]:
# Combine SibSp and Parch into a single Family column (their sum), then
# remove the two source columns
train_data = train_data.assign(
    Family=train_data["SibSp"] + train_data["Parch"]
).drop(columns=['SibSp', 'Parch'])
test_data = test_data.assign(
    Family=test_data["SibSp"] + test_data["Parch"]
).drop(columns=['SibSp', 'Parch'])

In [21]:
# Show, for each remaining column of the training data, whether any missing value is left
train_data.isna().any()

In [22]:
# Separate the Survived column (kept as a one-column frame) from the features
train_X = train_data.drop(columns=['Survived'])
train_y = train_data[['Survived']]
feature_names = train_X.columns
# Converting dataframe to numpy array
X_train, y_train, X_test = (
    frame.to_numpy() for frame in (train_X, train_y, test_data)
)

In [23]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
# Define the model
# A fixed random_state makes the forest, and therefore the scores and fitted
# estimators returned below, reproducible from one run to the next.
model = RandomForestClassifier(random_state=42)
# y_train is a column vector of shape (n_samples, 1); it is flattened because
# scikit-learn expects a 1-D target for single-output classification.
# return_estimator=True keeps the estimator fitted on each fold in result['estimator'].
result = cross_validate(model, X_train, y_train.ravel(), return_estimator=True)

In [24]:
# Keep the fold estimator that obtained the highest cross-validation test score
max_index = np.argmax(result['test_score'])
fold_estimators = result['estimator']
model = fold_estimators[max_index]

In [25]:
# Horizontal bar chart of the model's feature importances, in ascending order
importances = model.feature_importances_
sorted_index = np.argsort(importances)
plt.barh(feature_names[sorted_index], importances[sorted_index])

In [26]:
# Run the model on the test feature matrix; the predictions are stored in output
output = model.predict(X_test)

In [27]:
# Pair every prediction with an id: the first row gets 892 and each following
# row the next integer (assumes the test rows are still in their original
# order and that their ids start at 892 - TODO confirm)
n_predictions = np.shape(output)[0]
index_array = (892 + np.arange(n_predictions, dtype=float)).reshape(n_predictions, 1)
# Two-column array: ids on the left, predictions on the right
output = np.hstack((index_array, np.reshape(output, (n_predictions, 1))))

In [28]:
# Wrap the two-column array in an integer data frame and write it as CSV
# without the row index
submission_columns = ['PassengerId', 'Survived']
output_df = pd.DataFrame(output, columns=submission_columns, dtype=int)
output_df.to_csv('output.csv', index=False)

In [29]:
# Add the training predictions (as a column vector) to the training labels.
# Assuming the Survived labels are 0/1, the sum is 2 where prediction and label
# are both 1 and 0 where both are 0.
out = model.predict(X_train).reshape(-1, 1) + y_train
true_pos = np.count_nonzero(out == 2)
true_neg = np.count_nonzero(out == 0)
# Share of training rows on which prediction and label agree
(true_pos + true_neg) / len(out)