In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the training and testing CSV files into two DataFrames.
train_data, test_data = map(
    pd.read_csv,
    ("../input/titanic/train.csv", "../input/titanic/test.csv"),
)

In [3]:
# Per-column flag: True where the training frame has at least one missing value.
train_data.isnull().any(axis=0)

In [4]:
# Choose the feature columns.
# Name and PassengerId are dropped as uninformative; Cabin is dropped because
# it is missing too many values.
dropped_columns = ['Name', 'PassengerId', 'Cabin']
train_data = train_data.drop(columns=dropped_columns)
test_data = test_data.drop(columns=dropped_columns)

# Show the distinct values of every remaining training column.
for col in train_data.columns:
    print(col)
    print(train_data[col].unique())

In [5]:
# Pre-process data
# Embarked: {nan, S, C, Q} => {0, 1, 2, 3}
def switch_Embarked(x):
    """Encode an Embarked value as an integer.

    Returns 1 for "S", 2 for "C", 3 for "Q", and 0 for anything else
    (including missing values).
    """
    # Codes follow the position of the port letter, starting at 1.
    for code, port in enumerate(("S", "C", "Q"), start=1):
        if x == port:
            return code
    return 0

# Replace the Embarked column with its integer code in both frames.
for frame in (train_data, test_data):
    frame["Embarked"] = frame["Embarked"].apply(switch_Embarked)

In [6]:
# Ticket: some tickets contain alphabetical characters.
# Parse ticket values so that all values are numeric.
def parse(x):
    """Extract the numeric part of a ticket string.

    The ticket is split on single spaces and the last token is taken as the
    ticket number. A ticket without a usable number maps to 0.

    Parameters:
        x: ticket string, e.g. "A/5 21171" or "113803".

    Returns:
        The last space-separated token as an int, or 0 when that token is not
        a valid integer.
    """
    tokens = x.split(" ")
    candidate = tokens[-1]
    # A lone non-numeric token has no number at all.
    if len(tokens) == 1 and not candidate.isnumeric():
        return 0
    try:
        return int(candidate)
    except ValueError:
        # A prefixed ticket whose last token is not an integer (or is empty
        # because of a trailing space) is treated like a lone non-numeric
        # token instead of aborting the whole column conversion.
        return 0

# Convert the Ticket column to its numeric part in both frames.
for frame in (train_data, test_data):
    frame["Ticket"] = frame["Ticket"].apply(parse)

In [7]:
# Switching feature "Sex" from (male, female) to (1,0)
def switch_Sex(x):
    """Encode a Sex value: 1 for "male", 0 for "female", -1 for anything else."""
    if x == "male":
        return 1
    return 0 if x == "female" else -1
    
# Replace the Sex column with its integer code in both frames.
for frame in (train_data, test_data):
    frame["Sex"] = frame["Sex"].apply(switch_Sex)

In [8]:
# Fill any remaining missing values with the mean of their column.
# The filled column is assigned back to the frame: calling
# fillna(..., inplace=True) on `frame[col]` mutates a selected Series, which
# pandas warns about in 2.x and which does not update the frame once
# Copy-on-Write is enabled.
# NOTE(review): each frame is imputed with its own column means; consider
# reusing the training means for the test frame.
for col in train_data:
    train_data[col] = train_data[col].fillna(train_data[col].mean())
for col in test_data:
    test_data[col] = test_data[col].fillna(test_data[col].mean())

In [9]:
# Re-check which training columns still contain missing values after imputation.
train_data.isnull().any(axis=0)

In [10]:
# Split the training frame into the label column and the feature columns.
# The label is selected with a list so it stays a one-column DataFrame.
train_X = train_data.drop(columns=['Survived'])
train_y = train_data.loc[:, ['Survived']]

# Convert the frames to numpy arrays.
X_train, y_train, X_test = (
    frame.to_numpy() for frame in (train_X, train_y, test_data)
)

In [11]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

# Fix the seed so that Restart & Run All rebuilds the same tree and therefore
# the same predictions; scikit-learn documents that features are randomly
# permuted at each split unless random_state is set.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

In [12]:
# Predict a label for every row of the test feature array.
output = model.predict(X_test)

In [13]:
# Pair every prediction with a passenger id, giving a two-column array
# (id, prediction).
# NOTE(review): ids are generated as 892, 893, ... in row order; this assumes
# the test rows are consecutive passengers starting at 892 — confirm against
# the PassengerId column of test.csv.
n_rows = np.shape(output)[0]
index_array = np.arange(892, 892 + n_rows, dtype=float).reshape(n_rows, 1)
output = np.hstack((index_array, np.reshape(output, (n_rows, 1))))

In [14]:
# Wrap the (id, prediction) array in an integer DataFrame and write it to
# output.csv without the row index.
submission_columns = ['PassengerId', 'Survived']
output_df = pd.DataFrame(data=output, columns=submission_columns, dtype=int)
output_df.to_csv('output.csv', index=False)

In [15]:
# Accuracy on the training data itself (not a held-out set).
# Adding the prediction to the label gives 2 where both are 1 and 0 where both
# are 0 (assumes labels are 0/1); those two counts are the correct predictions.
out = model.predict(X_train).reshape(-1, 1) + y_train
true_pos = np.count_nonzero(out == 2)
true_neg = np.count_nonzero(out == 0)
(true_pos + true_neg) / len(out)