Setting up

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics

import os
print(os.listdir("../input"))

training_file = "../input/titanic/train.csv"
testing_file = "../input/titanic/test.csv"
control_file = "../input/titanic/gender_submission.csv"

['titanic']


In [2]:
training_data = pd.read_csv(training_file)
testing_data = pd.read_csv(testing_file)
control_data = pd.read_csv(control_file)
print(training_data)
print(testing_data)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

Data preprocessing

In [3]:
# remove extranoues features from data
def data_drop(df,x):
    df=df.drop("Ticket", axis=1)
    df=df.drop("Name", axis=1)
    if x == 0:
        df=df.drop("PassengerId", axis=1)
    #df=df.drop("Fare", axis=1)
    df=df.drop("Cabin", axis=1)
    #df=df.drop("Age", axis=1)
    return df

control_testing = data_drop(testing_data, 1)
training_data = data_drop(training_data,0)
testing_data = data_drop(testing_data,0)
print(training_data)
print(testing_data)

     Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0           0       3    male  22.0      1      0   7.2500        S
1           1       1  female  38.0      1      0  71.2833        C
2           1       3  female  26.0      0      0   7.9250        S
3           1       1  female  35.0      1      0  53.1000        S
4           0       3    male  35.0      0      0   8.0500        S
..        ...     ...     ...   ...    ...    ...      ...      ...
886         0       2    male  27.0      0      0  13.0000        S
887         1       1  female  19.0      0      0  30.0000        S
888         0       3  female   NaN      1      2  23.4500        S
889         1       1    male  26.0      0      0  30.0000        C
890         0       3    male  32.0      0      0   7.7500        Q

[891 rows x 8 columns]
     Pclass     Sex   Age  SibSp  Parch      Fare Embarked
0         3    male  34.5      0      0    7.8292        Q
1         3  female  47.0      1      0   

In [4]:
# remove cells from the training set with missing relevant data
def data_check(df):
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df = df.dropna(axis=0)
    df = df.reset_index()
    df = df.drop("index", axis=1)
    return df
control_testing = data_check(control_testing)
training_data = data_check(training_data)
testing_data = data_check(testing_data)
print(training_data)
print(testing_data)

     Survived  Pclass     Sex        Age  SibSp  Parch     Fare Embarked
0           0       3    male  22.000000      1      0   7.2500        S
1           1       1  female  38.000000      1      0  71.2833        C
2           1       3  female  26.000000      0      0   7.9250        S
3           1       1  female  35.000000      1      0  53.1000        S
4           0       3    male  35.000000      0      0   8.0500        S
..        ...     ...     ...        ...    ...    ...      ...      ...
884         0       2    male  27.000000      0      0  13.0000        S
885         1       1  female  19.000000      0      0  30.0000        S
886         0       3  female  29.699118      1      2  23.4500        S
887         1       1    male  26.000000      0      0  30.0000        C
888         0       3    male  32.000000      0      0   7.7500        Q

[889 rows x 8 columns]
     Pclass     Sex       Age  SibSp  Parch      Fare Embarked
0         3    male  34.50000      0 

Convert alphanumeric data to numeric

In [5]:
def alpha_to_num(df):
    df['Sex'].mask(df['Sex'] == 'male', 0, inplace=True)
    df['Sex'].mask(df['Sex'] == 'female', 1, inplace=True)
    df['Embarked'].mask(df['Embarked'] == 'S', 0, inplace=True)
    df['Embarked'].mask(df['Embarked'] == 'C', 1, inplace=True)
    df['Embarked'].mask(df['Embarked'] == 'Q', 2, inplace=True)
    
    df['Embarked'] = df['Embarked'].astype('category')
    df['Sex'] = df['Sex'].astype('category')
    return df
control_testing = alpha_to_num(control_testing)
training_data = alpha_to_num(training_data)
testing_data = alpha_to_num(testing_data)
print(training_data)
print(testing_data)

     Survived  Pclass Sex        Age  SibSp  Parch     Fare Embarked
0           0       3   0  22.000000      1      0   7.2500        0
1           1       1   1  38.000000      1      0  71.2833        1
2           1       3   1  26.000000      0      0   7.9250        0
3           1       1   1  35.000000      1      0  53.1000        0
4           0       3   0  35.000000      0      0   8.0500        0
..        ...     ...  ..        ...    ...    ...      ...      ...
884         0       2   0  27.000000      0      0  13.0000        0
885         1       1   1  19.000000      0      0  30.0000        0
886         0       3   1  29.699118      1      2  23.4500        0
887         1       1   0  26.000000      0      0  30.0000        1
888         0       3   0  32.000000      0      0   7.7500        2

[889 rows x 8 columns]
     Pclass Sex       Age  SibSp  Parch      Fare Embarked
0         3   0  34.50000      0      0    7.8292        2
1         3   1  47.00000     

Training and Testing Decision Trees

In [6]:
x = training_data[['Pclass', 'Sex', 'Age','SibSp', 'Parch', 'Fare', 'Embarked']].to_numpy()
y = training_data[['Survived']].to_numpy().ravel()

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.15, random_state = 42)

testing_data = testing_data.to_numpy()
model = RandomForestClassifier(max_depth = 5, min_impurity_decrease=0.01, n_estimators = 501, bootstrap = False)
model = model.fit(X_train, y_train)
print("Model accuracy (training): " + str(round(model.score(X_test, y_test), 5)))
survival_predictions = model.predict(testing_data)

Model accuracy (training): 0.83582


Formatting and Comparing Results

In [7]:
results = pd.DataFrame(survival_predictions, columns = ['Survived'])
pass_id = control_testing["PassengerId"]
results = results.join(pass_id)
cols = results.columns.tolist()
cols = cols[-1:] + cols[:-1]
results = results[cols]
#print(results)

def fill_missing_range(df, field, range_from, range_to, range_step=1, fill_with=0):
    return df\
      .merge(how='right', on=field,
            right = pd.DataFrame({field:np.arange(range_from, range_to, range_step)}))\
      .sort_values(by=field).reset_index().fillna(fill_with).drop(['index'], axis=1)

results = fill_missing_range(results, "PassengerId", 892, 1310, 1, -1)
results["Survived"] = results["Survived"].astype(int)
print(results)

print("Model accuracy (predicted vs gendered): " + str(round(accuracy_score(control_data["Survived"], results["Survived"]), 5)))
results.to_csv("submission.csv", index=False)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
Model accuracy (predicted vs gendered): 0.83493
