In [576]:
import pandas as pd
import numpy as np

# Read the two input CSVs: the labelled training set and the unlabelled test set
train_csv = pd.read_csv("train.csv")
test_csv = pd.read_csv("test.csv")

# Display the raw training frame as this cell's output
train_csv

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [577]:
def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    """
    Convert a raw passenger frame into an all-numeric frame.

    The input frame is left untouched; a new frame with the same index is
    returned. Steps, in order:

    * drop the "Ticket" and "Name" columns;
    * encode "Sex" as 1.0 (male) / 0.0 (female), when the column is present;
    * encode "Cabin" as ``deck offset + room number`` (e.g. "C85" -> 3085);
    * encode "Embarked" as 0.0 (S) / 1.0 (C) / 2.0 (Q), when the column is present;
    * replace every remaining missing value with -1.

    Values of "Sex" / "Embarked" that are not one of the codes listed above
    become missing and therefore end up as -1.

    Args:
        data: Frame containing at least the "Ticket", "Name" and "Cabin" columns.

    Returns:
        The processed copy of ``data``.

    Raises:
        KeyError: if "Ticket", "Name" or "Cabin" is absent.
    """
    missing = -1  # sentinel used for every missing value in the output

    # Deck letter -> offset; the room number is added on top of it.
    deck_offsets = {
        "A": 1000,
        "B": 2000,
        "C": 3000,
        "D": 4000,
        "E": 5000,
        "F": 6000,
        "G": 7000,
    }

    # Column -> {raw category: numeric code}
    category_codes = {
        "Sex": {"male": 1.0, "female": 0.0},
        "Embarked": {"S": 0.0, "C": 1.0, "Q": 2.0},
    }

    def intify_cabin(cabin) -> int:
        """Encode one raw Cabin value as an integer."""
        if isinstance(cabin, (int, np.integer)):
            return int(cabin)  # already numeric: pass through unchanged
        if not isinstance(cabin, str) or not cabin.strip():
            return missing  # NaN / None / blank string

        # Entries may list several cabins ("B51 B53 B55"); only the first is used.
        first_cabin = cabin.split()[0]
        deck_offset = deck_offsets.get(first_cabin[0].upper(), 0)  # unknown deck -> 0
        room_digits = first_cabin[1:]
        # A bare deck letter ("T") or a non-numeric suffix contributes no room number.
        room = int(room_digits) if room_digits.isdecimal() else 0
        return deck_offset + room

    # ``drop`` returns a new frame, so the caller's data is never modified.
    processed = data.drop(columns=["Ticket", "Name"])

    for column, codes in category_codes.items():
        if column in processed.columns:
            processed[column] = processed[column].map(codes)

    # ``Series.map`` keeps the original index, so the encoded values stay aligned
    # with their rows even when the frame's index is not 0..n-1.
    processed["Cabin"] = processed["Cabin"].map(intify_cabin)

    # Whatever is still missing (e.g. Age, Fare, Embarked) becomes the sentinel.
    return processed.fillna(missing)

# Run both raw frames through the same preprocessing step
train_data, test_data = preprocess_data(train_csv), preprocess_data(test_csv)
print(train_data)

# Choose which columns are fed to the model.
# Available Features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Cabin", "Embarked"]
features = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Cabin",
    "Embarked",
]

# Inputs and target for training; the test frame has inputs only
train_xdata, train_ydata = train_data[features], train_data["Survived"]
test_xdata = test_data[features]


     PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Cabin  \
0              1         0       3  1.0  22.0      1      0   7.2500      0   
1              2         1       1  0.0  38.0      1      0  71.2833   3085   
2              3         1       3  0.0  26.0      0      0   7.9250      0   
3              4         1       1  0.0  35.0      1      0  53.1000   3123   
4              5         0       3  1.0  35.0      0      0   8.0500      0   
..           ...       ...     ...  ...   ...    ...    ...      ...    ...   
886          887         0       2  1.0  27.0      0      0  13.0000      0   
887          888         1       1  0.0  19.0      0      0  30.0000   2042   
888          889         0       3  0.0   0.0      1      2  23.4500      0   
889          890         1       1  1.0  26.0      0      0  30.0000   3148   
890          891         0       3  1.0  32.0      0      0   7.7500      0   

     Embarked  
0         0.0  
1         1.0  
2  

In [620]:
def save_predictions(passenger_ids, survived, filepath):
    """
    Write the predictions to ``filepath`` as a two-column CSV.

    The columns are "PassengerId" (from ``passenger_ids``) and "Survived"
    (from ``survived``); the frame's index is not written.
    """
    submission = pd.DataFrame({"PassengerId": passenger_ids, "Survived": survived})
    submission.to_csv(filepath, index=False)

def calculate_accuracy(y_test: pd.Series, y_pred) -> float:
    """
    Return the fraction of predictions that equal the true labels.

    Labels are compared by position; any pandas index on either input is
    ignored.

    Args:
        y_test: True labels (a Series or any array-like).
        y_pred: Predicted labels, in the same order as ``y_test``.

    Returns:
        The accuracy as a float between 0 and 1, or NaN when both inputs
        are empty.

    Raises:
        AssertionError: if the two inputs do not hold the same number of labels.
    """
    # Flatten to plain 1-D arrays so lists, Series and (n, 1) column vectors
    # are all compared element by element instead of being broadcast.
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()
    assert actual.size == predicted.size, (
        f"y_test has {actual.size} labels but y_pred has {predicted.size}"
    )

    if predicted.size == 0:
        return float("nan")  # accuracy is undefined without any samples

    mismatches = np.count_nonzero(actual != predicted)
    return float(1 - mismatches / predicted.size)

def run_evaluation(model_type, x, y, test_x, test_size=0.2, random_state=0):
    """
    Fit a model on a train/validation split and report its accuracy.

    ``x`` / ``y`` are split with ``train_test_split``; the model is fitted on
    the training part, scored on both parts, and then used to predict
    ``test_x``. The two accuracies are also printed.

    Args:
        model_type: Estimator instance exposing ``fit`` and ``predict``. It is
            fitted directly (not cloned), and ``fit`` is expected to return
            the fitted model.
        x: Feature data to split into training and validation parts.
        y: Labels matching ``x``.
        test_x: Unlabelled feature data to predict once the model is fitted.
        test_size: Fraction of ``x`` / ``y`` held out for validation
            (defaults to 0.2, i.e. 20%).
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible (defaults to 0).

    Returns:
        A dict with the keys "model_name", "model", "training_accuracy",
        "validation_accuracy" and "predictions" (the predictions for ``test_x``).
    """
    # Hold out part of the labelled data for validation
    x_train, x_val, y_train, y_val = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    model = model_type.fit(x_train, y_train)
    model_name = type(model).__name__

    # Score on the data the model was fitted on and on the held-out data;
    # a large gap between the two indicates over-fitting.
    training_accuracy = calculate_accuracy(y_train, model.predict(x_train))
    validation_accuracy = calculate_accuracy(y_val, model.predict(x_val))
    print(f"{model_name}\n\tTraining Accuracy: {training_accuracy}\n\tValidation Accuracy: {validation_accuracy}")

    return {
        "model_name": model_name,
        "model": model,
        "training_accuracy": training_accuracy,
        "validation_accuracy": validation_accuracy,
        "predictions": model.predict(test_x),
    }

from sklearn.ensemble import *
from sklearn.neural_network import *
from sklearn.naive_bayes import *
from sklearn.svm import *

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import *

# Alternative models tried earlier (uncomment one to switch):
# model_type = XGBClassifier(
#     n_estimators=100,
#     objective="binary:logistic",
#     max_depth=10,
# )
# model_type = GradientBoostingClassifier(
#     n_estimators=900,
#     learning_rate=0.1,
#     loss="exponential",
# )
# model_type = SVC()

# Active model: a neural network with four hidden layers of 10 units each
model_type = MLPClassifier(
    hidden_layer_sizes=(10, 10, 10, 10),
    solver="adam",
    alpha=1e-5,
    max_iter=1000,
    random_state=1,
)

# Fit, report training/validation accuracy, and predict the Kaggle test data
results = run_evaluation(
    model_type=model_type,
    x=train_xdata,
    y=train_ydata,
    test_x=test_xdata,
)

# Write the predictions for the Kaggle test data.
# NOTE(review): the file is still called "xgboost.csv" although the active
# model is an MLPClassifier; confirm the intended output name.
save_predictions(
    passenger_ids=test_data["PassengerId"],
    survived=results["predictions"],
    filepath="xgboost.csv",
)

MLPClassifier
	Training Accuracy: 0.7092696629213483
	Validation Accuracy: 0.7541899441340782
