In [640]:
import pandas as pd
import numpy as np

# Load the training + validation sets
test_csv = pd.read_csv("test.csv")
train_csv = pd.read_csv("train.csv")

train_csv

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [641]:
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn import impute

def preprocess_data(data: pd.DataFrame) -> pd.DataFrame:
    processed = data.copy(deep=True)

    # Drop columns that have no correlation
    processed.drop(["Ticket", "Name"], axis=1, inplace=True)

    # Convert gender
    processed.replace(to_replace={"Sex": "male"}, value=0.0, inplace=True)
    processed.replace(to_replace={"Sex": "female"}, value=1.0, inplace=True)

    # Invert the classes
    processed.replace(to_replace={"Pclass": "1"}, value=2, inplace=True)
    processed.replace(to_replace={"Pclass": "2"}, value=1, inplace=True)
    processed.replace(to_replace={"Pclass": "3"}, value=0, inplace=True)

    # Convert cabin number to an integer
    def normalize_cabin(letter):
        if isinstance(letter, int) or isinstance(letter, float):
            return letter
        
        letter_map = {
            "A": 1000,
            "B": 2000,
            "C": 3000,
            "D": 4000, 
            "E": 5000,
            "F": 6000,
            "G": 7000,
        }
        split_letter = letter.split(" ")[0]
        cabin = int(letter_map.get(split_letter[0].upper(), 0))
        # room = int(split_letter[1:]) if split_letter[1:] else 0
        room = 0
        
        # Convert cabin number to an int and then normalize
        return cabin + room

    processed["Cabin"] = pd.Series([normalize_cabin(c) for c in processed["Cabin"]])

    # Convert Cabin
    processed.replace(to_replace={"Embarked": "S"}, value=1, inplace=True)
    processed.replace(to_replace={"Embarked": "C"}, value=2, inplace=True)
    processed.replace(to_replace={"Embarked": "Q"}, value=3, inplace=True)

    # Fill in all NaN with -1
    # processed.fillna(-1, inplace=True)

    # Impute missing values + Scale the data
    imputer = impute.KNNImputer(n_neighbors=2, weights="distance")
    scaler = preprocessing.MinMaxScaler()

    normalized_data = scaler.fit_transform(imputer.fit_transform(processed.to_numpy()))
    scaled_processed = pd.DataFrame(normalized_data, columns=processed.columns)
    scaled_processed["PassengerId"] = processed["PassengerId"]

    return scaled_processed

# Process the data
train_data = preprocess_data(train_csv)
test_data = preprocess_data(test_csv)

# Split the data into inputs/output
# Available Features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Cabin", "Embarked"]

features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Cabin", "Embarked"]
categorical_features = [True, True, False, False, False, False, False, True]

train_xdata = train_data[features]
train_ydata = train_data["Survived"]

test_xdata = test_data[features]
test_xdata


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1.0,0.0,0.452723,0.000,0.000000,0.015282,0.215997,1.0
1,1.0,1.0,0.617566,0.125,0.000000,0.013663,0.216412,0.0
2,0.5,0.0,0.815377,0.000,0.000000,0.018909,0.232188,1.0
3,1.0,0.0,0.353818,0.000,0.000000,0.016908,0.222638,0.0
4,1.0,1.0,0.287881,0.125,0.111111,0.023984,0.228361,0.0
...,...,...,...,...,...,...,...,...
413,1.0,0.0,0.422500,0.000,0.000000,0.015713,0.500000,0.0
414,0.0,1.0,0.512066,0.000,0.000000,0.212559,0.333333,0.5
415,1.0,0.0,0.505473,0.000,0.000000,0.014151,0.500000,0.0
416,1.0,0.0,0.472713,0.000,0.000000,0.015713,0.500000,0.0


In [655]:
from sklearn.ensemble import *
from sklearn.neural_network import *
from sklearn.naive_bayes import *
from sklearn.svm import *

from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import *

def save_predictions(passenger_ids, survived, filename):
    """
    Save the passenger_ids and survival predictions to the given 
    """
    pd.DataFrame({"PassengerId": passenger_ids, "Survived": survived}).to_csv(f"solutions/{filename}", index=False)

def run_evaluation(model_type, x, y, test_x):
    """
    Evaluate a model type against
    """
    # Create 20% training/validation split
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.15, random_state=0)

    model = model_type.fit(x_train, y_train)
    model_name = type(model).__name__

    # Check validation accuracy on split training data
    y_training = model.predict(x_train).astype(np.int32)
    y_predictions = model.predict(x_val).astype(np.int32)
    training_accuracy = metrics.accuracy_score(y_train.to_numpy(), y_training)
    validation_accuracy = metrics.accuracy_score(y_val.to_numpy(), y_predictions)
    print(f"{model_name}\n\tTraining Accuracy: {training_accuracy}\n\tValidation Accuracy: {validation_accuracy}")

    return {
        "model_name": model_name,
        "model": model,
        "training_accuracy": training_accuracy,
        "validation_accuracy": validation_accuracy,
        "predictions": model.predict(test_x)
    }


# params = { 'max_depth': [3, 6, 10],
#            'learning_rate': [0.01, 0.05, 0.],
#            'n_estimators': [10, 20, 50],
#         #    'colsample_bytree': [0.3, 0.7],
#            'reg_alpha': [0, 1E-5],
#            'reg_lambda': [0, 1E-5],
#            }

# estimator = XGBClassifier(objective="multi:softmax", num_class=2, colsample_bytree=0.7)
# model = GridSearchCV(estimator=estimator, 
#                    param_grid=params,
#                    scoring='neg_mean_squared_error', 
#                    verbose=1)

# Instead of Grid Search - just select a model and manually tune parameters
# model = XGBClassifier(
#     objective="binary:logistic",
#     n_estimators=30,
#     learning_rate=0.1,
#     max_depth=10,
#     reg_alpha=1E-5,
#     reg_lambda=1E-5,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     gamma = 0.0,
#     scale_pos_weight = 1,
#     min_child_weight = 1,
# )

model = HistGradientBoostingClassifier(
    learning_rate=0.15,
    # categorical_features=categorical_features,
    # max_bins=32, 
    max_depth=5,
    max_iter=200, 
    l2_regularization=1E-3,
)

results = run_evaluation(model, train_xdata, train_ydata, test_xdata)

# Run the model on the Kaggle Test data
save_predictions(test_data["PassengerId"], results["predictions"], f"{results['model_name']}.csv")

HistGradientBoostingClassifier
	Training Accuracy: 0.9815059445178336
	Validation Accuracy: 0.8582089552238806
