In [78]:
import subprocess
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Default figure size (matplotlib `figure.figsize`, width x height) for seaborn plots.
sns.set(rc={'figure.figsize':(11.7,8.27)})

def load_train_data(split=True):
    """Read ``./train.csv`` (indexed by ``PassengerId``) and report completion.

    Args:
        split: When true, return the ``(features, target)`` pair produced by
            ``split_features_target`` for the "Survived" column; otherwise
            return the whole DataFrame.
    """
    label_column = "Survived"
    train_frame = pd.read_csv("./train.csv", index_col="PassengerId")
    print("load_train_data: done")

    if not split:
        return train_frame
    return split_features_target(train_frame, label_column)
    
def split_features_target(df, target="Survived"):
    """Separate a DataFrame into its feature columns and its target column.

    Args:
        df: Source DataFrame; must contain a column named ``target``.
        target: Name of the label column.

    Returns:
        A ``(features, target)`` pair: ``df`` restricted to every column whose
        name differs from ``target``, and the ``df[target]`` column.
    """
    feature_names = []
    for name in df.columns:
        if name != target:
            feature_names.append(name)
    return df[feature_names], df[target]

def load_x_test():
    """Read ``./test.csv`` into a DataFrame indexed by ``PassengerId``."""
    test_features = pd.read_csv("./test.csv", index_col="PassengerId")
    return test_features
    
# Save Kaggle submission file
def submission_df(y_pred):
    """Build the submission table for a vector of predictions.

    Args:
        y_pred: Array-like of predicted ``Survived`` values, one per row of the
            frame returned by ``load_x_test()`` and in the same order. NumPy
            arrays, lists and pandas Series are all accepted.

    Returns:
        A single-column ("Survived") DataFrame that reuses the test set's
        index.
    """
    X_test = load_x_test()
    # Strip any pandas index from y_pred first: handing a Series straight to
    # the DataFrame constructor re-aligns it on X_test.index, which silently
    # turns every prediction into NaN when the Series carries a 0..n-1 index.
    values = np.asarray(y_pred).ravel()
    return pd.DataFrame(values, index=X_test.index, columns=["Survived"])

def save_submission_file(y_pred, filename):
    """Write predictions to ``filename`` as a submission CSV.

    Args:
        y_pred: Predictions, forwarded to ``submission_df``.
        filename: Destination path. Relative paths resolve against the current
            working directory; absolute paths are used as given.

    Returns:
        True when the file was written, False when writing raised an error
        (the error is printed rather than propagated).
    """
    df = submission_df(y_pred)

    try:
        # The path is passed through untouched: a "./" prefix is a no-op for
        # relative names but would turn an absolute path such as
        # "/tmp/out.csv" into the relative ".//tmp/out.csv".
        df.to_csv(filename)
    except Exception as error:
        print("Couldn’t save submission.")
        # Show the cause so a failed save is diagnosable from the cell output.
        print(f"Reason: {error!r}")
        return False
    else:
        print("Submission saved.")
        return True
        
# Submit score to Kaggle
def submit_predictions(y_pred, filename, message):
    """Save predictions and submit them to the Kaggle "titanic" competition.

    Runs ``kaggle competitions submit -c titanic -f <filename> -m <message>``
    and prints what the CLI wrote to stdout. When the CLI exits with a
    non-zero status, the status and the captured stderr are printed as well.

    Args:
        y_pred: Predictions, forwarded to ``save_submission_file``.
        filename: Path of the submission file to write and upload.
        message: Submission description passed to the CLI via ``-m``.
    """
    save_submission_file(y_pred, filename)

    # Arguments are passed as a list (no shell), so filename and message need
    # no quoting.
    command = [
        "kaggle", "competitions", "submit",
        "-c", "titanic",
        "-f", filename,
        "-m", message,
    ]
    completed_process = subprocess.run(command, capture_output=True, text=True)

    print(completed_process.stdout)

    # capture_output=True also captures stderr; without this a failed
    # submission would be indistinguishable from an empty success.
    if completed_process.returncode != 0:
        print(f"kaggle exited with status {completed_process.returncode}")
        print(completed_process.stderr)
    
class DFSimpleImputer(SimpleImputer):
    """``SimpleImputer`` whose ``transform`` returns a pandas DataFrame.

    The result keeps the input's column labels and row index, so later
    pipeline steps can keep addressing columns by name.
    """

    def transform(self, X, y=None):
        """Impute ``X`` and wrap the result in a DataFrame.

        Args:
            X: DataFrame to impute; ``X.columns`` and ``X.index`` label the
                result.
            y: Unused.

        Returns:
            DataFrame of imputed values with ``X``'s columns and index.
        """
        imputed = super().transform(X)
        # Carry X's index over. Without it the result gets a fresh 0..n-1
        # index that no longer lines up with X (e.g. a cross-validation fold
        # or any frame indexed by PassengerId).
        return pd.DataFrame(imputed, columns=X.columns, index=X.index)

class DFOneHotEncoder(OneHotEncoder):
    """``OneHotEncoder`` whose ``transform`` returns a sparse-backed DataFrame."""

    def transform(self, X, y=None):
        """One-hot encode ``X`` and return the result as a labelled DataFrame.

        Args:
            X: DataFrame to encode; its column names seed the output feature
                names and its index is reused for the result.
            y: Unused.

        Returns:
            DataFrame with sparse columns, built from the sparse matrix
            returned by ``OneHotEncoder.transform``.
        """
        X_transformed = super().transform(X)
        # get_feature_names was deprecated in scikit-learn 1.0 and removed in
        # 1.2; use its replacement when present, fall back on older releases.
        if hasattr(self, "get_feature_names_out"):
            feature_names = self.get_feature_names_out(X.columns)
        else:
            feature_names = self.get_feature_names(X.columns)
        return pd.DataFrame.sparse.from_spmatrix(
            X_transformed, index=X.index, columns=feature_names
        )

def get_preprocessing_pipeline(variant):
    """Return a freshly built preprocessing transformer for ``variant``.

    Recognised names are "default", "default_no_dropped_features" and
    "default_no_dropped_features_log_poly"; any other value yields the
    default transformer.
    """
    if variant == "default_no_dropped_features":
        return preprocessing_pipeline_default_no_dropped_features()
    if variant == "default_no_dropped_features_log_poly":
        return preprocessing_pipeline_default_no_dropped_features_log_poly()
    # "default" and every unrecognised name end up here.
    return preprocessing_pipeline_default()
    
def get_pipeline(model, preprocessing_variant):
    """Chain the chosen preprocessing transformer with ``model``.

    Returns:
        A ``Pipeline`` whose steps are named "preprocessing" and "model".
    """
    preprocessing = get_preprocessing_pipeline(preprocessing_variant)
    steps = [("preprocessing", preprocessing), ("model", model)]
    return Pipeline(steps)
    
def compare_models(models, preprocessing_variant="default"):
    """Cross-validate each model on the training data and summarise accuracy.

    Args:
        models: Iterable of estimators; each is wrapped by ``get_pipeline``
            with the requested preprocessing before scoring.
        preprocessing_variant: Name forwarded to ``get_pipeline``.

    Returns:
        One dict per model, in input order, with the estimator's class name
        under "model" and a formatted "mean (+/- 2 * std)" string under
        "accuracy".
    """
    features, labels = load_train_data()

    def evaluate(estimator):
        candidate = get_pipeline(estimator, preprocessing_variant)
        fold_scores = cross_val_score(candidate, features, labels)
        summary = "Accuracy: %0.5f (+/- %0.2f)" % (fold_scores.mean(), fold_scores.std() * 2)
        return {"model": type(estimator).__name__, "accuracy": summary}

    return [evaluate(estimator) for estimator in models]

In [12]:
def preprocessing_pipeline_default():
    """Assemble the baseline preprocessing ``ColumnTransformer``.

    Column handling:
      * ``Sex`` / ``Embarked``: fill gaps with "Unknown", then one-hot encode.
      * ``Age``: add HasAge / IsYoungChild / IsInfant flags, mean-impute,
        bucket into AgeGroup, drop the raw Age column, one-hot encode the rest.
      * ``Name`` / ``Ticket``: handed to a transformer that drops both, so
        they contribute no output columns.
      * ``Fare``: mean-impute, ``log1p``, standard-scale.
      * ``Cabin``: add HasCabin / CabinLetter, fill gaps with "Unknown", drop
        the raw Cabin column, one-hot encode the rest.
      * every other column: passed through unchanged.
    """
    def unknown_filled_one_hot():
        # Shared recipe for plain categorical columns.
        return Pipeline([
            ('imputer', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
            ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore")),
        ])

    def generate_age_derived_features(X):
        # NaN compares False against any number, so a missing age yields 0
        # for both child flags.
        age = X["Age"]
        return X.assign(
            HasAge=age.notnull().astype("int64"),
            IsYoungChild=(age < 10).astype("int64"),
            IsInfant=(age < 1).astype("int64"),
        )

    def discretize_age(X):
        # Decade-wide buckets; include_lowest=True keeps an age of exactly 0
        # inside the first one.
        edges = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, np.inf]
        names = ["0+", "10+", "20+", "30+", "40+", "50+", "60+", "70+"]
        buckets = pd.cut(X["Age"], bins=edges, labels=names, include_lowest=True)
        return X.assign(AgeGroup=buckets)

    def drop_age(X):
        return X.drop(columns="Age")

    def attributes_dropper(df):
        return df.drop(columns=["Name", "Ticket"])

    def generate_cabin_derived_features(df):
        cabin = df["Cabin"]
        # CabinLetter is the first character of the cabin code, "U" if missing.
        letters = cabin.apply(lambda code: "U" if pd.isnull(code) else code[0])
        return df.assign(
            HasCabin=cabin.notnull().astype("int64"),
            CabinLetter=letters,
        )

    def drop_cabin(df):
        return df.drop(columns="Cabin")

    age_steps = [
        ("age_transformer", FunctionTransformer(generate_age_derived_features)),
        ('impute', DFSimpleImputer(strategy="mean")),
        ("discretize", FunctionTransformer(discretize_age)),
        ("drop", FunctionTransformer(drop_age)),
        ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore")),
    ]
    fare_steps = [
        ('impute_missing', DFSimpleImputer(strategy="mean")),
        ("log_transform", FunctionTransformer(np.log1p)),
        ('standard_scaler', StandardScaler()),
    ]
    cabin_steps = [
        ("generate_derived_features", FunctionTransformer(generate_cabin_derived_features)),
        ('impute', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
        ("drop", FunctionTransformer(drop_cabin)),
        ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer(
        transformers=[
            ("sex", unknown_filled_one_hot(), ["Sex"]),
            ("embarked", unknown_filled_one_hot(), ["Embarked"]),
            ("age", Pipeline(age_steps), ["Age"]),
            ("attributes_dropper", FunctionTransformer(attributes_dropper), ["Name", "Ticket"]),
            ("fare", Pipeline(fare_steps), ["Fare"]),
            ("cabin", Pipeline(cabin_steps), ["Cabin"]),
        ],
        remainder="passthrough",
    )

In [62]:
def preprocessing_pipeline_default_no_dropped_features():
    """Assemble the preprocessing ``ColumnTransformer`` that drops no columns.

    Column handling:
      * ``Sex`` / ``Embarked``: fill gaps with "Unknown", then one-hot encode.
      * ``Age``: add HasAge / IsYoungChild / IsInfant flags, mean-impute, add
        AgeGroup, then one-hot encode every column including the raw Age.
      * ``Name`` / ``Ticket``: fill gaps with "Unknown", then one-hot encode.
      * ``Fare``: mean-impute, ``log1p``, standard-scale.
      * ``Cabin``: add HasCabin / CabinLetter, fill gaps with "Unknown", then
        one-hot encode every column including the raw Cabin.
      * every other column: passed through unchanged.
    """
    def unknown_filled_one_hot():
        # Shared recipe for plain categorical columns.
        return Pipeline([
            ('imputer', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
            ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore")),
        ])

    def generate_age_derived_features(X):
        # NaN compares False against any number, so a missing age yields 0
        # for both child flags.
        age = X["Age"]
        return X.assign(
            HasAge=age.notnull().astype("int64"),
            IsYoungChild=(age < 10).astype("int64"),
            IsInfant=(age < 1).astype("int64"),
        )

    def discretize_age(X):
        # Decade-wide buckets; include_lowest=True keeps an age of exactly 0
        # inside the first one.
        edges = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, np.inf]
        names = ["0+", "10+", "20+", "30+", "40+", "50+", "60+", "70+"]
        buckets = pd.cut(X["Age"], bins=edges, labels=names, include_lowest=True)
        return X.assign(AgeGroup=buckets)

    def generate_cabin_derived_features(df):
        cabin = df["Cabin"]
        # CabinLetter is the first character of the cabin code, "U" if missing.
        letters = cabin.apply(lambda code: "U" if pd.isnull(code) else code[0])
        return df.assign(
            HasCabin=cabin.notnull().astype("int64"),
            CabinLetter=letters,
        )

    age_steps = [
        ("age_transformer", FunctionTransformer(generate_age_derived_features)),
        ('impute', DFSimpleImputer(strategy="mean")),
        ("discretize", FunctionTransformer(discretize_age)),
        ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore")),
    ]
    fare_steps = [
        ('impute_missing', DFSimpleImputer(strategy="mean")),
        ("log_transform", FunctionTransformer(np.log1p)),
        ('standard_scaler', StandardScaler()),
    ]
    cabin_steps = [
        ("generate_derived_features", FunctionTransformer(generate_cabin_derived_features)),
        ('impute', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
        ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer(
        transformers=[
            ("sex", unknown_filled_one_hot(), ["Sex"]),
            ("embarked", unknown_filled_one_hot(), ["Embarked"]),
            ("age", Pipeline(age_steps), ["Age"]),
            ("name_ticket_pipe", unknown_filled_one_hot(), ["Name", "Ticket"]),
            ("fare", Pipeline(fare_steps), ["Fare"]),
            ("cabin", Pipeline(cabin_steps), ["Cabin"]),
        ],
        remainder="passthrough",
    )

In [65]:
def preprocessing_pipeline_default_no_dropped_features_log_poly():
    """Assemble the "no dropped features" transformer with optional log/poly Age terms.

    Column handling:
      * ``Sex`` / ``Embarked``: fill gaps with "Unknown", then one-hot encode.
      * ``Age``: add HasAge / IsYoungChild / IsInfant flags, mean-impute, add
        AgeGroup, then one-hot encode every column including the raw Age.
      * ``Name`` / ``Ticket``: fill gaps with "Unknown", then one-hot encode.
      * ``Fare``: mean-impute, ``log1p``, standard-scale.
      * ``Cabin``: add HasCabin / CabinLetter, fill gaps with "Unknown", then
        one-hot encode every column including the raw Cabin.
      * every other column: passed through unchanged.

    The ``add_log_poly`` step (Age_Log, Age_Square) is defined but currently
    not part of the Age pipeline, so this builder produces the same steps as
    ``preprocessing_pipeline_default_no_dropped_features``.
    """
    def unknown_filled_one_hot():
        # Shared recipe for plain categorical columns.
        return Pipeline([
            ('imputer', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
            ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore")),
        ])

    def generate_age_derived_features(X):
        # NaN compares False against any number, so a missing age yields 0
        # for both child flags.
        age = X["Age"]
        return X.assign(
            HasAge=age.notnull().astype("int64"),
            IsYoungChild=(age < 10).astype("int64"),
            IsInfant=(age < 1).astype("int64"),
        )

    def discretize_age(X):
        # Decade-wide buckets; include_lowest=True keeps an age of exactly 0
        # inside the first one.
        edges = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, np.inf]
        names = ["0+", "10+", "20+", "30+", "40+", "50+", "60+", "70+"]
        buckets = pd.cut(X["Age"], bins=edges, labels=names, include_lowest=True)
        return X.assign(AgeGroup=buckets)

    def add_log_poly(X):
        # Returns a new frame instead of writing Age_Log / Age_Square into the
        # caller's DataFrame.
        return X.assign(
            Age_Log=np.log1p(X["Age"]),
            Age_Square=np.square(X["Age"]),
        )

    def generate_cabin_derived_features(df):
        cabin = df["Cabin"]
        # CabinLetter is the first character of the cabin code, "U" if missing.
        letters = cabin.apply(lambda code: "U" if pd.isnull(code) else code[0])
        return df.assign(
            HasCabin=cabin.notnull().astype("int64"),
            CabinLetter=letters,
        )

    age_steps = [
        ("age_transformer", FunctionTransformer(generate_age_derived_features)),
        ('impute', DFSimpleImputer(strategy="mean")),
        ("discretize", FunctionTransformer(discretize_age)),
        # NOTE(review): the log/poly step is disabled. If it is re-enabled at
        # this position, Age_Log and Age_Square would be fed to the one-hot
        # encoder below together with every other column.
        # ("add_log_poly", FunctionTransformer(add_log_poly)),
        ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore")),
    ]
    fare_steps = [
        ('impute_missing', DFSimpleImputer(strategy="mean")),
        ("log_transform", FunctionTransformer(np.log1p)),
        ('standard_scaler', StandardScaler()),
    ]
    cabin_steps = [
        ("generate_derived_features", FunctionTransformer(generate_cabin_derived_features)),
        ('impute', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
        ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer(
        transformers=[
            ("sex", unknown_filled_one_hot(), ["Sex"]),
            ("embarked", unknown_filled_one_hot(), ["Embarked"]),
            ("age", Pipeline(age_steps), ["Age"]),
            ("name_ticket_pipe", unknown_filled_one_hot(), ["Name", "Ticket"]),
            ("fare", Pipeline(fare_steps), ["Fare"]),
            ("cabin", Pipeline(cabin_steps), ["Cabin"]),
        ],
        remainder="passthrough",
    )

In [72]:
def get_models():
    """Return fresh, unfitted instances of every classifier to benchmark.

    The list contains two random forests with different settings; both report
    the same class name.
    """
    linear = [LogisticRegression(), LinearDiscriminantAnalysis()]
    neighbours_and_bayes = [KNeighborsClassifier(), GaussianNB()]
    trees = [
        DecisionTreeClassifier(),
        RandomForestClassifier(random_state=42),
        RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1),
    ]
    support_vector = [SVC(), LinearSVC(random_state=42, max_iter=10000)]
    return linear + neighbours_and_bayes + trees + support_vector

def get_preprocessing_pipelines():
    """Return the names of the preprocessing variants to compare, in run order."""
    variant_names = (
        "default",
        "default_no_dropped_features",
        "default_no_dropped_features_log_poly",
    )
    return list(variant_names)

# Cross-validate every model under every preprocessing variant.
# Each entry is a one-item dict: {variant name: list of per-model results}.
results_per_pipeline = []

for pipe in get_preprocessing_pipelines():
    result = compare_models(get_models(), preprocessing_variant=pipe)
    results_per_pipeline += [{pipe: result}]

# Report: one banner per variant, then model name and accuracy line per model.
for results in results_per_pipeline:
    key, model_results = next(iter(results.items()))

    print("\n===========")
    print(f"{key}\n")

    for result in model_results:
        # Name, accuracy, then a blank separator line.
        print(result["model"], result["accuracy"], "", sep="\n")

load_train_data: done
load_train_data: done
load_train_data: done

default

LogisticRegression
Accuracy: 0.79688 (+/- 0.03)

LinearDiscriminantAnalysis
Accuracy: 0.80473 (+/- 0.03)

KNeighborsClassifier
Accuracy: 0.79691 (+/- 0.04)

GaussianNB
Accuracy: 0.70149 (+/- 0.07)

DecisionTreeClassifier
Accuracy: 0.81823 (+/- 0.05)

RandomForestClassifier
Accuracy: 0.79462 (+/- 0.06)

RandomForestClassifier
Accuracy: 0.81038 (+/- 0.07)

SVC
Accuracy: 0.82716 (+/- 0.03)

LinearSVC
Accuracy: 0.80024 (+/- 0.04)


default_no_dropped_features

LogisticRegression
Accuracy: 0.82267 (+/- 0.03)

LinearDiscriminantAnalysis
Accuracy: 0.74302 (+/- 0.06)

KNeighborsClassifier
Accuracy: 0.79015 (+/- 0.05)

GaussianNB
Accuracy: 0.48704 (+/- 0.05)

DecisionTreeClassifier
Accuracy: 0.81933 (+/- 0.06)

RandomForestClassifier
Accuracy: 0.82719 (+/- 0.06)

RandomForestClassifier
Accuracy: 0.71833 (+/- 0.04)

SVC
Accuracy: 0.82155 (+/- 0.03)

LinearSVC
Accuracy: 0.83614 (+/- 0.01)


default_no_dropped_features_log

In [66]:
# Run the comparison for the log/poly preprocessing variant on its own; as the
# cell's last expression, the returned list of result dicts is displayed.
compare_models(get_models(), preprocessing_variant="default_no_dropped_features_log_poly")

load_train_data: done


[{'model': 'LogisticRegression', 'accuracy': 'Accuracy: 0.82267 (+/- 0.03)'},
 {'model': 'LinearDiscriminantAnalysis',
  'accuracy': 'Accuracy: 0.74302 (+/- 0.06)'},
 {'model': 'KNeighborsClassifier', 'accuracy': 'Accuracy: 0.79015 (+/- 0.05)'},
 {'model': 'GaussianNB', 'accuracy': 'Accuracy: 0.48704 (+/- 0.05)'},
 {'model': 'DecisionTreeClassifier',
  'accuracy': 'Accuracy: 0.81144 (+/- 0.04)'},
 {'model': 'RandomForestClassifier',
  'accuracy': 'Accuracy: 0.82719 (+/- 0.06)'},
 {'model': 'SVC', 'accuracy': 'Accuracy: 0.82155 (+/- 0.03)'},
 {'model': 'LinearSVC', 'accuracy': 'Accuracy: 0.83614 (+/- 0.01)'}]

In [81]:
# Cross-validate a depth-limited random forest with the default preprocessing,
# then fit it on the full training set and write the test-set predictions.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
pipe = get_pipeline(model, "default")

X_train, y_train = load_train_data()
X_test = load_x_test()

scores = cross_val_score(pipe, X_train, y_train)
print("Accuracy: %0.5f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

pipe.fit(X_train, y_train)
predictions = pipe.predict(X_test)
output = pd.DataFrame({'PassengerId': X_test.index, 'Survived': predictions})

# to_csv does not create missing directories, so make sure the target folder
# exists before writing (a no-op when it is already there).
submission_path = Path('submissions/06-random-forest.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
output.to_csv(submission_path, index=False)
print("Your submission was successfully saved!")

load_train_data: done
Accuracy: 0.81038 (+/- 0.07)
Your submission was successfully saved!
