In [44]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.base import BaseEstimator, TransformerMixin
import subprocess
%matplotlib inline

sns.set(rc={'figure.figsize':(11.7,8.27)})

def load_train_data(split=True):
    target = "Survived"
    data = pd.read_csv("./train.csv", index_col="PassengerId")
    print("load_train_data: done")
    
    if split:
        return split_features_target(data, target)
    else:
        return data
    
def split_features_target(df, target="Survived"):
    features = [column for column in df.columns if not column == target]
    return df[features], df[target]

class DFSimpleImputer(SimpleImputer):
    def transform(self, X, y=None):
        return pd.DataFrame(super().transform(X), columns=X.columns)

class DFOneHotEncoder(OneHotEncoder):
    def transform(self, X, y=None):
        column_names = X.columns
        X_transformed = super().transform(X)
        return pd.DataFrame.sparse.from_spmatrix(X_transformed, columns=self.get_feature_names(column_names))
    
# Save Kaggle submission file
def submission_df(y_pred):
    X_test = load_x_test()
    return pd.DataFrame(y_pred, index=X_test.index, columns=["Survived"])

def save_submission_file(y_pred, filename):
    df = submission_df(y_pred)
    path = "./" + filename

    try:
        df.to_csv(path)
    except Exception:
        print("Couldn’t save submission.")
    else:
        print("Submission saved.")
        
# Submit score to Kaggle
def submit_predictions(y_pred, filename, message):
    save_submission_file(y_pred, filename)

    completed_process = subprocess.run(
        [
            "kaggle",
            "competitions",
            "submit",
            "-c",
            "titanic",
            "-f",
            filename,
            "-m",
            message
        ], 
        capture_output=True,
        text=True
    )
    
    print(completed_process.stdout)

In [34]:
# Sex pipeline

sex_pipeline = Pipeline([
    ('imputer', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
    ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore"))
])


# Embarked pipeline

embarked_pipeline = Pipeline([
    ('imputer', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
    ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore"))
])


# Age pipeline
    
def generate_age_derived_features(X):
    has_age = X["Age"].apply(lambda x: 0 if pd.isnull(x) else 1).rename("HasAge")
    is_young_child = X["Age"].apply(lambda age: 1 if age < 10 else 0).rename("IsYoungChild")
    is_infant = X["Age"].apply(lambda age: 1 if age < 1 else 0).rename("IsInfant")
    return pd.concat([X, has_age, is_young_child, is_infant], axis="columns")

def discretize_age(X):
    bins = [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, np.inf]
    labels = ["0+", "10+", "20+", "30+", "40+", "50+", "60+", "70+"]

    age_group = pd.cut(X["Age"], bins=bins, labels=labels, include_lowest=True)
    age_group = age_group.rename("AgeGroup")
    
    result = pd.concat([X, age_group], axis="columns")    
    return result

def drop_age(X):
    X = X.drop("Age", axis="columns")
    return X

age_pipeline = Pipeline([
    ("age_transformer", FunctionTransformer(generate_age_derived_features)),
    ('impute', DFSimpleImputer(strategy="mean")),
    ("discretize", FunctionTransformer(discretize_age)),
    ("drop", FunctionTransformer(drop_age)),
    ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore"))
])


# Dropping attributes

def attributes_dropper(df):
    result_df = df.drop("Name", axis="columns")
    result_df = result_df.drop("Ticket", axis="columns")
    return result_df

attributes_dropper_transformer = FunctionTransformer(attributes_dropper)


# Fare pipeline

fare_pipeline = Pipeline([
    ('impute_missing', DFSimpleImputer(strategy="mean")),
    ("log_transform", FunctionTransformer(np.log1p)),
    ('standard_scaler', StandardScaler())
])


# Cabin pipeline

def generate_cabin_derived_features(df):
    has_cabin = df["Cabin"].apply(lambda x: 0 if pd.isnull(x) else 1).rename("HasCabin")
    cabin_letter = df["Cabin"].apply(lambda string: string[0] if pd.notnull(string) else "U")
    cabin_letter = cabin_letter.rename("CabinLetter")

    return pd.concat([df, has_cabin, cabin_letter], axis="columns")

def drop_cabin(df):
    result = df.drop("Cabin", axis="columns")
    return result

cabin_pipeline = Pipeline([
    ("generate_derived_features", FunctionTransformer(generate_cabin_derived_features)),
    ('impute', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
    ("drop", FunctionTransformer(drop_cabin)),
    ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore"))
])

In [37]:
name_ticket_pipe = Pipeline([
    ('imputer', DFSimpleImputer(strategy="constant", fill_value="Unknown")),
    ('one_hot_encoder', DFOneHotEncoder(handle_unknown="ignore"))
])

In [41]:
preprocessing_pipeline = ColumnTransformer([
        ("sex", sex_pipeline, ["Sex"]),
        ("embarked", embarked_pipeline, ["Embarked"]),
        ("age", age_pipeline, ["Age"]),
        ("name_ticket_pipe", name_ticket_pipe, ["Name", "Ticket"]),
        ("fare", fare_pipeline, ["Fare"]),
        ("cabin", cabin_pipeline, ["Cabin"]),
    ],
    remainder="passthrough"
)

model = svm.LinearSVC(random_state=42, max_iter=10000)

pipe = Pipeline([
    ("preprocessing", preprocessing_pipeline),
    ("model", model)
])

In [24]:
X_train, y_train = load_train_data()
scores = cross_val_score(pipe, X_train, y_train)
scores.mean()

load_train_data: done


0.8047266336074321

In [21]:
X_train, y_train = load_train_data()
scores = cross_val_score(pipe, X_train, y_train)
scores.mean()

load_train_data: done


0.8047266336074321

In [46]:
X_train, y_train = load_train_data()
scores = cross_val_score(pipe, X_train, y_train)
scores.mean()

load_train_data: done


0.8339024543343168

In [45]:
def load_x_test():
    return pd.read_csv("./test.csv", index_col="PassengerId")

X_test = load_x_test()

pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# submit_predictions(
#     y_pred, 
#     "04-preprocessing-pipeline.csv",
#     "Implemented own custom preprocessing pipeline. LinearSVC."
# )

Submission saved.
Successfully submitted to Titanic: Machine Learning from Disaster
