In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from math import ceil
from tqdm.notebook import tqdm
from itertools import combinations

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier

In [81]:
path = r"dataset\train.csv"
df= pd.read_csv(path)

# Feature Engineering

In [4]:
def ticketNo(ticketStr):
    if isinstance(ticketStr, str):
        noStr = re.findall(r"[0-9]*$", ticketStr)[0]
        if noStr.isnumeric():
            return int(noStr)
    return np.nan

def ticketTxt(ticketStr):
    if isinstance(ticketStr, str):
        noStr = re.findall(r"[0-9]*$", ticketStr)[0]
        return ticketStr[:-len(noStr)]
    return np.nan

df["ticketNo"] = df["Ticket"].apply(ticketNo)
df["ticketTxt"] = df["Ticket"].apply(ticketTxt)

In [5]:
def cabinNo(cabinStr):
    if isinstance(cabinStr, str):
        return re.findall(r"[0-9]*$", cabinStr)[0]
    return np.nan

def cabinLvl(cabinStr):
    if isinstance(cabinStr, str):
        return re.findall(r"^[A-Z]", cabinStr)[0]
    return np.nan

df["cabinLvl"] = df["Cabin"].apply(cabinLvl)
df["cabinNo"] = df["Cabin"].apply(cabinNo)

In [6]:
def title(name):
    if isinstance(name, str):
        return re.findall(r"\b\w*\.", name)[0]
    return np.nan


df["surname"] = df["Name"].apply(lambda x: x.split(",")[0])
df["withChild"] = df["Name"].apply(lambda x: "(" in x)
df["title"] = df["Name"].apply(title)


In [34]:
# Encode Categorical Features
for col in df.columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map({key: val for val, key in enumerate(df[col].unique())})

# Modelling

In [53]:
def modeler(df, target, drop=None, n=1, testSize=0.3, preProcess=None, algo=DummyClassifier(), metric=accuracy_score):
    dfi = df.copy()
    if drop:
        dfi = dfi.drop(columns=drop)
    
    X = dfi.drop(columns=[target])
    y= dfi[target]
    results = []
    
    for train, test in KFold(n_splits=n, shuffle=True, random_state=42).split(X, y):
        X_train, X_test = X.iloc[train], X.iloc[test]
        y_train, y_test = y.iloc[train], y.iloc[test]
        if preProcess:
            X_train = preProcess.fit_transform(X_train)
            X_test = preProcess.transform(X_test)
        algo.fit(X_train, y_train)
        results.append(metric(y_test, algo.predict(X_test)))
    return results

## Batch 1

In [59]:
options = ["ticketNo", "ticketTxt", "cabinLvl", "cabinNo", "surname", "withChild",]
dropsets = []
for i in range(len(options)+1):
    for combo in combinations(options, i):
        combo = list(combo)
        combo.extend(["PassengerId", "Name", "Ticket", "Cabin"])
        dropsets.append(combo)

preProcessors = [
    make_pipeline(StandardScaler(), SimpleImputer()),
    make_pipeline(StandardScaler(), SimpleImputer(strategy="median")),
    make_pipeline(StandardScaler(), SimpleImputer(strategy="most_frequent")),
    make_pipeline(StandardScaler(), SimpleImputer(strategy="constant", fill_value=-1)),
    make_pipeline(RobustScaler(), SimpleImputer()),
    make_pipeline(RobustScaler(), SimpleImputer(strategy="median")),
    make_pipeline(RobustScaler(), SimpleImputer(strategy="most_frequent")),
    make_pipeline(RobustScaler(), SimpleImputer(strategy="constant", fill_value=-1)),
]

algos = [
    GradientBoostingClassifier()
]

configs = []
for dropset in dropsets:
    for preProc in preProcessors:
        for algo in algos:
            configs.append({
                "drop": dropset,
                "preProcess": preProc,
                "algo": algo
            })

In [60]:
ressum = None
resraw = []

for i, config in enumerate(tqdm(configs)):
    results = modeler(df, target="Survived", n=7, **config)
    resraw.append(results)
    summary = pd.DataFrame(results).describe()
    if ressum is None:
        ressum = summary
    else:
        ressum = pd.concat([ressum, summary], axis=1, join="inner")
        ressum.columns = [j for j in range(i + 1)]

  0%|          | 0/512 [00:00<?, ?it/s]

In [68]:
# ressum.transpose()
ressum.transpose().iloc[168]
[config['drop'] for config in configs[168:176]]

[['surname', 'withChild', 'PassengerId', 'Name', 'Ticket', 'Cabin'],
 ['surname', 'withChild', 'PassengerId', 'Name', 'Ticket', 'Cabin'],
 ['surname', 'withChild', 'PassengerId', 'Name', 'Ticket', 'Cabin'],
 ['surname', 'withChild', 'PassengerId', 'Name', 'Ticket', 'Cabin'],
 ['surname', 'withChild', 'PassengerId', 'Name', 'Ticket', 'Cabin'],
 ['surname', 'withChild', 'PassengerId', 'Name', 'Ticket', 'Cabin'],
 ['surname', 'withChild', 'PassengerId', 'Name', 'Ticket', 'Cabin'],
 ['surname', 'withChild', 'PassengerId', 'Name', 'Ticket', 'Cabin']]

## Batch 2

In [70]:
dropsets = [
    ['surname', 'withChild', 'PassengerId', 'Name', 'Ticket', 'Cabin']
]

preProcessors = [
    make_pipeline(StandardScaler(), SimpleImputer()),
    make_pipeline(StandardScaler(), SimpleImputer(strategy="median")),
    make_pipeline(StandardScaler(), SimpleImputer(strategy="most_frequent")),
    make_pipeline(StandardScaler(), SimpleImputer(strategy="constant", fill_value=-1)),
    make_pipeline(RobustScaler(), SimpleImputer()),
    make_pipeline(RobustScaler(), SimpleImputer(strategy="median")),
    make_pipeline(RobustScaler(), SimpleImputer(strategy="most_frequent")),
    make_pipeline(RobustScaler(), SimpleImputer(strategy="constant", fill_value=-1)),
]

algos = [
    GradientBoostingClassifier(),
    RandomForestClassifier(),
    BernoulliNB(),
    KNeighborsClassifier(),
]

configs = []
for dropset in dropsets:
    for preProc in preProcessors:
        for algo in algos:
            configs.append({
                "drop": dropset,
                "preProcess": preProc,
                "algo": algo
            })

In [71]:
ressum = None
resraw = []

for i, config in enumerate(tqdm(configs)):
    results = modeler(df, target="Survived", n=7, **config)
    resraw.append(results)
    summary = pd.DataFrame(results).describe()
    if ressum is None:
        ressum = summary
    else:
        ressum = pd.concat([ressum, summary], axis=1, join="inner")
        ressum.columns = [j for j in range(i + 1)]

  0%|          | 0/32 [00:00<?, ?it/s]

In [76]:
for i in [16, 0]:
    print(configs[i])

{'drop': ['surname', 'withChild', 'PassengerId', 'Name', 'Ticket', 'Cabin'], 'preProcess': Pipeline(steps=[('robustscaler', RobustScaler()),
                ('simpleimputer', SimpleImputer())]), 'algo': GradientBoostingClassifier()}
{'drop': ['surname', 'withChild', 'PassengerId', 'Name', 'Ticket', 'Cabin'], 'preProcess': Pipeline(steps=[('standardscaler', StandardScaler()),
                ('simpleimputer', SimpleImputer())]), 'algo': GradientBoostingClassifier()}


## Batch 3

In [77]:
dropsets = [
    ['surname', 'withChild', 'PassengerId', 'Name', 'Ticket', 'Cabin']
]

preProcessors = [
    make_pipeline(StandardScaler(), SimpleImputer()),
    make_pipeline(RobustScaler(), SimpleImputer()),
]

algos = [
    GradientBoostingClassifier(),
]

configs = []
for dropset in dropsets:
    for preProc in preProcessors:
        for algo in algos:
            configs.append({
                "drop": dropset,
                "preProcess": preProc,
                "algo": algo
            })

In [79]:
ressum = None
resraw = []

for i, config in enumerate(tqdm(configs)):
    results = []
    for j in range(30):
        results.extend(modeler(df, target="Survived", n=7, **config))
    resraw.append(results)
    summary = pd.DataFrame(results).describe()
    if ressum is None:
        ressum = summary
    else:
        ressum = pd.concat([ressum, summary], axis=1, join="inner")
        ressum.columns = [j for j in range(i + 1)]

  0%|          | 0/2 [00:00<?, ?it/s]

In [80]:
ressum.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,210.0,0.845798,0.032096,0.796875,0.826772,0.834646,0.874016,0.905512
1,210.0,0.844635,0.030002,0.796875,0.826772,0.834646,0.874016,0.897638
