In [200]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [264]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (RobustScaler,
                                   StandardScaler,
                                   MinMaxScaler,
                                   OneHotEncoder,
                                   OrdinalEncoder,
                                   FunctionTransformer)
from sklearn.metrics import classification_report
## estimators
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (RandomForestClassifier,
                              HistGradientBoostingClassifier,
                              BaggingClassifier,
                              AdaBoostClassifier)
from xgboost import XGBClassifier

from titanic_challenge.kaggle import interface

# Titanic challenge - kaggle

## Load data

In [342]:
df = pd.read_csv("../data/train.csv")
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [343]:
df.set_index("PassengerId", inplace=True)

In [344]:
df.duplicated().sum()

0

In [345]:
round(df.isnull().sum().sort_values(ascending=False)/len(df)*100)

Cabin       77.0
Age         20.0
Embarked     0.0
Survived     0.0
Pclass       0.0
Name         0.0
Sex          0.0
SibSp        0.0
Parch        0.0
Ticket       0.0
Fare         0.0
dtype: float64

In [346]:
# dropping Cabin and Ticket. Not very useful

#df.drop(columns=["Cabin", "Ticket"], inplace=True)

In [347]:
df.Name.str.extract(r".*,\ (\w*)\.\s").value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Col           2
Major         2
Mlle          2
Capt          1
Don           1
Jonkheer      1
Lady          1
Mme           1
Ms            1
Sir           1
dtype: int64

## Feature creation

### Title

In [375]:
titles_simplified = {
    "Mr":"Mr",
    "Miss":"Miss",
    "Mrs":"Mrs",
    "Master":"Master",
    "Dr":"Mr",
    "Rev":"Mr",
    "Major":"Mr",
    "Mlle":"Miss",
    "Col":"Mr",
    "Don":"Mr",
    "Mme":"Mrs",
    "Ms":"Mrs",
    "Lady":"Mrs",
    "Sir":"Mr",
    "Capt":"Mr",
    "Jonkheer":"Mr",
    "the Countess": "Mrs"
}

def extract_title(df):
    df["Title"] = df.Name.str.extract(r".*,\ (\w*\s?\w*)\.\s")
    # df["Title"].replace(titles_simplified, inplace=True)
    # df["Age"] = df.groupby("Title")["Age"].transform(lambda group : group.fillna(group.median()))
    return df

#### Pipeline

In [376]:
title_transform = FunctionTransformer(extract_title)
title_transform

In [377]:
df = title_transform.fit_transform(df)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamillySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,11.0,2
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,12.0,2
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,8.0,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,12.0,2
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,11.0,1


In [378]:
df.Title.value_counts()

11.0    517
8.0     182
12.0    125
7.0      40
3.0       7
14.0      6
9.0       2
6.0       2
1.0       2
16.0      1
0.0       1
13.0      1
15.0      1
5.0       1
10.0      1
2.0       1
4.0       1
Name: Title, dtype: int64

### FamillySize

In [372]:
mapping = {
    11: 0,
    8: 1,
    6: 2,
    5: 3,
    1: 4,
    7: 5,
    2: 6,
    3: 7,
    4: 8
}

def create_familly_size(df):
    df["FamillySize"] = df["Parch"] + df["SibSp"] + 1
    df["FamillySize"].replace(mapping)
    return df

In [373]:
familly_transformer = FunctionTransformer(create_familly_size)
familly_transformer

In [425]:
familly_transformer.fit_transform(df[["Parch", "SibSp"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["FamillySize"] = df["Parch"] + df["SibSp"] + 1


Unnamed: 0_level_0,Parch,SibSp,FamillySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,1,2
2,0,1,2
3,0,0,1
4,0,1,2
5,0,0,1
...,...,...,...
887,0,0,1
888,0,0,1
889,2,1,4
890,0,0,1


## Handling missing data

### Age

In [211]:
df["Age"] = df.groupby("Title")["Age"].transform(lambda group : group.fillna(group.mean()))

#### Pipeline

In [360]:
age_transformer = FunctionTransformer(lambda df_ : df_.groupby("Title")["Age"].transform(lambda group: group.fillna(group.mean())))
age_transformer

In [356]:
df["Age"] = age_transformer.fit_transform(df)

In [369]:
# imputer = SimpleImputer(strategy="most_frequent")
# df.Embarked = imputer.fit_transform(df[["Embarked"]])

In [370]:
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamillySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,2
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,2
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,2
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,1


In [371]:
cat_ord_cols = [
    "Pclass", # 3, 2, 1
    "FamillySize", # 11, 8, 6, 5, 1, 7, 2, 3, 4
]

cat_ohe_cols = [
    "Sex",
    "Embarked",
    "Title"
]

num_cols = [
    "Age",
    "Fare"
]

## Ordinal Encoding

### Pclass

In [228]:
# encoder = OrdinalEncoder(categories=[[1, 2, 3]])
df["Pclass"].replace({3: 0, 2: 1, 1: 2}, inplace=True)

### Embarked

In [229]:
df["Embarked"].replace({"Q": 0, "C": 1, "S": 2}, inplace=True)

### Sex

In [230]:
df["Sex"].replace({"female": 1, "male": 0}, inplace=True)

### Pipeline

In [415]:
df

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamillySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S,11.0,2
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C,12.0,2
3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S,8.0,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S,12.0,2
5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S,11.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,S,14.0,1
888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S,8.0,1
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,21.773973,1,2,W./C. 6607,23.4500,,S,8.0,4
890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C,11.0,1


In [416]:
def custom_encoding(df):
    df["Embarked"] = SimpleImputer(strategy="most_frequent").fit_transform(df[["Embarked"]])

    df["Pclass"].replace({3: 0, 2: 1, 1: 2}, inplace=True)
    df["Embarked"].replace({"Q": 0, "C": 1, "S": 2}, inplace=True)
    df["Sex"].replace({"female": 1, "male": 0}, inplace=True)
    return df

custom_encoding_transformer =FunctionTransformer(custom_encoding).set_output(transform="pandas")
custom_encoding_transformer

In [417]:
custom_encoding_transformer.fit_transform(df)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamillySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,0,"Braund, Mr. Owen Harris",0,22.000000,1,0,A/5 21171,7.2500,,2,11.0,2
2,1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.000000,1,0,PC 17599,71.2833,C85,1,12.0,2
3,1,0,"Heikkinen, Miss. Laina",1,26.000000,0,0,STON/O2. 3101282,7.9250,,2,8.0,1
4,1,2,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.000000,1,0,113803,53.1000,C123,2,12.0,2
5,0,0,"Allen, Mr. William Henry",0,35.000000,0,0,373450,8.0500,,2,11.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
887,0,1,"Montvila, Rev. Juozas",0,27.000000,0,0,211536,13.0000,,2,14.0,1
888,1,2,"Graham, Miss. Margaret Edith",1,19.000000,0,0,112053,30.0000,B42,2,8.0,1
889,0,0,"Johnston, Miss. Catherine Helen ""Carrie""",1,21.773973,1,2,W./C. 6607,23.4500,,2,8.0,4
890,1,2,"Behr, Mr. Karl Howell",0,26.000000,0,0,111369,30.0000,C148,1,11.0,1


In [233]:
df.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,FamillySize_encoded
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,0.691358,0.352413,29.754659,32.204208,1.638608,10.241302,4.73064
std,0.486592,0.836071,0.47799,13.277179,49.693429,0.635673,1.830798,1.434469
min,0.0,0.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,21.773973,7.9104,1.0,8.0,4.0
50%,0.0,0.0,0.0,30.0,14.4542,2.0,11.0,4.0
75%,1.0,1.0,1.0,35.898148,31.0,2.0,11.0,6.0
max,1.0,2.0,1.0,80.0,512.3292,2.0,16.0,8.0


## Preproc

In [435]:
preproc = make_column_transformer(
    (title_transform, ["Name", "Title"]),
    (familly_transformer, ["Parch","SibSp"]),
    # (age_transformer, ["Title", "Age"]),
    (custom_encoding_transformer, ["Embarked", "Pclass", "Sex"])
)
preproc

In [434]:
X_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamillySize
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,5.0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,6.0,2
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,5.0,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,5.0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,6.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,5.0,1
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1.0,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,5.0,1
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,5.0,1


In [436]:
preproc.fit_transform(X_test)

array([['Kelly, Mr. James', 5.0, 0, ..., 0, 0, 0],
       ['Wilkes, Mrs. James (Ellen Needs)', 6.0, 0, ..., 2, 0, 1],
       ['Myles, Mr. Thomas Francis', 5.0, 0, ..., 0, 1, 0],
       ...,
       ['Saether, Mr. Simon Sivertsen', 5.0, 0, ..., 2, 0, 0],
       ['Ware, Mr. Frederick', 5.0, 0, ..., 2, 0, 0],
       ['Peter, Master. Michael J', 3.0, 1, ..., 1, 0, 0]], dtype=object)

In [422]:
X_test = pd.read_csv("../data/test.csv")

In [424]:
title_transform.fit_transform(X_test)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,FamillySize
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,5.0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S,6.0,2
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,5.0,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,5.0,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,6.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S,5.0,1
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C,1.0,1
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S,5.0,1
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S,5.0,1


## Train test split

In [254]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0, stratify=y, random_state=42)
# X_train.shape, y_train.shape

InvalidParameterError: The 'test_size' parameter of train_test_split must be a float in the range (0.0, 1.0), an int in the range [1, inf) or None. Got 0 instead.

In [330]:
X = df.drop(columns="Survived")
y = df["Survived"]

## Scaling

In [331]:
scaler = StandardScaler().set_output(transform="pandas")
X = scaler.fit_transform(X)

In [421]:
X

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,Title,FamillySize_encoded
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,-0.827377,-0.737695,-0.584387,-0.502445,0.568837,0.414641,0.885396
2,1.566107,1.355574,0.621365,0.786845,-1.005181,0.961158,0.885396
3,-0.827377,1.355574,-0.282949,-0.488854,0.568837,-1.224909,-0.509631
4,1.566107,1.355574,0.395286,0.420730,0.568837,0.961158,0.885396
5,-0.827377,-0.737695,0.395286,-0.486337,0.568837,0.414641,-0.509631
...,...,...,...,...,...,...,...
887,0.369365,-0.737695,-0.207590,-0.386671,0.568837,2.054191,-0.509631
888,1.566107,1.355574,-0.810466,-0.044381,0.568837,-1.224909,-0.509631
889,-0.827377,1.355574,-0.601421,-0.176263,0.568837,-1.224909,2.280424
890,1.566107,-0.737695,-0.282949,-0.044381,-1.005181,0.414641,-0.509631


## Estimator

In [333]:
model = LogisticRegression()

In [334]:
cross_val_score(model, X, y).mean()

0.8092147385600402

## Pipeline