# Load Data

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'

def load_data(csv_name):
    csv_path = os.path.join('datasets', csv_name)
    return pd.read_csv(csv_path)

In [3]:
train_set = load_data(TRAIN_CSV)
test_set = load_data(TEST_CSV)

In [4]:
X_train = train_set.drop(columns='Survived')
y_train = train_set['Survived']

In [5]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Name         891 non-null    object 
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        204 non-null    object 
 10  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 76.7+ KB


In [7]:
def make_deck_keep_cabin(df: pd.DataFrame) -> pd.DataFrame:
    s = df.iloc[:, 0].astype('string').str[0]      #astype string converts to pandas string datatype. this takes care of the Nan floats.
    return pd.DataFrame({'Cabin':df.iloc[:, 0], 'Deck': s}, index=df.index)   # returns a df comprising of both the original cabin and deck columns.

In [50]:
def sibsp_parch_binary(df: pd.DataFrame) -> pd.DataFrame:
    out_df = (df > 0).astype(int)
    out_df.columns = df.columns
    return out_df

#### title pipeline

In [14]:
def make_title(df: pd.DataFrame) -> pd.DataFrame:
    s = df.iloc[:, 0].str.split(' ').apply(lambda x: x[1])
    return pd.DataFrame({'Title': s}, index=df.index)

In [71]:
def group_title(df: pd.DataFrame) -> pd.DataFrame:
    s = df.iloc[:, 0].replace({
        'Capt.':'Military', 
        'Major.':'Military',
        'Col.':'Military',
        'Jonkheer.':'Noble',
        'the Countess':'Noble',
        'Don.':'Noble',
        'Lady':'Noble',
        'Sir':'Noble',
        'Mlle.':'Noble',
        'Ms.':'Noble',
        'Mme.':'Noble'
    })

    return pd.DataFrame({'Title': s}, index=df.index)

In [72]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

title_pipeline = Pipeline([
    ('make_title', FunctionTransformer(func=make_title, validate=False, feature_names_out=lambda self, input_features:['Title'])),
    ('group_title', FunctionTransformer(func=group_title, validate=False, feature_names_out='one-to-one'))
])

#### Fare Pipeline

In [83]:
def fare_per_ticket(df:pd.DataFrame) -> pd.DataFrame:
    fare = df.iloc[:, 0]
    ticket = df.iloc[:, 1]
    ticket_counts = ticket.map(ticket.value_counts())
    fare_per_ticket = fare / ticket_counts
    return pd.DataFrame({'fare_per_ticket': fare_per_ticket, 'Ticket': ticket}, index=df.index)

In [130]:
fare_pipeline = Pipeline([
    ('fare_per_ticket', FunctionTransformer(func=fare_per_ticket, validate=False, feature_names_out=lambda self, input_features:['fare_per_ticket','Ticket'])),
    ('cut_fare', FunctionTransformer(func=fare_cut, validate=False,
                                    kw_args={'bins': [0, 6, 13, 20, 40, 80, 222],
                                             'labels': [0, 1, 2, 3, 4, 5]},
                                     feature_names_out='one-to-one'))
])

def shared_cabin_ticket(df:pd.DataFrame) -> pd.DataFrame:
    ss = df.apply(df.value_counts())
    return pd.DataFrame({'shared_cabin': ss.iloc[:, 0], 'shared_ticket': ss.iloc[:, 1]}, index=df.index)

In [96]:
def shared_cabin_ticket(df: pd.DataFrame) -> pd.DataFrame:
    # Expect df with two columns: [Cabin, Ticket]
    cabin  = df.iloc[:, 0].astype("string")
    ticket = df.iloc[:, 1].astype("string")

    shared_cabin = cabin.map(cabin.value_counts().gt(1)).fillna(False).astype(int)
    shared_ticket = ticket.map(ticket.value_counts().gt(1)).fillna(False).astype(int)

    return pd.DataFrame(
        {"shared_cabin": shared_cabin, "shared_ticket": shared_ticket},
        index=df.index,
    )

In [68]:
def age_cut(df:pd.DataFrame, bins, labels) -> pd.DataFrame:
    s = df.iloc[:, 0]
    cats = pd.cut(s, bins = bins, labels = labels, include_lowest=True, right=True)
    cats.name = s.name
    return pd.DataFrame(cats, index=df.index)

In [92]:
def fare_cut(df:pd.DataFrame, bins, labels) -> pd.DataFrame:
    s = df.iloc[:, 0]
    cats = pd.cut(s, bins = bins, labels = labels, include_lowest=True, right=True)
    cats.name = s.name
    return pd.DataFrame({'fare_per_ticket': cats, 'Ticket': df.iloc[:, 1]}, index=df.index)

In [136]:
from sklearn.compose import ColumnTransformer

features_ct = ColumnTransformer(
    transformers = [
        #Not one-to-one
        ('make_deck_keep_cabin', FunctionTransformer(func=make_deck_keep_cabin, validate=False, feature_names_out = lambda self, input_features:['Cabin', 'Deck']), ['Cabin']),
        ('SibSp_Parch_binary', FunctionTransformer(func=sibsp_parch_binary, validate=False, feature_names_out = 'one-to-one'), ['SibSp', 'Parch']),
        ('title_pipeline', title_pipeline, ['Name']),
        ('fare_pipeline', fare_pipeline, ['Fare', 'Ticket']),
        ('shared_cabin_ticket', FunctionTransformer(func=shared_cabin_ticket, validate=False, feature_names_out=lambda self, input_features:['shared_cabin', 'shared_ticket']), ['Cabin', 'Ticket']),
        ('age_cut', FunctionTransformer(func=age_cut, validate=False, feature_names_out='one-to-one', kw_args={'bins': [0, 12, 17, 24, 44, 64, float('inf')], 'labels': [0, 1, 2, 3, 4, 5]}), ['Age'])
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
).set_output(transform='pandas')

In [137]:
dropper = FunctionTransformer(
    lambda df: df.drop(columns=["Cabin", "Ticket", "PassengerId"]),
    validate=False,
    feature_names_out=lambda self, in_feats: [c for c in in_feats if c not in ["Cabin","Ticket","PassengerId"]],
)

features_pipeline = Pipeline([
    ("features", features_ct),
    ("dropper", dropper),
])

In [138]:
X_train1 = features_pipeline.fit_transform(X_train)

In [139]:
X_train1

Unnamed: 0,Deck,SibSp,Parch,Title,fare_per_ticket,shared_cabin,shared_ticket,Age,Pclass,Sex,Embarked
0,,1,0,Mr.,1,0,0,2,3,male,S
1,C,1,0,Mrs.,4,0,0,3,1,female,C
2,,0,0,Miss.,1,0,0,3,3,female,S
3,C,1,0,Mrs.,3,1,1,3,1,female,S
4,,0,0,Mr.,1,0,0,3,3,male,S
...,...,...,...,...,...,...,...,...,...,...,...
886,,0,0,Rev.,1,0,0,3,2,male,S
887,B,0,0,Miss.,3,0,0,2,1,female,S
888,,1,1,Miss.,1,0,1,,3,female,S
889,C,0,0,Mr.,3,0,0,3,1,male,C


In [140]:
X_train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Deck             204 non-null    string  
 1   SibSp            891 non-null    int64   
 2   Parch            891 non-null    int64   
 3   Title            891 non-null    object  
 4   fare_per_ticket  891 non-null    category
 5   shared_cabin     891 non-null    int64   
 6   shared_ticket    891 non-null    int64   
 7   Age              714 non-null    category
 8   Pclass           891 non-null    int64   
 9   Sex              891 non-null    object  
 10  Embarked         889 non-null    object  
dtypes: category(2), int64(5), object(3), string(1)
memory usage: 64.9+ KB


In [141]:
one_hot_cols = ['Deck', 'Title', 'Sex', 'Embarked']
impute_cols = ['Deck', 'Age', 'Embarked']

In [144]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

impute_encode = ColumnTransformer([
    ('impute_cat', SimpleImputer(strategy='most_frequent'), impute_cols),
    ('one_hot', OneHotEncoder(), one_hot_cols),
])

In [159]:
def fix_missing(df):
    # Replace pd.NA with np.nan everywhere
    df = df.replace({pd.NA: np.nan})
    # Force pandas extension types -> numpy dtypes
    for col in df.columns:
        if pd.api.types.is_string_dtype(df[col]) or pd.api.types.is_object_dtype(df[col]):
            df[col] = df[col].astype("object")  # object dtype with np.nan
        elif pd.api.types.is_integer_dtype(df[col]):
            df[col] = df[col].astype("float")   # float dtype with np.nan
    return df

nan_fixer = FunctionTransformer(fix_missing, validate=False)

In [163]:
preprocessing = Pipeline([
    ('nan_fixer', nan_fixer),
    ('add_remove_features_pipeline', features_pipeline),
    ('impute_encode_ct', impute_encode)
])

In [164]:
X_train2 = preprocessing.fit_transform(X_train)

TypeError: boolean value of NA is ambiguous