In [None]:
import pandas as pd

# preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler, PolynomialFeatures

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

# model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# evaluation
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# visualization
import matplotlib.pyplot as plt

In [None]:
# load titanic data
t_data = pd.read_csv('../../Data/titanic.csv')

| Variable  | Definition                                       | Key                                        |
|-----------|-------------------------------------------------|--------------------------------------------|
| survival  | Survival                                         | 0 = No, 1 = Yes                            |
| pclass    | Ticket class                                     | 1 = 1st, 2 = 2nd, 3 = 3rd                  |
| sex       | Sex                                              |                                            |
| Age       | Age in years                                     |                                            |
| sibsp     | # of siblings / spouses aboard the Titanic       |                                            |
| parch     | # of parents / children aboard the Titanic       |                                            |
| ticket    | Ticket number                                    |                                            |
| fare      | Passenger fare                                   |                                            |
| cabin     | Cabin number                                     |                                            |
| embarked  | Port of Embarkation                              | C = Cherbourg, Q = Queenstown, S = Southampton |


### Function Transformer
Makes functions compatible with scikit-learn pipelines.

We are going to add two new features:
- Title (Mr, Dr, etc)
- Family size (number of siblings, spouses, parents and children)

In [None]:
# add title feature
# t_data['Title'] = t_data['Name'].str.split(',').str[1].str.split('.').str[0].str.strip()
# titles = t_data.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())
# t_data['Title'] = titles

In [None]:
def get_title(data):
    df = data.copy() # avoid changing the original data
    df['Title'] = df.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())
    most_common_titles = df['Title'].value_counts().head(6).index.tolist() # get the 6 most common titles
    df.loc[~df['Title'].isin(most_common_titles), 'Title'] = 'Other' # replace less common titles with 'Other', faster than apply + lambda
    return df

In [None]:
# check if the function works
get_title(t_data)

In [None]:
get_title_transformer = FunctionTransformer(get_title)

In [None]:
get_title_transformer.fit_transform(t_data)

In [None]:
# get family size function
def get_family_size(data):
    df = data.copy()
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    return df

In [None]:
# check if the function works
get_family_size(t_data)

In [None]:
get_family_size_transformer = FunctionTransformer(get_family_size)

In [None]:
get_family_size_transformer.fit_transform(t_data)

In [None]:
# group by family name
t_data['Name'].str.split(',').str[0]
t_data['FamilyName'] = t_data['Name'].str.split(',').str[0]
t_data['FamilyName'].value_counts()

### Pipeline and ColumnTransformer
Pipeline applies sequentially a list of transformations.

ColumnTransformer applies in parallel a list of transformations to selected columns.

In [27]:
# numerical features pipeline
numerical_features = ['Age', 'Fare', 'FamilySize']#, 'Pclass']
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])
numerical_transformer

In [28]:
# categorical features pipeline
categorical_features = ['Sex', 'Embarked', 'Title']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
categorical_transformer

In [29]:
# column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)
preprocessor

In [30]:
# apply preprocessing to the data
preprocessor.fit_transform(t_data)

ValueError: A given column is not a column of the dataframe