In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

Goals:
- Use encoding methods studied previously
- Build a more complicated scikit-learn pipeline
- Discover feature engineering

# 1. Dataset quick outlook and preformating

## 1.1 Data

In [2]:
# load dataset titanic
path_data = '/home/lemasle/PROJECTS/data/Kaggle_Titanic-Machine_Learning_from_Disaster/'
# Some columns include nans, we import them as strings
input_train = pd.read_csv(path_data+'train.csv', na_values=['NaN'])
input_test = pd.read_csv(path_data+'test.csv', na_values=['NaN'])

In [3]:
input_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


PassengerId
Survived: Survival 	 (0 = No, 1 = Yes)\
Pclass:   Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)\
Name:     Name\
Sex:      Sex\
Age:      Age in years\
Sibsp:    Number of siblings/spouses aboard the Titanic\
Parch:    Number of parents / children aboard the Titanic\
Ticket:   Ticket number\
Fare:     Passenger fare\
Cabin:    Cabin number\
Embarked: Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

## 1.2 Preformatting

### 1.2.1 Removing columns

"PassengerId" is an internal label of the dataset - it can be discarded.\
"Ticket"  is simply a label, with no influence on the survival. The ticket label may refer to the location/price of the cabin, but this information is inluded in te "Fare" and "Cabin" columns: So we discard "Ticket as well."\
"Name" should not have any influence on the survival, we decide to discard it.

In [4]:
def remove_columns(df, list_of_columns):

    # list_of_columns = ["PassengerId", "Name", "Ticket"]
    df.drop(columns=list_of_columns, inplace=True)

    return df

### 1.2.2 Feature engineering

Note however that it contains some information, sometimes redundant (Mr, Mrs), sometimes not (Dr, Rev). We will use this information to create a new feature: 'Title'.\
The structure of strings in the first columns is as follows:

[Lastname]  [comma]  [Title]  [dot]  [List of firstnames]

So we can split each string accordingly and select the item between "comma" and "dot" as the title. 

In [5]:
def get_title_feature(df):

    # split info in df['Name'] and get title 
    title = [i.split(",")[1].split(".")[0].strip() for i in df["Name"]]
    # Convert the list in a series and include it in the original dataframe
    df["Title"] = pd.Series(title)

    return df

### 1.2.3 Missing values

In [6]:
input_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### 1.2.3.1 Missing data: some more feature engineering

"Cabin" contains a lot (almost 700) of missing values.\
We could remove the entire column, but we would then loose some (potentially useful) information.\
Instead, we can transform this columns to a boolean indicating whether cabin information is available (1) or not (0).\
This is not much information, but better than nothing.

In [7]:
def cabin_to_boolean(df):
    
    df['Cabin'] = np.where(df['Cabin'].isna(), 0.0, 1.0)
    
    return df

It is possible to transform a function so that it can be easily integrated into a pipeline. This is achieved via the FunctionTransformer command.

In [8]:
from sklearn.preprocessing import FunctionTransformer
CabinTransformer = FunctionTransformer(cabin_to_boolean)

In [9]:
test_cabin = CabinTransformer.transform(input_train)
test_cabin

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,0.0,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,1.0,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,0.0,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,1.0,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,0.0,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,0.0,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,1.0,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,0.0,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,1.0,C


#### 1.2.3.2 Missing data: categorical data

"Embarked" contains only two missing values.\
We could always consider removing the two corresponding rows, which would have little impact if they correspond to well-represented passenger profiles, but a higher impact if they happen to be rare occurences of a specific profile.\
Instead, we can impute them the most frequent embarkment spot, the hypotheses being that i) we still have a good chance that this arbitrary assignment is correct ii) the model will still perform better with two potentially wrong embarkment spots than with two full passenger profiles deleted.

In [10]:
def fillna_with_most_frequent(df):

    # Get the mode of a set of values, that is the value that appears most often
    embarked_mode = df['Embarked'].mode()
    # fill NaNs with the most common value
    df['Embarked'].fillna(embarked_mode.values[0], inplace=True)

    return df

Note: In the train sample, only "embarked" contains missing data. It could happen that in the test sample, also other categorical columns ('Sex', 'Pclass') contain missing values. We can use the same strategy (most frequent) to replace missing values for those columns.

Instead of using pandas functions, we will take advantage of scikit-learn SimpleImputer:
1. that can be applied to several columns
2. they can be integrated in a scikit-learn pipeline

In [11]:
# Get data for testing the method
test_freq_imputer = input_train[['Sex', 'Pclass', 'Embarked']]
test_freq_imputer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sex       891 non-null    object
 1   Pclass    891 non-null    int64 
 2   Embarked  889 non-null    object
dtypes: int64(1), object(2)
memory usage: 21.0+ KB


In [12]:
#import and define the imputer
from sklearn.impute import SimpleImputer
mostfrequent_imputer = SimpleImputer(strategy='most_frequent')

In [13]:
# fit the imputer on the input data
mostfrequent_imputer.fit(test_freq_imputer)
# impute all missing values in the data.
X_frq = mostfrequent_imputer.transform(test_freq_imputer)
# inspect the results, check that missing values have been replaced
df_frq = pd.DataFrame(X_frq, columns=test_freq_imputer.columns, index=test_freq_imputer.index)
df_frq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sex       891 non-null    object
 1   Pclass    891 non-null    object
 2   Embarked  891 non-null    object
dtypes: object(3)
memory usage: 21.0+ KB


#### 1.2.3.3 Missing data: numerical data

"Age" contains a good number (a bit more than 150) of missing values (NaNs).\
However 2/3rd of the rows contain potentially useful data (very young/very old people probably have a lower chance of survival), so it would be painful to remove the entire column.

One possibility is to fill missing values with the mean or median age (the median is less sensitive to large outliers than the mean).

In [14]:
def fillna_with_median(df):

    # get the median age
    median_age = df['Age'].median()
    # fill NaNs with the median value
    df['Age'].fillna(median_age, inplace=True)
    
    return df

Note: One should always be careful while introducing operations like mean() or median() in the early stages of a pipeline, as it could induce data leakage: rows in the train (respectively test) data would get clues about the ages in the test (respectively train) data. **In this specific example**, the train and test dataset are already split so we are not exposed to this risk.

Note: In the train sample, only "age" contains missing data. It could happen that in the test sample, also other numerical columns ('SibSp', 'Parch', 'Fare') contain missing values. We can use the same strategy (median value) to replace missing values for those columns.

Instead of using pandas functions, we will take advantage of scikit-learn SimpleImputer:
1. that can be applied to several columns
2. they can be integrated in a scikit-learn pipeline

In [15]:
# Get data for testing the method
test_median_imputer = input_train[['Age', 'SibSp', 'Parch', 'Fare']]
test_median_imputer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     714 non-null    float64
 1   SibSp   891 non-null    int64  
 2   Parch   891 non-null    int64  
 3   Fare    891 non-null    float64
dtypes: float64(2), int64(2)
memory usage: 28.0 KB


In [16]:
#import and define the imputer
from sklearn.impute import SimpleImputer
median_imputer = SimpleImputer(strategy='median')

In [17]:
# fit the imputer on the input data
median_imputer.fit(test_median_imputer)
# impute all missing values in the data.
X_med = median_imputer.transform(test_median_imputer)
# inspect the results, check that missing values have been replaced
df_med = pd.DataFrame(X_med, columns=test_median_imputer.columns, index=test_median_imputer.index)
df_med.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     891 non-null    float64
 1   SibSp   891 non-null    float64
 2   Parch   891 non-null    float64
 3   Fare    891 non-null    float64
dtypes: float64(4)
memory usage: 28.0 KB


## 1.3 Encoding

### 1.3.1 Ordinal encoding

We will use ordinal encoding to encode the 'Sex' column, from 'Male'/'Female' to (0,1)

In [18]:
#import and define the encoder
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
# fit and apply the encoder to the data
sex_oe = ordinal_encoder.fit_transform(df_frq[['Sex']]) # Note the double [[ ]] as input data needs to be 2D
sex_oe[:5]

array([[1.],
       [0.],
       [0.],
       [0.],
       [1.]])

We will use ordinal encoding to encode the 'Pclass' column, from (1,2,3) to (0,1,2).\
We want to preserve the order as 2nd class cabins lie in between 1st class cabins and 3rd class cabins.

In [19]:
#import and define the encoder
from sklearn.preprocessing import OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
# fit and apply the encoder to the data
pclass_oe = ordinal_encoder.fit_transform(df_frq[['Pclass']]) # Note the double [[ ]] as input data needs to be 2D
pclass_oe[:5]

array([[2.],
       [0.],
       [2.],
       [0.],
       [2.]])

### 1.3.2 One-Hot-Encoding

Although the inaugural trip of the Titanic started in Southampton, then went to Cherbourg, and finally to Queenstown, there is no specific order between the cities, expecially regarding the chances for survival.\
Thus One-Hot-Encoding is the best method for the 'Embarked' column.

In [20]:
#import and define the encoder
from sklearn.preprocessing import OneHotEncoder
embarked_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# fit and apply the encoder to the data
embarked_ohe = embarked_encoder.fit_transform(df_frq[['Embarked']]) # Note the double [[ ]] as input data needs to be 2D
embarked_ohe

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

IMPORTANT NOTE: (handle_unknown=’ignore’ is specified to prevent errors with unseen categories in the testing set\
(categories present in the testing set but not present in the training set).

### 1.3.3 More feature engineering

#### 1.3.3.1: Traveling alone or with families

In [21]:
input_train['SibSp'].value_counts()

SibSp
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: count, dtype: int64

In [22]:
input_train['Parch'].value_counts()

Parch
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: count, dtype: int64

Both "Sibsp" (number of siblings/spouses aboard) and "Parch" (number of parents/children aboard) contain 7 unique categories, but data are concentrated in 2+3 highly populated categories while some others contain only a handul objects.\
There is therefore a risk of overfitting. More importantly, encoding via one-hot-encoding would replace two columns with 14 new ones, and such a strong increase of dimensionality is probably not a good thing.\
For the sake of the exercise, we will also perform some feature engineering here, simply asking whether people travel alone or not.

In [23]:
def Is_Alone(df):

    df['Is_Alone'] = np.where( (df['SibSp'] + df['Parch'])>0, 0.0, 1.0)

    return df

In [24]:
Is_Alone(input_train)['Is_Alone'].value_counts()

Is_Alone
1.0    537
0.0    354
Name: count, dtype: int64

In [31]:
# It is not useful to import FunctionTransformer again
# I show it again for clarity 
from sklearn.preprocessing import FunctionTransformer
FamilyTransformer = FunctionTransformer(Is_Alone)

#### 1.3.3.2: Revisiting and encoding 'Title'

In [25]:
test_title = pd.DataFrame(get_title_feature(input_train), columns=['Title'])
test_title['Title'].value_counts()

Title
Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: count, dtype: int64

A number of those are duplicates due to convention (Ms=Mrs) or language (Mme=Mrs, Don=Sir). We will simplify the 'Title' column by mapping a dictionnary before encoding it.

In [26]:
input_train.loc[input_train.Title=='Master']['Age'].unique()

array([ 2.  ,  7.  , 11.  ,  4.  ,   nan,  0.83, 12.  ,  1.  ,  9.  ,
        3.  ,  0.92,  8.  ,  6.  ,  0.67,  0.42, 10.  ])

If we inspect ages of passengers with title 'Master', we discover that all of them are kids.\
In passing, the presence of NaNs indicates that we could do a more clever job imputing ages than simply taking the median.

In [27]:
Title_dict = {'Mr': 'Mr',
              'Miss' : 'Miss',
              'Mrs' : 'Mrs',
              'Master' : 'Master',
              'Dr' : 'Other',
              'Rev' : 'Other',
              'Mlle' : 'Miss',
              'Major' : 'Other',
              'Col' : 'Other',
              'the Countess' : 'Noble',
              'Capt' : 'Other',
              'Ms' : 'Mrs',
              'Sir' : 'Noble',
              'Lady' : 'Noble',
              'Mme' : 'Mrs',
              'Don' : 'Noble',
              'Jonkheer' : 'Noble',
              }

In [28]:
test_title['Title'] = test_title['Title'].map(Title_dict)
test_title['Title'].value_counts()

Title
Mr        517
Miss      184
Mrs       127
Master     40
Other      18
Noble       5
Name: count, dtype: int64

In [29]:
#import and define the encoder
from sklearn.preprocessing import OneHotEncoder
titles_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# fit and apply the encoder to the data
titles_ohe = titles_encoder.fit_transform(test_title[['Title']]) # Note the double [[ ]] as input data needs to be 2D
print(np.shape(titles_ohe))
titles_ohe

(891, 6)


array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       ...,
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

## 1.4 Summary

- Feature engineering
 1. get_title_feature
 2. Is_Alone
- Missing data
 1. fillna_with_most_frequent
 2. fillna_with_median
- Encoding
 1. sex_oe
 2. Pclass_oe
 3. cabin_to_boolean
 4. embarked_ohe
 5. titles_ohe

- remove columns (those not considered at all or modified)

# 2. Pipeline building: columns transformations

In order to group all the individual steps listed above, we will have to:
 1. Include each of them inside a Pipeline instance
 2. Group these pipeline instances using ColumnTransformer 

In [32]:
from sklearn.pipeline import Pipeline

# cabin_pipeline applies to 'Cabin'
# the column is replaced
cabin_pipeline = Pipeline(steps=[
    ('HasCabin',CabinTransformer())
    ])
cabin_cols = ['Cabin']
    
# family_pipeline applies to 'Sibsp', 'Parch'
# these columns are not used anymore    
family_pipeline= Pipeline(steps=[
    ('HasFamily',FamilyTransformer())
    ])
family_cols = ['Sibsp', 'Parch']

# num_pipeline applies to 'Age', 'Fare'
# Missing values are median-imputed
# We will also apply a standard scaler
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scale',MinMaxScaler())
    ])
num_cols = ['Age', 'Fare']    

SyntaxError: invalid syntax. Perhaps you forgot a comma? (3689844948.py, line 6)

In [None]:
#TBD columns to remove: PassengerId 'Sibsp', 'Parch'

In [None]:
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale',MinMaxScaler())

In [None]:
### 2.3 Using ColumnTransfomer to group the two branches of the pipeline

The syntax of a ColumnTransformer is as follows:

ColumnTransformer(transformers=[(‘step name’, transform function,cols), …])

In [None]:
from sklearn.compose import ColumnTransformer

col_trans = ColumnTransformer(transformers=[
    ('num_pipeline',num_pipeline,num_cols),
    ('cat_pipeline',cat_pipeline,cat_cols)
    ],
    remainder='drop',
    n_jobs=-1)

Note 1: remainder=’drop’ is specified to ignore the other dataframe columns.\
Note 2: n_job = -1 to use all processors in parallel.

In [30]:
stop

NameError: name 'stop' is not defined

In [None]:
get_title_feature(input_train) # TBD do feature engineering here tooo

In [None]:
def preformat(df_init):
    df = df_init.copy()
    df.loc[:,'relevent_experience'] = df['relevent_experience'].map(relevant_experience_map)
    df.loc[:,'last_new_job'] = df['last_new_job'].map(last_new_job_map)
    df.loc[:,'experience'] = df['experience'].map(experience_map)

    return dfdf.drop

In [None]:
from sklearn.preprocessing import FunctionTransformer
preformater = FunctionTransformer(preformat)
test = preformater.transform(data)