In [1]:
import pandas as pd

# Load the Titanic passenger records from the CSV file into a DataFrame
csv_path = '../../Data/titanic.csv'
titanic = pd.read_csv(csv_path)

# Leave the frame as the last expression so the notebook renders it
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## Encoding categorical features (non-numeric features)

In [2]:
# ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

# create ordinal encoder instance
encoder = OrdinalEncoder()

# learn the categories of 'Sex' and encode the column in one step
# (fit_transform is equivalent to calling fit and then transform on the same data);
# the learned categories are ['female', 'male'], so female = 0 and male = 1
encoder.fit_transform(titanic[['Sex']])

array([[1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [1.],
       [0.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],

In [3]:
# create ordinal encoder instance
encoder = OrdinalEncoder()

# learn the categories of both columns and encode them in one step;
# each column is encoded independently, in the order of its learned categories:
#   Sex:      female = 0, male = 1
#   Embarked: C = 0, Q = 1, S = 2 (missing values stay NaN by default)
encoder.fit_transform(titanic[['Sex', 'Embarked']])

array([[1., 2.],
       [0., 0.],
       [0., 2.],
       ...,
       [0., 2.],
       [1., 0.],
       [1., 1.]])

In [4]:
# categories learned by the ordinal encoder, one array per fitted column
encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['C', 'Q', 'S', nan], dtype=object)]

### Feature types (non numerical)
#### Ordinal: 
- Categorical features that can be sorted or ordered (e.g. educational level: high school, college, graduate school)
- Can be encoded as integers (e.g. 0, 1, 2)
- The sklearn OrdinalEncoder can be used to encode ordinal features
    - it assumes categories are ordered, and the distance between each category is the same

#### Nominal:
- Categorical feature with no specific order
- Should not be encoded as a single integer column, because the integers would imply an order that does not exist
- The sklearn OneHotEncoder can be used to encode nominal features
    - it creates a new binary feature for each category
    - e.g. if there are 3 categories, it creates 3 binary features

In [5]:
# one hot encoding
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False makes transform return a dense numpy array instead of a scipy sparse matrix
encoder = OneHotEncoder(sparse_output=False) # the output columns follow the order of encoder.categories_

In [6]:
# learn the distinct categories of each column (nothing is transformed yet)
encoder.fit(titanic[['Sex', 'Embarked']])

In [7]:
# categories found during fit, one array per column; NaN in 'Embarked' is kept as its own category
encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['C', 'Q', 'S', nan], dtype=object)]

In [8]:
# one binary column per category: 2 for 'Sex' followed by 4 for 'Embarked' (C, Q, S, NaN)
encoder.transform(titanic[['Sex', 'Embarked']])

array([[0., 1., 0., 0., 1., 0.],
       [1., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 1., 0., 0., 0.],
       [0., 1., 0., 1., 0., 0.]])

## Normalization and Standardization
- Normalization: rescaling numerical features to a range of [0, 1]
- Standardization: rescaling numerical features to have a mean of 0 and a standard deviation of 1

These actions improve the performance of some machine learning models

#### Normalization with sklearn

MinMaxScaler
- The MinMaxScaler rescales features to a range of [0, 1]
- It is sensitive to outliers
- works by subtracting the minimum value and then dividing by the range
- when to use: when the distribution of the feature is not Gaussian or when you want to preserve outliers

MaxAbsScaler
- The MaxAbsScaler rescales features to a range of [-1, 1]
- It is sensitive to outliers
- works by dividing each value by the maximum absolute value in the feature
- when to use: when the distribution of the feature is not Gaussian or when you want to preserve outliers

### Standardization with sklearn

StandardScaler
- The StandardScaler rescales features to have a mean of 0 and a standard deviation of 1
- It is sensitive to outliers, because they shift the mean and inflate the standard deviation
- works by subtracting the mean and then dividing by the standard deviation
- when to use: when the distribution of the feature is Gaussian

In [9]:
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler

In [10]:
# Build the min-max scaler and fit it on the three numeric columns in one chained call
# (fit returns the fitted scaler itself, so the same estimator is bound to `scaler`)
scaler = MinMaxScaler().fit(titanic[['Pclass', 'Age', 'Fare']])

# Show the fitted estimator as the cell output
scaler

In [11]:
# preview the scaled values without modifying the dataframe; missing ages stay NaN
scaler.transform(titanic[['Pclass', 'Age', 'Fare']])

array([[1.        , 0.27117366, 0.01415106],
       [0.        , 0.4722292 , 0.13913574],
       [1.        , 0.32143755, 0.01546857],
       ...,
       [1.        ,        nan, 0.04577135],
       [0.        , 0.32143755, 0.0585561 ],
       [1.        , 0.39683338, 0.01512699]])

In [12]:
# Replace the three numeric columns with their min-max scaled values.
# NOTE: run this cell only once per kernel session - the scaler was fitted on the raw
# values, so running it again would rescale data that has already been scaled.
scaled_cols = ['Pclass', 'Age', 'Fare']
titanic[scaled_cols] = scaler.transform(titanic[scaled_cols])

### Data Imputation: dealing with missing values

In [13]:
from sklearn.impute import SimpleImputer

Missing values can be imputed (replaced) with: a constant value, the mean, the median, most frequent value, or a value estimated by a machine learning model

In [14]:
# count the missing values in each column
titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [15]:
# Mean imputation for a numeric column: every missing entry is replaced by the column mean.
imputer_numeric = SimpleImputer(strategy='mean')

# fit_transform learns the mean and fills the gaps in one call; the filled array is only
# displayed here, the 'Age' column of titanic itself still contains its missing values
imputer_numeric.fit_transform(titanic[['Age']])

array([[0.27117366],
       [0.4722292 ],
       [0.32143755],
       [0.43453129],
       [0.43453129],
       [0.36792055],
       [0.67328474],
       [0.01985423],
       [0.33400352],
       [0.17064589],
       [0.04498618],
       [0.72354863],
       [0.24604172],
       [0.48479517],
       [0.17064589],
       [0.68585072],
       [0.01985423],
       [0.36792055],
       [0.3842674 ],
       [0.36792055],
       [0.43453129],
       [0.42196532],
       [0.18321186],
       [0.34656949],
       [0.09525006],
       [0.4722292 ],
       [0.36792055],
       [0.23347575],
       [0.36792055],
       [0.36792055],
       [0.49736115],
       [0.36792055],
       [0.36792055],
       [0.8240764 ],
       [0.34656949],
       [0.52249309],
       [0.36792055],
       [0.25860769],
       [0.22090978],
       [0.17064589],
       [0.49736115],
       [0.33400352],
       [0.36792055],
       [0.03242021],
       [0.23347575],
       [0.36792055],
       [0.36792055],
       [0.367

In [16]:
# Mode imputation for a categorical column: every missing entry is replaced by the
# most frequent value of that column.
imputer_categorical = SimpleImputer(strategy='most_frequent')

# fit_transform learns the most frequent value and fills the gaps in one call;
# the filled array is only displayed here, titanic itself is not modified
imputer_categorical.fit_transform(titanic[['Embarked']])

array([['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['Q'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['Q'],
       ['S'],
       ['C'],
       ['C'],
       ['Q'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['Q'],
       ['S'],
       ['Q'],
       ['Q'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['C'],
       ['C'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
       ['S'],
      

### Polynomial Features / Interaction Features

Feature engineering like this can help a linear model learn non-linear relationships

Polynomial Features
- Polynomial features are new features created by raising existing features to an exponent
- Polynomial features can be created manually or with sklearn PolynomialFeatures
- can be different degrees
    - d0 = 1 (bias)
    - d1 = x (original feature)
    - d2 = x^2 (original feature squared)
    - d3 = x^3 (original feature cubed)
    - ...

Interaction Features
- Interaction features are new features created by interacting existing features
- Interaction features can be created manually or with sklearn PolynomialFeatures
- can be different degrees
    - d0 = 1 (bias)
    - d1 = x (original feature)
    - d2 = x*y (original feature multiplied by another feature)
    - d3 = x^2*y (original feature squared multiplied by another feature)
    - ...


In [17]:
from sklearn.preprocessing import PolynomialFeatures

In [18]:
# keep only the rows that have an age (rows with a missing 'Age' are discarded);
# the result is reassigned instead of using inplace=True, which pandas discourages
titanic = titanic.dropna(subset=['Age'])

In [19]:
# Expand the three numeric columns into every polynomial term up to degree 2.
polyfeat = PolynomialFeatures(degree=2)

# degree 2 on 3 input features yields 10 output columns:
# 1 bias term + 3 original features + 3 squared features + 3 pairwise interaction features
polyfeat.fit_transform(titanic[['Pclass', 'Age', 'Fare']])

array([[1.00000000e+00, 1.00000000e+00, 2.71173662e-01, ...,
        7.35351548e-02, 3.83739410e-03, 2.00252430e-04],
       [1.00000000e+00, 0.00000000e+00, 4.72229203e-01, ...,
        2.23000420e-01, 6.57039575e-02, 1.93587529e-02],
       [1.00000000e+00, 1.00000000e+00, 3.21437547e-01, ...,
        1.03322097e-01, 4.97217914e-03, 2.39276652e-04],
       ...,
       [1.00000000e+00, 0.00000000e+00, 2.33475748e-01, ...,
        5.45109248e-02, 1.36714293e-02, 3.42881688e-03],
       [1.00000000e+00, 0.00000000e+00, 3.21437547e-01, ...,
        1.03322097e-01, 1.88221292e-02, 3.42881688e-03],
       [1.00000000e+00, 1.00000000e+00, 3.96833375e-01, ...,
        1.57476728e-01, 6.00289552e-03, 2.28825904e-04]])

In [20]:
# get the names of the generated polynomial features ('1' is the bias column,
# a name such as 'Pclass Age' is the product of the two listed input features)
polyfeat.get_feature_names_out(['Pclass', 'Age', 'Fare'])

array(['1', 'Pclass', 'Age', 'Fare', 'Pclass^2', 'Pclass Age',
       'Pclass Fare', 'Age^2', 'Age Fare', 'Fare^2'], dtype=object)