In [None]:
import pandas as pd

# Location of the Titanic passenger CSV, relative to this notebook
titanic_csv_path = '../../Data/titanic.csv'

# Load the dataset and show it as the cell output
titanic = pd.read_csv(titanic_csv_path)
titanic

## Encoding categorical features (non-numeric features)

In [None]:
# ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

# create ordinal encoder instance
encoder = OrdinalEncoder()

# fit the encoder and transform the column in one step
# (equivalent to calling .fit() followed by .transform() on the same data;
# the encoder stays fitted afterwards)
sex_encoded = encoder.fit_transform(titanic[['Sex']])

# OrdinalEncoder numbers the categories in sorted order, so with the values
# 'female' and 'male' the mapping is female = 0, male = 1
# (check encoder.categories_ to confirm the order for your data)
sex_encoded

In [None]:
# create ordinal encoder instance
encoder = OrdinalEncoder()

# columns to encode; each column gets its own, independent integer mapping
categorical_cols = ['Sex', 'Embarked']

# fit only learns the categories of each column (no data is transformed yet)
encoder.fit(titanic[categorical_cols])

# transform the data: within each column the categories are numbered in sorted
# order, so Sex becomes female = 0, male = 1 (assuming the values are
# 'female'/'male'); the Embarked mapping can be read from encoder.categories_
encoder.transform(titanic[categorical_cols])

In [None]:
# categories learned during fit: one array per encoded column, in column order
encoder.categories_

### Feature types (non numerical)
#### Ordinal: 
- Categorical features that can be sorted or ordered (e.g. educational level: high school, college, graduate school)
- Can be encoded as integers (e.g. 0, 1, 2)
- The sklearn OrdinalEncoder can be used to encode ordinal features
    - it assumes categories are ordered, and the distance between each category is the same

#### Nominal:
- Categorical feature with no specific order
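- Should not be encoded as ordered integers (e.g. 0, 1, 2), because the numbers would imply an order and distance between categories that does not exist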
- The sklearn OneHotEncoder can be used to encode nominal features
    - it creates a new binary feature for each category
    - e.g. if there are 3 categories, it creates 3 binary features

In [None]:
# one hot encoding
from sklearn.preprocessing import OneHotEncoder

# sparse_output=False makes the encoder return a dense numpy array instead of a
# sparse matrix, so the encoded values can be read directly; the output columns
# correspond to the categories listed in encoder.categories_
encoder = OneHotEncoder(sparse_output=False)

In [None]:
# learn the distinct categories of each column; every category becomes its own binary output column
encoder.fit(titanic[['Sex', 'Embarked']])

In [None]:
# the categories learned by the one-hot encoder, one array per input column
encoder.categories_

In [None]:
# one-hot encode both columns using the categories learned by fit()
encoder.transform(titanic[['Sex', 'Embarked']])

## Normalization and Standardization
- Normalization: rescaling numerical features to a range of [0, 1]
- Standardization: rescaling numerical features to have a mean of 0 and a standard deviation of 1

These actions improve the performance of some machine learning models

#### Normalization with sklearn

MinMaxScaler
- The MinMaxScaler rescales features to a range of [0, 1]
- It is sensitive to outliers
- works by subtracting the minimum value and then dividing by the range
- when to use: when the distribution of the feature is not Gaussian or when you want to preserve outliers

MaxAbsScaler
- The MaxAbsScaler rescales features to a range of [-1, 1]
- It is sensitive to outliers
- works by dividing each value by the maximum absolute value in the feature
- when to use: when the distribution of the feature is not Gaussian or when you want to preserve outliers

#### Standardization with sklearn

StandardScaler
- The StandardScaler rescales features to have a mean of 0 and a standard deviation of 1
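- It is also sensitive to outliers, because both the mean and the standard deviation are pulled by extreme values (sklearn's RobustScaler is the outlier-resistant alternative)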
- works by subtracting the mean and then dividing by the standard deviation
- when to use: when the distribution of the feature is Gaussian

In [None]:
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler

In [None]:
# Build the scaler and learn each column's minimum and maximum in one chained
# call (fit() returns the estimator itself), then display the fitted scaler
scaler = MinMaxScaler().fit(titanic[['Pclass', 'Age', 'Fare']])
scaler

In [None]:
# rescale the three columns with the fitted scaler; this returns a new array and leaves `titanic` unchanged
scaler.transform(titanic[['Pclass', 'Age', 'Fare']])

In [None]:
# Write the scaled values into a copy instead of overwriting `titanic`.
# Overwriting the raw frame made this cell non-idempotent (running it twice
# rescales already-scaled values) and silently fed scaled data into the
# imputation and polynomial-feature cells further down, which are demonstrated
# on the original, unscaled values.
scaled_cols = ['Pclass', 'Age', 'Fare']
titanic_scaled = titanic.copy()
titanic_scaled[scaled_cols] = scaler.transform(titanic[scaled_cols])

### Data Imputation: dealing with missing values

In [None]:
from sklearn.impute import SimpleImputer

Missing values can be imputed (replaced) with: a constant value, the mean, the median, most frequent value, or a value estimated by a machine learning model

In [None]:
# count the missing values in each column
titanic.isna().sum()

In [None]:
# Mean imputation for a numeric column: fit() learns the column mean and
# transform() substitutes it for every missing entry
imputer_numeric = SimpleImputer(strategy='mean')

age_column = titanic[['Age']]
imputer_numeric.fit(age_column)
imputer_numeric.transform(age_column)

In [None]:
# Mode imputation for a categorical column: fit() finds the most frequent
# value and transform() substitutes it for every missing entry
imputer_categorical = SimpleImputer(strategy='most_frequent')

embarked_column = titanic[['Embarked']]
imputer_categorical.fit(embarked_column)
imputer_categorical.transform(embarked_column)

### Polynomial Features / Interaction Features

Feature engineering like this can help a linear model learn non-linear relationships

Polynomial Features
- Polynomial features are new features created by raising existing features to an exponent
- Polynomial features can be created manually or with sklearn PolynomialFeatures
- can be different degrees
    - d0 = 1 (bias)
    - d1 = x (original feature)
    - d2 = x^2 (original feature squared)
    - d3 = x^3 (original feature cubed)
    - ...

Interaction Features
- Interaction features are new features created by interacting existing features
- Interaction features can be created manually or with sklearn PolynomialFeatures
- can be different degrees
    - d0 = 1 (bias)
    - d1 = x (original feature)
    - d2 = x*y (original feature multiplied by another feature)
    - d3 = x^2*y (original feature squared multiplied by another feature)
    - ...


In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# keep only the rows with a known Age (PolynomialFeatures cannot handle NaN);
# rebinding the name instead of using inplace=True gives the same result
titanic = titanic.dropna(subset=['Age'])

In [46]:
# A degree-2 expansion of 3 input features yields 10 output columns:
# 1 bias term + 3 original features + 3 squared features + 3 pairwise interactions
poly_input = titanic[['Pclass', 'Age', 'Fare']]

polyfeat = PolynomialFeatures(degree=2)
polyfeat.fit_transform(poly_input)

array([[1.00000000e+00, 3.00000000e+00, 2.20000000e+01, ...,
        4.84000000e+02, 1.59500000e+02, 5.25625000e+01],
       [1.00000000e+00, 1.00000000e+00, 3.80000000e+01, ...,
        1.44400000e+03, 2.70876540e+03, 5.08130886e+03],
       [1.00000000e+00, 3.00000000e+00, 2.60000000e+01, ...,
        6.76000000e+02, 2.06050000e+02, 6.28056250e+01],
       ...,
       [1.00000000e+00, 1.00000000e+00, 1.90000000e+01, ...,
        3.61000000e+02, 5.70000000e+02, 9.00000000e+02],
       [1.00000000e+00, 1.00000000e+00, 2.60000000e+01, ...,
        6.76000000e+02, 7.80000000e+02, 9.00000000e+02],
       [1.00000000e+00, 3.00000000e+00, 3.20000000e+01, ...,
        1.02400000e+03, 2.48000000e+02, 6.00625000e+01]])

In [51]:
# names of the generated columns, in the same order as the transform() output
polyfeat.get_feature_names_out(input_features=['Pclass', 'Age', 'Fare'])

array(['1', 'Pclass', 'Age', 'Fare', 'Pclass^2', 'Pclass Age',
       'Pclass Fare', 'Age^2', 'Age Fare', 'Fare^2'], dtype=object)