In [1]:
# modules we'll use
import pandas as pd
import numpy as np

# for Box-Cox Transformation
from scipy import stats

# for min_max scaling
from mlxtend.preprocessing import minmax_scaling

# plotting modules
import seaborn as sns
import matplotlib.pyplot as plt

# fix the global NumPy RNG state so every run draws the same random samples
SEED = 0
np.random.seed(SEED)

## Difference between Scaling and Normalisation

- Scaling: Changing the range of the data.
    - For example, changing the range from 0-8 to 0-1.

- Normalisation: Changing the shape of the distribution of the data.
    - The data is transformed so that it can be described by a normal distribution.
    - Normal Distribution:
        - About the same number of observations fall on either side of the mean (which equals the median).
        - More observations are closer to the mean.
        - Also known as the Bell Curve or the Gaussian Distribution.

Normalisation becomes necessary if we are using ML algorithms that assume that the data is normally distributed.

Example of Scaling (from Kaggle):

In [None]:
# draw 1000 random samples from an exponential distribution
original_data = np.random.exponential(size=1000)

# min-max scale the samples so they lie between 0 and 1
scaled_data = minmax_scaling(original_data, columns=[0])

# show the raw and the rescaled histograms side by side for comparison
fig, ax = plt.subplots(1, 2, figsize=(15, 3))
panels = (
    (original_data, "Original Data"),
    (scaled_data, "Scaled data"),
)
for axis, (values, title) in zip(ax, panels):
    sns.histplot(values, ax=axis, kde=True, legend=False)
    axis.set_title(title)
plt.show()

Example of Normalisation:

In [None]:
# apply a Box-Cox transformation to the exponential samples;
# the result is indexed below, with element 0 holding the transformed values
normalized_data = stats.boxcox(original_data)

# left panel: untouched samples, right panel: Box-Cox transformed samples
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 3))

sns.histplot(data=original_data, kde=True, legend=False, ax=ax[0])
ax[0].set_title("Original Data")

sns.histplot(data=normalized_data[0], kde=True, legend=False, ax=ax[1])
ax[1].set_title("Normalized data")

plt.show()