# Feature Scaling - Flight cancellation

This notebook explores **2 approaches** to scale numeric features:
1. **StandardScaler** - Standardize features to zero mean and unit variance
2. **MinMaxScaler** - Scale features to a fixed range [0, 1]

Each approach is evaluated using Naive Bayes and KNN classifiers.

In [None]:
from pandas import read_csv, DataFrame, Series
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# --- Notebook configuration -------------------------------------------------
# Column to predict; `target_name` is the label handed to the evaluation helper.
target = "Cancelled"
target_name = "Cancelled"
# Dataset tag used when composing output file names.
file_tag = "flight"
# Metric name forwarded to the evaluation helper.
metric = "f1"
# Lab folder forwarded to the evaluation helper.
lab_folder_sca = "lab3_preparation/scaling"

# --- Load data --------------------------------------------------------------
# Empty strings in the CSV are parsed as missing values.
input_df: DataFrame = read_csv(
    "../../data/prepared/flight_mvi.csv",
    na_values="",
)

FileNotFoundError: [Errno 2] No such file or directory: '../../data/prepared/flight_mvi.csv'

In [None]:
# Approach 1: standardize every feature to zero mean and unit variance.
approach_sca1 = "Standard-Scaler"
data_sca1 = input_df.copy()

# Split off the target so that only the feature columns are scaled.
target_data: Series = data_sca1.pop(target)

transf: StandardScaler = StandardScaler(with_mean=True, with_std=True, copy=True).fit(data_sca1)

# Label the scaled values with the feature columns as they are *after* the pop,
# so names stay aligned with values wherever the target sat in the input frame.
df_zscore = DataFrame(
    transf.transform(data_sca1),
    index=data_sca1.index,
    columns=data_sca1.columns,
)
df_zscore[target] = target_data

# The row index is written as the first CSV column.
# NOTE(review): the original passed index="id", which pandas only treats as a
# truthy flag; use index_label="id" if a named index column was intended.
# NOTE(review): this file name does not use `file_tag`, unlike the other
# outputs of this notebook - confirm the intended name.
df_zscore.to_csv("../../data/prepared/pre_scaled_zscore.csv", index=True)

# Later cells evaluate and save `data_sca1`; point it at the scaled frame
# (features + target) instead of the unscaled, target-less features.
data_sca1 = df_zscore

In [None]:
# Evaluate the first approach under its own label (`approach_sca1`); the
# previous call passed `approach_sca2`, which belongs to the other approach.
# NOTE(review): `evaluate_and_plot` is not imported in the visible cells -
# confirm where it is defined.
evaluate_and_plot(data_sca1, lab_folder_sca, file_tag, approach_sca1, target_name, metric=metric)

In [None]:
# Approach 2: rescale every feature to the fixed range [0, 1].
# This cell owns `approach_sca2`; assigning `approach_sca1` here would overwrite
# the first approach's label and leave `approach_sca2` undefined.
approach_sca2 = "MinMax-Scaler"
data_sca2 = input_df.copy()

# Split off the target so that only the feature columns are scaled.
target_data: Series = data_sca2.pop(target)

transf: MinMaxScaler = MinMaxScaler(feature_range=(0, 1), copy=True).fit(data_sca2)

# Label the scaled values with the feature columns as they are *after* the pop,
# so names stay aligned with values wherever the target sat in the input frame.
df_minmax = DataFrame(
    transf.transform(data_sca2),
    index=data_sca2.index,
    columns=data_sca2.columns,
)
df_minmax[target] = target_data

# The row index is written as the first CSV column.
# NOTE(review): the original passed index="id", which pandas only treats as a
# truthy flag; use index_label="id" if a named index column was intended.
df_minmax.to_csv(f"../../data/prepared/{file_tag}_scaled_minmax.csv", index=True)

# Later cells evaluate and save `data_sca2`; point it at the scaled frame
# (features + target) instead of the unscaled, target-less features.
data_sca2 = df_minmax

In [None]:
# Evaluate the second approach's frame under its own label.
# NOTE(review): `evaluate_and_plot` is not imported in the visible cells -
# confirm where it is defined.
evaluate_and_plot(
    data_sca2,
    lab_folder_sca,
    file_tag,
    approach_sca2,
    target_name,
    metric=metric,
)

In [None]:
# Write one CSV per approach, named after the dataset tag and the approach
# label, without the row index.
data_sca1.to_csv(
    path_or_buf=f"../../data/prepared/{file_tag}_scaling_{approach_sca1}.csv",
    index=False,
)
data_sca2.to_csv(
    path_or_buf=f"../../data/prepared/{file_tag}_scaling_{approach_sca2}.csv",
    index=False,
)

# The first approach's frame is carried forward as the selected result.
# NOTE(review): this choice is hard-coded rather than derived from the
# evaluation results - confirm it matches them.
best_scaling_df = data_sca1.copy()
print(
    f"Best approach: {approach_sca1}",
    f"Shape: {best_scaling_df.shape}",
    sep="\n",
)