In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Columns carried into the classification dataset, listed group by group.
# The last entry, DEP_DEL15, is the column later split off as the target `y`.
calendar_fields = ["MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK"]
route_fields = ["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"]
schedule_fields = ["CRS_DEP_TIME", "CRS_ARR_TIME", "CRS_ELAPSED_TIME", "DISTANCE"]
hourly_fields = [
    "HourlyDewPointTemperature",
    "HourlyDryBulbTemperature",
    "HourlyRelativeHumidity",
    "HourlyVisibility",
    "HourlyWindSpeed",
]
selected_columns = (
    calendar_fields + route_fields + schedule_fields + hourly_fields + ["DEP_DEL15"]
)

# Read the full cleaned file, then work on an independent copy of the subset so
# later in-place column assignments do not touch `df`.
source_csv = r"D:\Project DS Final\Data\final_clean_data.csv"
df = pd.read_csv(source_csv)
df_classifi = df.loc[:, selected_columns].copy()

print("Shape:", df_classifi.shape)

Shape: (45155, 16)


In [3]:
label_cols = ["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"]

# Keep one fitted encoder per column. Re-fitting a single shared LabelEncoder
# inside the loop overwrites its classes_ each time, so only the last column's
# mapping would survive and the other columns could never be decoded
# (inverse_transform) or encoded consistently for new data.
# Codes are assigned independently per column, so the same string appearing in
# ORIGIN and DEST does not necessarily receive the same integer.
encoders = {}
for col in label_cols:
    encoders[col] = LabelEncoder()
    df_classifi[col] = encoders[col].fit_transform(df_classifi[col])

# Backward compatibility: `le` used to end up fitted on the last column of
# label_cols, so keep that name bound to the equivalent encoder.
le = encoders[label_cols[-1]]

display(df_classifi.head())

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,ORIGIN,DEST,CRS_DEP_TIME,CRS_ARR_TIME,CRS_ELAPSED_TIME,DISTANCE,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyRelativeHumidity,HourlyVisibility,HourlyWindSpeed,DEP_DEL15
0,4,1,2,0,1,62,6.53,9.33,168.0,1020.0,33.0,36.0,89.0,5.0,10.0,0.0
1,4,1,2,0,1,62,15.45,18.12,160.0,1020.0,33.0,45.0,63.0,10.0,10.0,1.0
2,4,1,2,0,4,14,7.33,15.32,299.0,2279.0,37.0,43.0,80.0,10.0,6.0,0.0
3,4,1,2,0,4,14,13.83,21.83,300.0,2279.0,41.0,49.0,74.0,10.0,6.0,0.0
4,4,1,2,0,4,14,22.0,6.0,300.0,2279.0,41.0,45.0,86.0,10.0,7.0,0.0


In [4]:
# Continuous features to standardise (zero mean, unit variance per column).
scale_cols = [
    "CRS_DEP_TIME",
    "CRS_ARR_TIME",
    "CRS_ELAPSED_TIME",
    "DISTANCE",
    "HourlyDewPointTemperature",
    "HourlyDryBulbTemperature",
    "HourlyRelativeHumidity",
    "HourlyVisibility",
    "HourlyWindSpeed",
]

# Fit on the whole frame, then overwrite the original columns with the
# standardised values. `scaler` stays available for inverse_transform.
unscaled_block = df_classifi[scale_cols]
scaler = StandardScaler()
scaler.fit(unscaled_block)
scaled = scaler.transform(unscaled_block)
df_classifi[scale_cols] = scaled

In [5]:
X = df_classifi.drop("DEP_DEL15", axis=1)
y = df_classifi["DEP_DEL15"]

# Features that hold category codes or calendar integers rather than
# continuous measurements. Plain SMOTE interpolates every feature linearly
# between neighbours, which would produce fractional carrier/airport codes and
# fractional months/days. SMOTENC interpolates only the continuous features and
# assigns each categorical feature the most frequent category among the
# nearest neighbours.
categorical_cols = [
    "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "OP_UNIQUE_CARRIER", "ORIGIN", "DEST",
]
# Positional indices are accepted by every SMOTENC release; column names only
# by newer ones.
categorical_idx = [X.columns.get_loc(c) for c in categorical_cols]

# NOTE(review): the entire frame is resampled here and no train/test split is
# visible before this point, so any hold-out set drawn later from the saved
# file will contain synthetic rows -- confirm how downstream notebooks split.
sm = SMOTENC(categorical_features=categorical_idx, random_state=42)
X_res, y_res = sm.fit_resample(X, y)

# Rebuild a single frame with the target as the last column.
df_classifi = pd.concat(
    [pd.DataFrame(X_res, columns=X.columns), y_res.reset_index(drop=True)],
    axis=1
)

# The shape was previously a bare mid-cell expression, which displays nothing;
# print it explicitly instead.
print("Resampled shape:", df_classifi.shape)
print("Class distribution:\n", y_res.value_counts(normalize=True))

Class distribution:
 DEP_DEL15
0.0    0.5
1.0    0.5
Name: proportion, dtype: float64


In [6]:
num_cols = [
    "CRS_DEP_TIME",
    "CRS_ARR_TIME",
    "CRS_ELAPSED_TIME",
    "DISTANCE",
    "HourlyDewPointTemperature",
    "HourlyDryBulbTemperature",
    "HourlyRelativeHumidity",
    "HourlyVisibility",
    "HourlyWindSpeed",
]

# Clip every numeric column to [Q1 - 1.75*IQR, Q3 + 1.75*IQR], with the
# quartiles computed per column. A factor of 1.75 is used instead of 1.5 so
# that the clipping is slightly gentler.
quartiles = df_classifi[num_cols].quantile([0.25, 0.75])
first_quartile = quartiles.loc[0.25]
third_quartile = quartiles.loc[0.75]
spread = third_quartile - first_quartile

# axis=1 aligns the per-column bounds (indexed by column name) with the columns.
df_classifi[num_cols] = df_classifi[num_cols].clip(
    lower=first_quartile - 1.75 * spread,
    upper=third_quartile + 1.75 * spread,
    axis=1,
)

In [7]:
# Persist the processed classification dataset without the index column.
output_csv = r"D:\Project DS Final\Data\classi_mod_data.csv"
df_classifi.to_csv(output_csv, index=False)