In [2]:
# --- Import libraries ---
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --- Read the full cleaned dataset from disk ---
df = pd.read_csv(r"D:\Project DS Final\2025\check\final_clean_data.csv")

# --- Columns kept for the classification task (order defines the output column order) ---
selected_columns = (
    ["MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK"]                              # date fields
    + ["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"]                             # carrier / origin / destination
    + ["CRS_DEP_TIME", "CRS_ARR_TIME", "CRS_ELAPSED_TIME", "DISTANCE"]    # CRS_* fields and distance
    + [
        "HourlyDewPointTemperature", "HourlyDryBulbTemperature",
        "HourlyRelativeHumidity", "HourlyVisibility", "HourlyWindSpeed",
    ]                                                                     # Hourly* weather readings
    + ["DEP_DEL15"]                                                       # target column
)

# --- Take an independent copy restricted to those columns ---
# .copy() detaches the result from `df`, so later edits to one do not touch the other.
df_classification = df.loc[:, selected_columns].copy()

# --- Quick sanity check: dimensions and column names ---
print("✅ New dataset shape:", df_classification.shape)
print("Columns:", df_classification.columns.tolist())

# --- Persist the reduced dataset (row index is not written) ---
df_classification.to_csv("classification_data.csv", index=False)
print("📁 Saved as classification_data.csv")


✅ New dataset shape: (50029, 16)
Columns: ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST', 'CRS_DEP_TIME', 'CRS_ARR_TIME', 'CRS_ELAPSED_TIME', 'DISTANCE', 'HourlyDewPointTemperature', 'HourlyDryBulbTemperature', 'HourlyRelativeHumidity', 'HourlyVisibility', 'HourlyWindSpeed', 'DEP_DEL15']
📁 Saved as classification_data.csv


In [3]:
# --- Load your dataset ---
df = pd.read_csv("classification_data.csv")

# --- MONTH, DAY_OF_MONTH & DAY_OF_WEEK are kept as-is ---
# They are neither label-encoded nor scaled below; if they are already numeric
# (e.g., 4,5,6 for MONTH) no change is needed.

# --- Label encode categorical columns ---
# A separate fitted encoder is kept for every column. Re-fitting one shared
# encoder would leave only the last column's mapping available, so the integer
# codes of the other columns could not be decoded (inverse_transform) or
# re-applied to new data later.
# Note: each column gets its own mapping, so equal codes in ORIGIN and DEST
# do not necessarily stand for the same original value.
label_cols = ["OP_UNIQUE_CARRIER", "ORIGIN", "DEST"]
label_encoders = {}  # column name -> LabelEncoder fitted on that column

for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# After the loop `le` still refers to the encoder fitted on the last column
# in label_cols, as it did when a single encoder was reused.

# --- Scale numeric columns ---
scale_cols = [
    "CRS_DEP_TIME","CRS_ARR_TIME","CRS_ELAPSED_TIME","DISTANCE",
    "HourlyDewPointTemperature","HourlyDryBulbTemperature",
    "HourlyRelativeHumidity","HourlyVisibility","HourlyWindSpeed"
]

# NOTE(review): the scaler is fitted on every row of the loaded file. If a
# train/test split is performed later, the test rows will already have
# influenced the scaling statistics — confirm whether fitting should be
# restricted to the training rows.
scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])

# --- Save encoded dataset ---
df.to_csv("classification_encoded.csv", index=False)
print("✅ Encoded and scaled dataset saved as classification_encoded.csv")
print(df.head())


✅ Encoded and scaled dataset saved as classification_encoded.csv
   MONTH  DAY_OF_MONTH  DAY_OF_WEEK  OP_UNIQUE_CARRIER  ORIGIN  DEST  \
0      4             1            2                  0       1    62   
1      4             1            2                  0       1    62   
2      4             1            2                  0       4    14   
3      4             1            2                  0       4    14   
4      4             1            2                  0       4    14   

   CRS_DEP_TIME  CRS_ARR_TIME  CRS_ELAPSED_TIME  DISTANCE  \
0     -1.371023     -1.053675         -0.190881 -0.222270   
1      0.322520      0.511390         -0.285164 -0.222270   
2     -1.219136      0.012848          1.353008  1.492567   
3      0.014948      1.171957          1.364794  1.492567   
4      1.566096     -1.646583          1.364794  1.492567   

   HourlyDewPointTemperature  HourlyDryBulbTemperature  \
0                  -1.469068                 -2.530912   
1                  