In [11]:
import pandas as pd
import numpy as np

In [12]:
# ------------------------------------------------------------------
# Restrict the cleaned dataset to flights whose origin state is WA
# ------------------------------------------------------------------

# Source file: machine-specific absolute path (adjust when running elsewhere)
df = pd.read_csv(r'D:\Project DS Final\2024\preprocessing\data_clean_final_2024.csv')

# Boolean mask on the origin-state abbreviation; the kept rows are
# renumbered from 0 so the subset has a contiguous index.
df_wa = df.loc[df['ORIGIN_STATE_ABR'] == 'WA'].reset_index(drop=True)

# Quick sanity check: row count and a preview of the subset
print("Number of flights originating from WA:", len(df_wa))
print(df_wa.head())

# Persist the subset; the feature-selection cell reads this file back
df_wa.to_csv("clean_data_WA.csv", index=False)
print("✅ Saved filtered dataset as 'clean_data_WA.csv'")


Number of flights originating from WA: 49168
   MONTH  DAY_OF_MONTH  DAY_OF_WEEK OP_UNIQUE_CARRIER ORIGIN ORIGIN_STATE_ABR  \
0      4             1            1                AA    GEG               WA   
1      4             1            1                AA    GEG               WA   
2      4             1            1                AA    GEG               WA   
3      4             1            1                AA    GEG               WA   
4      4             1            1                AA    SEA               WA   

  DEST DEST_STATE_ABR  CRS_DEP_TIME  DEP_DELAY  DEP_DELAY_NEW  DEP_DEL15  \
0  DFW             TX          5.25       11.0           11.0        0.0   
1  DFW             TX         13.32       12.0           12.0        0.0   
2  PHX             AZ          6.75       -6.0            0.0        0.0   
3  PHX             AZ         15.12       -6.0            0.0        0.0   
4  CLT             NC          6.00       -2.0            0.0        0.0   

   DEP_DELA

In [13]:
# ========================================
# Keep only the columns used for encoding
# ========================================

import pandas as pd

# Load the WA-only dataset written by the filtering cell
df = pd.read_csv("clean_data_WA.csv")

# Columns carried forward, listed in output order
selected_features = (
    ['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK']
    + ['OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST']
    + ['CRS_DEP_TIME', 'TAXI_OUT', 'CRS_ELAPSED_TIME', 'DISTANCE']
    + ['DEP_DELAY']
)

# Column projection; raises KeyError if any listed column is absent
df_selected = df.loc[:, selected_features]

# Write the reduced table, then report what was saved
df_selected.to_csv("clean_data_encode.csv", index=False)

print("✅ Saved dataset with selected features as 'clean_data_encode.csv'")
print("Shape:", df_selected.shape)
print(df_selected.head())


PermissionError: [Errno 13] Permission denied: 'clean_data_encode.csv'

In [None]:
# ==========================================
# Cyclical + One-Hot Encoding for Time Features
# ==========================================

In [None]:
# Load the feature-selected table (the file written by the selection cell)
df = pd.read_csv(filepath_or_buffer="clean_data_encode.csv")

In [None]:
# ---- 1-3. Cyclical (sin/cos) encoding of the periodic time columns ----
# Each column is projected onto a circle using the period paired with it:
#   DAY_OF_MONTH -> 31, DAY_OF_WEEK -> 7, CRS_DEP_TIME -> 24.
# No range check is made here; CRS_DEP_TIME is presumably already expressed
# in hours within [0, 24) -- TODO confirm upstream.
for col, period in [('DAY_OF_MONTH', 31), ('DAY_OF_WEEK', 7), ('CRS_DEP_TIME', 24)]:
    angle = 2 * np.pi * df[col] / period
    df[col + '_sin'] = np.sin(angle)
    df[col + '_cos'] = np.cos(angle)

# ---- 4. One-Hot encoding for MONTH (one 0/1 column per distinct month) ----
month_dummies = pd.get_dummies(df['MONTH'], prefix='MONTH', dtype=int)

# ---- 5-6. Append the month indicators, then drop the raw time columns
#           that the encoded columns replace ----
df_encoded = pd.concat([df, month_dummies], axis=1).drop(
    columns=['MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'CRS_DEP_TIME']
)

# ---- 7. Persist the result and show a short summary ----
df_encoded.to_csv("encoded_time_features.csv", index=False)

print("✅ Time features encoded successfully!")
print("New shape:", df_encoded.shape)
print(df_encoded.head())

✅ Time features encoded successfully!
New shape: (49168, 16)
  OP_UNIQUE_CARRIER ORIGIN DEST  TAXI_OUT  CRS_ELAPSED_TIME  DISTANCE  \
0                AA    GEG  DFW      13.0               211      1477   
1                AA    GEG  DFW      16.0               223      1477   
2                AA    GEG  PHX      13.0               173      1020   
3                AA    GEG  PHX      13.0               173      1020   
4                AA    SEA  CLT      29.0               306      2279   

   DEP_DELAY  DAY_OF_MONTH_sin  DAY_OF_MONTH_cos  DAY_OF_WEEK_sin  \
0       11.0          0.201299           0.97953         0.781831   
1       12.0          0.201299           0.97953         0.781831   
2       -6.0          0.201299           0.97953         0.781831   
3       -6.0          0.201299           0.97953         0.781831   
4       -2.0          0.201299           0.97953         0.781831   

   DAY_OF_WEEK_cos  CRS_DEP_TIME_sin  CRS_DEP_TIME_cos  MONTH_4  MONTH_5  \
0        

In [None]:
# ==========================================
# Tally flights per DEST and write the table to CSV
# ==========================================

import pandas as pd

# Load the time-encoded dataset
df = pd.read_csv("encoded_time_features.csv")

# Build a two-column frame (DEST, count), ordered from the rarest
# destination to the most frequent one. The row labels keep the rank
# assigned by value_counts (0 = most frequent).
dest_counts = (
    df['DEST']
    .value_counts()
    .rename_axis('DEST')
    .reset_index(name='count')
    .sort_values(by='count', ascending=True)
)

# Write the frequency table without the row labels
dest_counts.to_csv("dest_value_counts.csv", index=False)

print("✅ Saved DEST frequency statistics to 'dest_value_counts.csv'")
print(dest_counts.head())
print("\nTop DESTs:\n", dest_counts.tail())


✅ Saved DEST frequency statistics to 'dest_value_counts.csv'
   DEST  count
84  JAC     22
83  RSW     37
82  GTF     45
81  MRY     58
80  MSO     74

Top DESTs:
   DEST  count
4  LAS   2158
3  SEA   2176
2  LAX   2182
1  PHX   2335
0  DEN   2478


In [None]:
# ==========================================
# Drop rare destinations: keep rows whose DEST occurs more than 789 times
# ==========================================
import pandas as pd

# Load the time-encoded dataset
df = pd.read_csv("encoded_time_features.csv")

# Flights per destination, most frequent first
dest_counts = df['DEST'].value_counts()

# Destinations strictly above the 789-row cut-off
valid_dests = dest_counts.loc[dest_counts.gt(789)].index

# Keep matching rows and renumber them from 0
df_filtered = df.loc[df['DEST'].isin(valid_dests)].reset_index(drop=True)

# Persist the reduced table
df_filtered.to_csv("clean_data_dest_filtered.csv", index=False)

print("✅ Filtered dataset saved as 'clean_data_dest_filtered.csv'")
print("Number of DEST kept:", len(valid_dests))
print("Shape after filtering:", df_filtered.shape)
print("DEST kept:\n", valid_dests.tolist())


✅ Filtered dataset saved as 'clean_data_dest_filtered.csv'
Number of DEST kept: 21
Shape after filtering: (32002, 16)
DEST kept:
 ['DEN', 'PHX', 'LAX', 'SEA', 'LAS', 'ANC', 'SLC', 'SFO', 'PDX', 'ORD', 'DFW', 'BOI', 'SAN', 'GEG', 'SJC', 'SMF', 'ATL', 'MSP', 'SNA', 'OAK', 'JFK']


In [None]:
# ==========================================
# One-Hot Encoding for categorical features:
# OP_UNIQUE_CARRIER, ORIGIN, DEST
# ==========================================

import pandas as pd

# Load the dataset already restricted to frequent destinations
df = pd.read_csv("clean_data_dest_filtered.csv")

# ---- Allowed categories ----
# Rows holding any value outside these lists are discarded below.
carrier_list = ['AA', 'AS', 'B6', 'DL', 'F9', 'G4', 'HA', 'MQ', 'NK', 'OO', 'UA', 'WN']
origin_list  = ['ALW', 'BLI', 'GEG', 'PAE', 'PSC', 'SEA']
dest_list    = ['JFK', 'OAK', 'SNA', 'MSP', 'ATL', 'SMF', 'SJC', 'GEG', 'SAN', 'BOI',
                'DFW', 'ORD', 'PDX', 'SFO', 'SLC', 'ANC', 'LAS', 'SEA', 'LAX', 'PHX', 'DEN']

# ---- Keep only rows whose three categorical values are all allowed ----
# A single combined mask selects the same rows as filtering column by column.
known_mask = (
    df['OP_UNIQUE_CARRIER'].isin(carrier_list)
    & df['ORIGIN'].isin(origin_list)
    & df['DEST'].isin(dest_list)
)
df = df[known_mask]

# ---- One-Hot Encoding ----
# get_dummies creates a column only for values that actually occur in the
# data, so an allowed category with no rows produces no column.
df_encoded = pd.get_dummies(
    df,
    columns=['OP_UNIQUE_CARRIER', 'ORIGIN', 'DEST'],
    prefix=['CARRIER', 'ORIGIN', 'DEST'],
    dtype=int  # ensure 0/1 instead of True/False
)

# ---- Save encoded dataset ----
df_encoded.to_csv("clean_data_encoded_onehot.csv", index=False)

print("✅ One-Hot encoding completed successfully!")
print("New shape:", df_encoded.shape)
print("Encoded columns:")
# Match on the column-name prefix rather than on any substring, so a column
# that merely contains e.g. 'DEST_' somewhere in its name is not listed.
encoded_prefixes = ('CARRIER_', 'ORIGIN_', 'DEST_')
print([col for col in df_encoded.columns if col.startswith(encoded_prefixes)])


✅ One-Hot encoding completed successfully!
New shape: (32002, 51)
Encoded columns:
['CARRIER_AA', 'CARRIER_AS', 'CARRIER_B6', 'CARRIER_DL', 'CARRIER_F9', 'CARRIER_G4', 'CARRIER_MQ', 'CARRIER_NK', 'CARRIER_OO', 'CARRIER_UA', 'CARRIER_WN', 'ORIGIN_ALW', 'ORIGIN_BLI', 'ORIGIN_GEG', 'ORIGIN_PAE', 'ORIGIN_PSC', 'ORIGIN_SEA', 'DEST_ANC', 'DEST_ATL', 'DEST_BOI', 'DEST_DEN', 'DEST_DFW', 'DEST_GEG', 'DEST_JFK', 'DEST_LAS', 'DEST_LAX', 'DEST_MSP', 'DEST_OAK', 'DEST_ORD', 'DEST_PDX', 'DEST_PHX', 'DEST_SAN', 'DEST_SEA', 'DEST_SFO', 'DEST_SJC', 'DEST_SLC', 'DEST_SMF', 'DEST_SNA']


In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the one-hot encoded dataset
df = pd.read_csv("clean_data_encoded_onehot.csv")

# Columns to standardise; every other column is left as it is
numeric_cols = ['TAXI_OUT', 'CRS_ELAPSED_TIME', 'DISTANCE']

# Learn the scaling statistics on the whole table, then apply them in place.
# NOTE(review): the scaler is fitted on all rows; if a train/test split is
# made later, its statistics include the test rows -- confirm this is intended.
scaler = StandardScaler().fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

print("✅ Scaled numeric features using StandardScaler")
df.to_csv("clean_data_scaled.csv", index=False)


✅ Scaled numeric features using StandardScaler


In [36]:
# ====================================================
# Remove all rows where DEP_DELAY exceeds MAX_DEP_DELAY
# ====================================================

import pandas as pd

# Largest DEP_DELAY value (inclusive) a row may have and still be kept.
# Defined once so the filter and the log message always agree: previously the
# comments and message said "> 300" while the filter actually used 44.
MAX_DEP_DELAY = 44

# Load the scaled dataset
df = pd.read_csv("clean_data_scaled.csv")

# Display original shape
print("Original dataset shape:", df.shape)

# Keep rows at or below the threshold. Rows whose DEP_DELAY is missing (NaN)
# fail the comparison and are dropped as well.
filtered_df = df[df["DEP_DELAY"] <= MAX_DEP_DELAY]

# Display new shape after filtering
print("Filtered dataset shape:", filtered_df.shape)

# Save the cleaned dataset to a new file
filtered_df.to_csv("clean_data_scaled_new.csv", index=False)

print(f"✅ Data with DEP_DELAY > {MAX_DEP_DELAY} removed and saved to 'clean_data_scaled_new.csv'")


Original dataset shape: (32002, 51)
Filtered dataset shape: (29537, 51)
✅ Data with DEP_DELAY > 300 removed and saved to 'clean_data_scaled_new.csv'
