In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.preprocessing import LabelEncoder

In [6]:

# CSV inputs for the analysis. Each path is handed to process_file() and the
# resulting frames are concatenated into df_combined. The paths are relative,
# so they resolve against the notebook's working directory.
# NOTE(review): presumably one file per year of flight data, judging by the
# names — confirm against the data source.
files = ["2018.csv", "2019.csv", "2020.csv"]



In [None]:
def process_file(file_path):
    """Load one flight CSV and apply basic cleaning.

    Steps:
      1. Read the CSV into a DataFrame.
      2. Drop the 'Unnamed: 27' column when it is present.
      3. Parse 'FL_DATE' into datetime64.
      4. Replace missing values with 0 in numeric columns only.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Non-numeric columns (e.g. string codes) keep
        their missing values instead of receiving an integer 0.

    Raises
    ------
    KeyError
        If the file has no 'FL_DATE' column.
    """
    df = pd.read_csv(file_path)

    # errors='ignore' makes the drop a no-op when the column is absent.
    # NOTE(review): this column is presumably an artifact of a trailing
    # delimiter in the raw files — confirm.
    df = df.drop(columns=['Unnamed: 27'], errors='ignore')

    df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])

    # Only numeric columns are zero-filled. A blanket fillna(0) would also
    # write the integer 0 into string columns, producing mixed-type columns
    # and a bogus "0" category once those columns are encoded.
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].fillna(0)

    return df

# Run the cleaning routine over every path listed in `files`
dataframes = list(map(process_file, files))

# Stack the per-file frames vertically under a fresh 0..n-1 index
df_combined = pd.concat(dataframes, ignore_index=True)

# Preview the leading rows of the merged frame
print(df_combined.head())

# Summarise dtypes and non-null counts as a sanity check on the cleaning
df_combined.info()


In [None]:


# Replace each categorical column with integer codes. One fitted encoder per
# column is kept in `label_encoders` so the codes can be mapped back later.
categorical_cols = ['OP_CARRIER', 'ORIGIN', 'DEST', 'CANCELLATION_CODE']
label_encoders = {name: LabelEncoder() for name in categorical_cols}

for col, le in label_encoders.items():
    # Values are cast to str first so the encoder sees one uniform type
    as_text = df_combined[col].astype(str)
    df_combined[col] = le.fit_transform(as_text)

# Preview the leading rows after encoding
print(df_combined.head())

# Summarise dtypes and non-null counts to confirm the conversion
df_combined.info()


In [10]:
# Coerce ARR_DELAY to numbers: unparseable entries become NaN, then 0
df_combined['ARR_DELAY'] = (
    pd.to_numeric(df_combined['ARR_DELAY'], errors='coerce')
    .fillna(0)
)

# Binary target: 1 when the arrival delay is strictly greater than 15, else 0
df_combined['IS_DELAY'] = df_combined['ARR_DELAY'].gt(15).astype(int)

# Preview the leading rows with the new column
print(df_combined.head())

# Summarise dtypes and non-null counts to confirm the result
df_combined.info()

     FL_DATE  OP_CARRIER  OP_CARRIER_FL_NUM  ORIGIN  DEST  CRS_DEP_TIME  \
0 2018-01-01          13               2429     119    96        1517.0   
1 2018-01-01          13               2427     192   316        1115.0   
2 2018-01-01          13               2426     330    96        1335.0   
3 2018-01-01          13               2425     302   253        1546.0   
4 2018-01-01          13               2424     253    14         630.0   

   DEP_TIME  DEP_DELAY  TAXI_OUT  WHEELS_OFF  ...  AIR_TIME  DISTANCE  \
0    1512.0       -5.0      15.0      1527.0  ...     225.0    1605.0   
1    1107.0       -8.0      11.0      1118.0  ...      65.0     414.0   
2    1330.0       -5.0      15.0      1345.0  ...     106.0     846.0   
3    1552.0        6.0      19.0      1611.0  ...     157.0    1120.0   
4     650.0       20.0      13.0       703.0  ...      83.0     723.0   

   CARRIER_DELAY  WEATHER_DELAY  NAS_DELAY  SECURITY_DELAY  \
0            0.0            0.0        0.0      