In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.preprocessing import LabelEncoder

In [6]:

# Input CSV files, one per year; the paths are relative, so they are resolved
# against the notebook's current working directory when read.
files = ["2018.csv", "2019.csv", "2020.csv"]



In [None]:
def process_file(file_path):
    """Load one flight-data CSV and apply basic cleaning.

    Steps:
      1. Read the CSV with ``pd.read_csv``.
      2. Drop the ``'Unnamed: 27'`` column when it is present.
      3. Parse ``FL_DATE`` into datetimes.
      4. Replace missing values in *numeric* columns with 0.

    Parameters
    ----------
    file_path : str, path-like, or file-like
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Missing values in non-numeric columns (for
        example text codes) are left as missing rather than being replaced
        by the integer 0, so those columns keep a single consistent type.

    Raises
    ------
    KeyError
        If the file has no ``FL_DATE`` column.

    Notes
    -----
    NOTE(review): zero-filling also applies to ``ARR_DELAY``; the notebook
    later derives ``IS_DELAY`` from ``ARR_DELAY > 15``, so rows whose arrival
    delay was missing end up counted as not delayed. Confirm this is intended.
    """
    df = pd.read_csv(file_path)

    # errors='ignore' makes this a no-op when the column is absent
    # (the original note describes this column as completely empty).
    df = df.drop(columns=['Unnamed: 27'], errors='ignore')

    # Convert FL_DATE to datetime format
    df['FL_DATE'] = pd.to_datetime(df['FL_DATE'])

    # Fill missing values with 0 in numeric columns only; a frame-wide
    # fillna(0) would also inject the integer 0 into text columns.
    numeric_cols = df.select_dtypes(include='number').columns
    if len(numeric_cols) > 0:
        df[numeric_cols] = df[numeric_cols].fillna(0)

    return df

# Load and clean every file listed in `files`, keeping one frame per file
dataframes = list(map(process_file, files))

# Stack the per-year frames into a single DataFrame with a fresh 0..n-1 index
df_combined = pd.concat(dataframes, ignore_index=True)

# Preview the leading rows of the combined frame
print(df_combined.head())

# Summarise columns and dtypes to sanity-check the cleaning step
df_combined.info()


In [None]:


# Integer-encode the categorical columns, using one LabelEncoder per column
categorical_cols = ['OP_CARRIER', 'ORIGIN', 'DEST', 'CANCELLATION_CODE']

# Keep every encoder, keyed by column name, so the fitted mapping stays available
label_encoders = {name: LabelEncoder() for name in categorical_cols}

# NOTE(review): the columns of df_combined are overwritten in place, so running
# this cell a second time would fit on the already-encoded integers.
for name, encoder in label_encoders.items():
    # Values are cast to str before fitting
    df_combined[name] = encoder.fit_transform(df_combined[name].astype(str))

# Preview the leading rows after encoding
print(df_combined.head())

# Summarise columns and dtypes after encoding
df_combined.info()


In [None]:

# Binary flag: IS_DELAY is 1 where ARR_DELAY is strictly greater than 15, else 0
df_combined['IS_DELAY'] = df_combined['ARR_DELAY'].gt(15).astype(int)

# Preview the leading rows, now including the new IS_DELAY column
print(df_combined.head())

# Summarise columns and dtypes to confirm IS_DELAY was added
df_combined.info()

In [12]:
# Tally how many rows carry each IS_DELAY value (0 = on time, 1 = delayed)
delay_counts = df_combined['IS_DELAY'].value_counts()

# Look each class up with a default of 0 so a missing class is reported as zero
on_time_total = delay_counts.get(0, 0)
delayed_total = delay_counts.get(1, 0)

# Report both totals
print(f"On-time flights (0): {on_time_total}")
print(f"Delayed flights (1): {delayed_total}")


On-time flights (0): 11987426
Delayed flights (1): 5195631
