# In [6]:
# clean_flights.py

"""
Flight Data Cleaning Script
Author: Your Name
Description:
    - Cleans raw flight data.
    - Handles missing values.
    - Parses dates.
    - Creates useful features.
    - Saves cleaned dataset.
"""

import pandas as pd
import numpy as np
import os

# 1. Configuration
input_file = r'C:\Users\Ahmed\Desktop\AIRLINE FILES\FLIGHTS.csv'
output_file = r'C:\Users\Ahmed\Desktop\AIRLINE FILES\cleaned_flight_data.csv'

# A row is unusable when any of these fields is missing or blank.
critical_columns = ['flight_id', 'origin', 'destination']
# Free-text fields that are trimmed and upper-cased.
text_columns = ['origin', 'destination']


# 2. Cleaning logic
def _normalize_flight_ids(ids: pd.Series) -> pd.Series:
    """Return flight ids as stripped strings without a spurious '.0' suffix."""
    # A CSV column that contains blanks is read as float64, so a plain
    # astype(str) would turn id 1 into '1.0'. Go through int64 first when
    # every remaining value is a whole number.
    if pd.api.types.is_float_dtype(ids) and (ids % 1 == 0).all():
        ids = ids.astype('int64')
    return ids.astype(str).str.strip()


def clean_flight_data(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Clean a raw flights table and add calendar features.

    The input must contain the columns ``flight_id``, ``origin``,
    ``destination`` and ``departure_time``. The input frame is not modified.

    Steps:
        1. Drop rows where a critical column is missing (NaN).
        2. Trim and upper-case ``origin`` / ``destination``; cast
           ``flight_id`` to a trimmed string.
        3. Drop rows where a critical column is blank after trimming.
        4. Parse ``departure_time``; drop rows that cannot be parsed.
        5. Add ``year``, ``month``, ``day``, ``weekday`` and ``hour``.
        6. Drop exact duplicate rows and sort by ``departure_time``.

    Args:
        raw_df: Raw flight records.

    Returns:
        A new DataFrame with the cleaned rows (original index labels are
        kept) and the five feature columns appended.

    Raises:
        KeyError: If a required column is absent from ``raw_df``.
    """
    df = raw_df.copy()

    ## 2.1 Drop rows with critical missing fields
    df.dropna(subset=critical_columns, inplace=True)

    ## 2.2 Standardize text fields
    # astype(str) first so the .str accessor also works when a column is not
    # string-typed (e.g. an empty, float-typed column after the drop above).
    for col in text_columns:
        df[col] = df[col].astype(str).str.strip().str.upper()
    df['flight_id'] = _normalize_flight_ids(df['flight_id'])

    # Whitespace-only values are not NaN, so they survive dropna; once
    # trimmed they are empty strings and are treated as missing as well.
    has_all_critical = (df[critical_columns] != '').all(axis=1)
    df = df[has_all_critical].copy()

    ## 2.3 Parse departure_time; unparseable values become NaT and are dropped
    # NOTE(review): no explicit format is given, so pandas infers it and
    # reads ambiguous dates such as 5/1/2025 month-first by default.
    # Confirm that matches the data source.
    df['departure_time'] = pd.to_datetime(df['departure_time'], errors='coerce')
    df.dropna(subset=['departure_time'], inplace=True)

    ## 2.4 Feature engineering: Add year, month, day, weekday, hour
    departure = df['departure_time'].dt
    df['year'] = departure.year
    df['month'] = departure.month
    df['day'] = departure.day
    df['weekday'] = departure.day_name()
    df['hour'] = departure.hour

    ## 2.5 Remove duplicates (after normalization, so 'kampala' == 'KAMPALA')
    df.drop_duplicates(inplace=True)

    ## 2.6 Sort the data
    # A stable sort keeps flights with identical departure times in their
    # input order, which makes the output reproducible between runs.
    df.sort_values(by='departure_time', kind='stable', inplace=True)

    return df


# 3. Script entry point
# Kept at module level (not inside a main() function) so that `df` remains
# available to later notebook cells, as it was before.
if __name__ == "__main__":
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file {input_file} not found!")

    print("Loading raw data...")
    df = pd.read_csv(input_file)
    print(f"Initial data shape: {df.shape}")

    # Initial inspection
    print("First few rows:")
    print(df.head())

    df = clean_flight_data(df)

    # Save the cleaned data
    print("Saving cleaned data...")
    df.to_csv(output_file, index=False)
    print(f"Cleaned data saved to {output_file}")
    print(f"Final data shape: {df.shape}")





# Output recorded from running the cell above:
#
# Loading raw data...
# Initial data shape: (1020, 4)
# First few rows:
#    flight_id     origin destination  departure_time
# 0          1  Mogadishu     Kampala  5/1/2025 10:00
# 1          2    Kampala     Nairobi  5/1/2025 14:30
# 2          3    Nairobi   Mogadishu   5/2/2025 9:00
# 3          4    Kampala        Juba  5/3/2025 16:00
# 4          5       Juba     Nairobi  5/3/2025 18:00
# Saving cleaned data...
# Cleaned data saved to C:\Users\Ahmed\Desktop\AIRLINE FILES\cleaned_flight_data.csv
# Final data shape: (1020, 9)
