In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot defaults. The seaborn theme is applied first and the
# figure size second, so the size assignment is the last write to rcParams.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

In [None]:
# Absolute location of the raw flight-records CSV read by this notebook.
DATA_PATH = "/content/philippine-air-carriers-flight-records.csv"

# Load the whole file into the working frame used by every later cell.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Report (rows, columns) on stdout; the first five rows are left as the
# cell's final expression so they render as its output.
print("Dataset shape:", df.shape)
df.head(n=5)

In [None]:
# Column names, non-null counts and dtypes; info() prints its own report.
print("\nColumn Information:")
df.info()

# Descriptive statistics, kept in a named variable and then left as the
# cell's final expression so the table is rendered.
print("\nSummary Statistics (Numerical):")
summary_stats = df.describe()
summary_stats

In [None]:
# Number of missing values per column, largest first.
missing_df = df.isnull().sum().sort_values(ascending=False)
# The same counts expressed as a percentage of the row count.
missing_percent = (missing_df / len(df)) * 100

missing_summary = pd.DataFrame({
    "missing_count": missing_df,
    "missing_percent": missing_percent
})

# A notebook only auto-renders the LAST expression of a cell. This cell ends
# with a plot, so a bare `missing_summary` here would be evaluated and thrown
# away; it has to be displayed explicitly. `display` is injected into the
# namespace by the IPython/Jupyter kernel, so no import is needed.
display(missing_summary)

# Heatmap of the boolean missing-value mask: one cell per (row, column) of df.
plt.figure(figsize=(12,6))
ax = sns.heatmap(df.isnull(), cbar=False)
ax.set_xlabel("Column")
ax.set_ylabel("Row")
plt.title("Missing Value Heatmap")
plt.show()


In [None]:
# Boolean flag per row marking rows that repeat an earlier row in every
# column; summing the flags gives the number of duplicate rows.
duplicate_flags = df.duplicated()
duplicates = duplicate_flags.sum()
print(f"Number of duplicate rows: {duplicates}")

In [None]:
# Parse columns whose name mentions "date" or "time" into datetime values.
#
# Numeric columns are skipped on purpose: pd.to_datetime accepts plain numbers
# and interprets them as offsets from the Unix epoch, so a numeric column such
# as a duration or a count would be silently replaced by meaningless 1970
# timestamps. Columns that are already datetime need no work either.
for col in df.columns:
    col_name = str(col).lower()
    if 'date' not in col_name and 'time' not in col_name:
        continue
    if (pd.api.types.is_numeric_dtype(df[col])
            or pd.api.types.is_datetime64_any_dtype(df[col])):
        continue
    try:
        df[col] = pd.to_datetime(df[col])
    except (ValueError, TypeError, OverflowError) as exc:
        # Still best-effort: the column keeps its original dtype, but the
        # failure is reported instead of being swallowed silently.
        print(f"Could not parse '{col}' as datetime; column left unchanged ({exc})")

# Convert object columns with low cardinality to category.
# The threshold is exclusive: columns with fewer distinct values than this
# are converted.
MAX_CATEGORY_LEVELS = 50
for col in df.select_dtypes(include='object').columns:
    if df[col].nunique() < MAX_CATEGORY_LEVELS:
        df[col] = df[col].astype('category')


In [None]:
# --- Numeric columns: one histogram with a KDE overlay per column ---
# num_cols is kept as a notebook-level name; the outlier cell iterates it too.
num_cols = df.select_dtypes(include=[np.number]).columns

for col in num_cols:
    fig, ax = plt.subplots()
    # Missing values are dropped before plotting.
    sns.histplot(df[col].dropna(), kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.show()

# --- Categorical columns: bar chart of the 20 most frequent values ---
cat_cols = df.select_dtypes(include=['category']).columns

for col in cat_cols:
    fig, ax = plt.subplots(figsize=(10,4))
    top_counts = df[col].value_counts().head(20)
    top_counts.plot(kind='bar', ax=ax)
    ax.set_title(f"Top Categories in {col}")
    plt.show()

In [None]:
# Count, for every numeric column, the values lying more than 1.5 * IQR
# below the first quartile or above the third quartile.
outlier_summary = {}

for col in num_cols:
    values = df[col]
    lower_q = values.quantile(0.25)
    upper_q = values.quantile(0.75)
    spread = upper_q - lower_q

    too_low = values < lower_q - 1.5 * spread
    too_high = values > upper_q + 1.5 * spread
    # Summing the combined boolean mask gives the number of flagged rows.
    outlier_summary[col] = int((too_low | too_high).sum())

# One row per column, rendered as the cell's output.
pd.DataFrame.from_dict(outlier_summary, orient='index', columns=['outlier_count'])


In [None]:
# Show the first rows of the working frame after the dtype conversions.
# A notebook only auto-renders the LAST expression of a cell; because a print
# follows, a bare `df.head()` here would show nothing, so it is displayed
# explicitly. `display` is provided by the IPython/Jupyter kernel.
print("Cleaned dataset preview:")
display(df.head())

print("\nFinal dataset shape:", df.shape)

In [None]:
# Destination of the processed CSV (absolute path).
OUTPUT_PATH = "/content/processed_flight_records.csv"

# Write the frame without its index column, then confirm the location.
df.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print(f"Processed dataset saved to {OUTPUT_PATH}")