# Analyzing Key Operational Metrics in the Airline Industry


In [None]:
import pandas as pd
import numpy as np

# Read the flight sample CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='flights_sample_3m.csv')

# Preview the first five records
data.head(n=5)


In [None]:
# Print a concise summary of the DataFrame (column dtypes, non-null counts, memory usage)
data.info()

In [None]:
# Show every distinct value found in the AIRLINE column
airline_names = data['AIRLINE'].unique()
print("\nUnique Airlines in Dataset:")
print(airline_names)

In [None]:
# Mean departure and arrival delay per airline, ordered from the highest
# to the lowest mean departure delay
airline_delay_means = data.groupby('AIRLINE')[['DEP_DELAY', 'ARR_DELAY']].mean()
avg_delay_by_airline = airline_delay_means.sort_values(by='DEP_DELAY', ascending=False)
print(avg_delay_by_airline)

In [None]:
# Mean departure and arrival delay per origin airport, ordered from the
# highest to the lowest mean departure delay
avg_delay_by_airport = (
    data
    .groupby('ORIGIN')[['DEP_DELAY', 'ARR_DELAY']]
    .mean()
    .sort_values(by='DEP_DELAY', ascending=False)
)
print(avg_delay_by_airport)

In [None]:
# Inspect the distinct carrier codes stored in the AIRLINE_CODE column
carrier_codes = data['AIRLINE_CODE'].unique()
print("Unique values in AIRLINE_CODE column:")
print(carrier_codes)

In [None]:
# Keep only the rows whose carrier code is 'DL'
is_delta = data['AIRLINE_CODE'] == 'DL'
filtered_data = data[is_delta]
delta_row_count = filtered_data.shape[0]
print(f"\nFiltered Data for Delta Airlines: {delta_row_count} rows")


In [None]:
# Focus on the columns used by the delay and cancellation analysis
selected_columns = ['FL_DATE', 'ORIGIN', 'DEST', 'DEP_DELAY', 'ARR_DELAY', 'CANCELLED', 'CANCELLATION_CODE',
                    'DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS', 'DELAY_DUE_SECURITY', 'DELAY_DUE_LATE_AIRCRAFT']

# Take an explicit copy so that later assignments modify an independent
# DataFrame rather than a slice of `data` (avoids SettingWithCopyWarning).
filtered_data = filtered_data[selected_columns].copy()

# DataFrame.info() prints its report itself and returns None, so print the
# heading on its own instead of passing the call's result to print()
# (which would emit a stray "None").
print("\nSelected Columns Overview:")
filtered_data.info()


In [None]:
# Count the missing values in each column of filtered_data
filtered_data.isnull().sum()

In [None]:
# Report the per-column count of missing values currently in filtered_data
missing_per_column = filtered_data.isnull().sum()
print("\nRemaining Missing Values:\n", missing_per_column)


In [None]:
# Work on an independent copy so the assignments below never write through to
# (or warn about) a slice of the original `data` DataFrame.
filtered_data = filtered_data.copy()

cancelled_mask = filtered_data['CANCELLED'] == 1
operated_mask = filtered_data['CANCELLED'] == 0
delay_columns = ['DEP_DELAY', 'ARR_DELAY']

# Cancelled flights have no meaningful delay, so their delay values are
# replaced with a 0 placeholder (this keeps the columns free of NaN).
filtered_data.loc[cancelled_mask, delay_columns] = 0

# For non-cancelled flights, fill missing delays with the median of the
# non-cancelled flights only, so the 0 placeholders written above do not
# pull the median towards zero.
for col in delay_columns:
    operated_median = filtered_data.loc[operated_mask, col].median()
    filtered_data.loc[operated_mask, col] = filtered_data.loc[operated_mask, col].fillna(operated_median)

# Fill missing values in delay cause columns with 0
delay_cause_columns = ['DELAY_DUE_CARRIER', 'DELAY_DUE_WEATHER', 'DELAY_DUE_NAS',
                       'DELAY_DUE_SECURITY', 'DELAY_DUE_LATE_AIRCRAFT']
filtered_data[delay_cause_columns] = filtered_data[delay_cause_columns].fillna(0)

# Fill missing values in CANCELLATION_CODE with 'No Cancellation'.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that chained in-place form is deprecated in pandas because it
# may operate on a temporary copy and leave the DataFrame unchanged.
filtered_data['CANCELLATION_CODE'] = filtered_data['CANCELLATION_CODE'].fillna('No Cancellation')

# Verify missing values after handling
filtered_data.isnull().sum()

In [None]:
# Preview the first five rows of filtered_data
filtered_data.head(5)

In [None]:
# Descriptive statistics for the filtered data
summary_stats = filtered_data.describe()
print("\nSummary Statistics for Filtered Data:")
print(summary_stats)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [None]:
# Mean departure and arrival delay per origin airport within filtered_data,
# ordered from the highest to the lowest mean departure delay
delays_grouped_by_origin = filtered_data.groupby('ORIGIN')[['DEP_DELAY', 'ARR_DELAY']]
avg_delay_by_airport = delays_grouped_by_origin.mean().sort_values(by='DEP_DELAY', ascending=False)
print(avg_delay_by_airport)

In [None]:

# Take the ten airports at the head of the delay-sorted table
top_10_airports = avg_delay_by_airport.head(10)

# Horizontal bar chart: one bar per airport code
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(
    x=top_10_airports['DEP_DELAY'],
    y=top_10_airports.index,
    palette='viridis',
    ax=ax,
)
ax.set_title('Top 10 Airports by Average Departure Delay (Filtered for Delta Airlines)', fontsize=16)
ax.set_xlabel('Average Departure Delay (minutes)', fontsize=12)
ax.set_ylabel('Airport Code (ORIGIN)', fontsize=12)
ax.tick_params(axis='both', labelsize=10)
plt.show()


In [None]:
# Take the ten airports at the tail of the delay-sorted table
bottom_10_airports = avg_delay_by_airport.tail(10)

# Horizontal bar chart: one bar per airport code
plt.figure(figsize=(8, 4))
bottom_axes = sns.barplot(
    x=bottom_10_airports['DEP_DELAY'],
    y=bottom_10_airports.index,
    palette='viridis',
)
bottom_axes.set_title('Bottom 10 Airports by Average Departure Delay (Filtered for Delta Airlines)', fontsize=16)
bottom_axes.set_xlabel('Average Departure Delay (minutes)', fontsize=12)
bottom_axes.set_ylabel('Airport Code (ORIGIN)', fontsize=12)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()


In [None]:
# Parse the flight date and derive the month number from it
filtered_data['FL_DATE'] = pd.to_datetime(filtered_data['FL_DATE'])
filtered_data['Month'] = filtered_data['FL_DATE'].dt.month

# Mean departure delay for each month number
monthly_avg_delay = filtered_data.groupby('Month')['DEP_DELAY'].mean()

# Line chart of the monthly means, with month abbreviations on the x axis
month_abbreviations = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
plt.figure(figsize=(6, 4))
delay_axes = sns.lineplot(x=monthly_avg_delay.index, y=monthly_avg_delay.values, marker='o')
delay_axes.set_title('Monthly Average Departure Delay (Filtered for Delta Airlines)', fontsize=16)
delay_axes.set_xlabel('Month', fontsize=12)
delay_axes.set_ylabel('Average Departure Delay (minutes)', fontsize=12)
plt.xticks(range(1, 13), labels=month_abbreviations, rotation=45)
delay_axes.grid(True)
plt.show()


In [None]:
# Total the minutes recorded in each delay-cause column
cause_columns = [
    'DELAY_DUE_CARRIER',
    'DELAY_DUE_WEATHER',
    'DELAY_DUE_NAS',
    'DELAY_DUE_SECURITY',
    'DELAY_DUE_LATE_AIRCRAFT',
]
delay_causes = filtered_data[cause_columns].sum()

# Bar chart: one bar per delay cause
plt.figure(figsize=(6, 4))
causes_axes = sns.barplot(x=delay_causes.index, y=delay_causes.values, palette='magma')
causes_axes.set_title('Contribution of Delay Causes (Filtered for Delta Airlines)', fontsize=16)
causes_axes.set_xlabel('Delay Cause', fontsize=12)
causes_axes.set_ylabel('Total Delay (minutes)', fontsize=12)
plt.xticks(rotation=45)
plt.show()


In [None]:
# Count cancelled flights for each month number
monthly_cancellations = filtered_data[filtered_data['CANCELLED'] == 1].groupby('Month')['CANCELLED'].count()

# sns.barplot treats x as categorical and draws the bars at positions
# 0..n-1, not at the month numbers themselves. Build one label per bar from
# the month it actually represents, so the labels line up with the bars and
# remain correct even when some months have no cancellations.
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
bar_labels = [month_names[int(month) - 1] for month in monthly_cancellations.index]

# Plot cancellations by month
plt.figure(figsize=(6, 4))
sns.barplot(x=monthly_cancellations.index, y=monthly_cancellations.values, palette='Blues_d')
plt.title('Monthly Flight Cancellations (Filtered for Delta Airlines)', fontsize=16)
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Cancellations', fontsize=12)
plt.xticks(range(len(bar_labels)), labels=bar_labels, rotation=45)
plt.show()


In [None]:
# Distribution of departure delays: 50-bin histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(filtered_data['DEP_DELAY'], kde=True, bins=50, color='blue', ax=ax)
ax.set_title('Distribution of Departure Delays (Delta Airlines)', fontsize=16)
ax.set_xlabel('Departure Delay (minutes)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
plt.show()

In [None]:
# Distribution of arrival delays: 30-bin histogram with a KDE overlay
plt.figure(figsize=(8, 6))
arrival_axes = sns.histplot(filtered_data['ARR_DELAY'], kde=True, bins=30, color='orange')
arrival_axes.set_title('Distribution of Arrival Delays (Delta Airlines)', fontsize=16)
arrival_axes.set_xlabel('Arrival Delay (minutes)', fontsize=12)
arrival_axes.set_ylabel('Frequency', fontsize=12)
plt.show()


In [None]:
# Number of cancelled flights for each month number
cancelled_flights = filtered_data[filtered_data['CANCELLED'] == 1]
monthly_cancellations = cancelled_flights.groupby('Month').size()

# Bar chart: one bar per month number
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    x=monthly_cancellations.index,
    y=monthly_cancellations.values,
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Number of Cancellations by Month (Delta Airlines)', fontsize=16)
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Number of Cancellations', fontsize=12)
plt.show()


In [None]:
# Pairwise correlation between the delay-cause columns
corr = filtered_data[delay_cause_columns].corr()

# Annotated heatmap of the correlation matrix (values shown to two decimals)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap of Delay Causes (Delta Airlines)', fontsize=16)
plt.show()


In [None]:
# Interactive scatter of departure delay against arrival delay,
# with points coloured by the CANCELLED column
axis_labels = {
    'DEP_DELAY': 'Departure Delay (minutes)',
    'ARR_DELAY': 'Arrival Delay (minutes)',
}
fig = px.scatter(
    filtered_data,
    x='DEP_DELAY',
    y='ARR_DELAY',
    color='CANCELLED',
    title='Departure Delay vs. Arrival Delay',
    labels=axis_labels,
)
fig.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Prepare data for modeling.
# Cancelled flights have no real delays (their DEP_DELAY/ARR_DELAY values are
# overwritten with 0 placeholders during cleaning), so including them would
# add synthetic (0, 0) points that bias the fit and the R-squared score.
# Restrict the model to operated flights and drop any rows that still lack
# a delay value, since LinearRegression cannot handle NaN.
model_data = filtered_data.loc[filtered_data['CANCELLED'] == 0, ['DEP_DELAY', 'ARR_DELAY']].dropna()

X = model_data[['DEP_DELAY']]  # 2-D feature frame, as scikit-learn expects
y = model_data['ARR_DELAY']

# Split data into train (70%) and test (30%) sets; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:
# Train a Linear Regression model on the training split
# (X_train holds the DEP_DELAY feature, y_train the ARR_DELAY target)
model = LinearRegression()
model.fit(X_train, y_train)


In [None]:
# Predict arrival delays for the held-out test features
y_pred = model.predict(X_test)

# Score the predictions against the true test targets
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", test_mse)
print("R-squared:", test_r2)


In [None]:

# Scatter of actual against predicted arrival delays with a red fitted line
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(x=y_test, y=y_pred, ci=None, line_kws={"color": "red"}, ax=ax)
ax.set_title('Actual vs Predicted Arrival Delays', fontsize=16)
ax.set_xlabel('Actual Arrival Delay (minutes)', fontsize=12)
ax.set_ylabel('Predicted Arrival Delay (minutes)', fontsize=12)
plt.show()
