# Flight Price Prediction - Final Workflow

This notebook demonstrates the complete pipeline for predicting flight prices using machine learning.

## Dataset
The dataset contains booking records for flights between Indian cities, with features such as airline, source, destination, departure/arrival times, duration, and number of stops. The ticket price is the prediction target.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics

import warnings
# Silences every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas/scikit-learn deprecation and
# FutureWarning messages raised by later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

## 2. Load Data

In [None]:
# Read the raw training workbook; the path is resolved relative to the
# notebook's working directory.
raw_workbook = "Data_Train.xlsx"
df_train = pd.read_excel(raw_workbook)

n_rows, n_cols = df_train.shape
print(f"Dataset shape: {(n_rows, n_cols)}")
df_train.head()

## 3. Data Cleaning

In [None]:
# Report the per-column null counts before anything is removed
null_counts = df_train.isnull().sum()
print("Missing values:")
print(null_counts)

# Discard every row that has at least one missing value
df_train = df_train.dropna()
print(f"\nShape after removing nulls: {df_train.shape}")

# Count fully duplicated rows, then drop the repeats
n_duplicates = df_train.duplicated().sum()
print(f"\nDuplicate rows: {n_duplicates}")

df_train = df_train.drop_duplicates()
print(f"Shape after removing duplicates: {df_train.shape}")

In [None]:
# Work on an independent (deep) copy so the cleaned frame `df_train`
# is not modified by the feature-engineering cells below.
data = df_train.copy(deep=True)

## 4. Feature Engineering

### 4.1 Date Features

In [None]:
# Parse the journey date (day/month/year strings) once, then derive the
# calendar parts from the parsed values.
journey_dates = pd.to_datetime(data["Date_of_Journey"], format="%d/%m/%Y")
data = data.assign(
    Date_of_Journey=journey_dates,
    Journey_day=journey_dates.dt.day,
    Journey_month=journey_dates.dt.month,
    Journey_year=journey_dates.dt.year,
)

print("Date features created:")
data[["Date_of_Journey", "Journey_day", "Journey_month", "Journey_year"]].head()

### 4.2 Time Features

In [None]:
# Extract time features
# Each column is parsed a single time and the parsed values are reused for
# both the hour and the minute (previously every column was parsed twice).
# NOTE(review): no explicit `format` is given, so parsing relies on pandas'
# inference — confirm every value parses under the installed pandas version.
arrival_ts = pd.to_datetime(data["Arrival_Time"])
departure_ts = pd.to_datetime(data["Dep_Time"])

data["Arrival_hour"] = arrival_ts.dt.hour
data["Arrival_min"] = arrival_ts.dt.minute
data["Departure_hour"] = departure_ts.dt.hour
data["Departure_min"] = departure_ts.dt.minute

print("Time features created:")
data[["Dep_Time", "Departure_hour", "Departure_min", "Arrival_Time", "Arrival_hour", "Arrival_min"]].head()

### 4.3 Duration Features

In [None]:
# Process duration
def preprocess_duration(x):
    """Pad a duration string so it carries both an hour and a minute part.

    A string without an "h" gets "0h " prepended; otherwise, a string
    without an "m" gets " 0m" appended. Strings that already contain both
    letters are returned unchanged.
    """
    has_hours = "h" in x
    has_minutes = "m" in x

    if not has_hours:
        return "0h " + x
    if not has_minutes:
        return x + " 0m"
    return x

# Normalise the text so every value has both an hour and a minute part,
# then parse it once as a timedelta.
data["Duration"] = data["Duration"].apply(preprocess_duration)
duration_parts = pd.to_timedelta(data["Duration"]).dt.components

# `components.hours` is only the 0-23 remainder after whole days are split
# off, so the `days` component must be folded back in. Without this, any
# duration of 24 hours or more loses a full day from its hour/minute totals.
data["Duration_hour"] = duration_parts.days * 24 + duration_parts.hours
data["Duration_minutes"] = duration_parts.minutes
data["Total_min"] = data["Duration_hour"] * 60 + data["Duration_minutes"]

print("Duration features created:")
data[["Duration", "Duration_hour", "Duration_minutes", "Total_min"]].head()

### 4.4 Price Conversion to USD

In [None]:
# Convert price to USD
def convert_to_usd(x, rate=0.0142):
    """Convert a price to whole US dollars.

    Parameters
    ----------
    x : number
        Price in the source currency.
    rate : float, optional
        Multiplier applied to ``x``. Defaults to 0.0142, the fixed rate
        this notebook has always used.

    Returns
    -------
    int
        ``x * rate`` with the fractional part discarded (``int`` truncates
        toward zero; it does not round).
    """
    return int(x * rate)

# Element-wise conversion of the original price column into whole dollars
data["Price_USD"] = data["Price"].map(convert_to_usd)

usd_low = data["Price_USD"].min()
usd_high = data["Price_USD"].max()
print(f"Price range in USD: ${usd_low} - ${usd_high}")

### 4.5 Process Total Stops

In [None]:
# Extract number of stops
def extract_stops(x):
    """Return the first run of digits in ``str(x)`` as an int, or 0 if there is none."""
    found = re.search(r'\d+', str(x))
    if found is None:
        return 0
    return int(found.group())

# Replace the textual stop description with its numeric count
data["Total_Stops"] = data["Total_Stops"].map(extract_stops)

stop_counts = data["Total_Stops"].value_counts().sort_index()
print("Stops distribution:")
print(stop_counts)

### 4.6 Clean Destination Names

In [None]:
# Standardize destination names
# Assign the result back instead of calling `replace(..., inplace=True)` on
# the column selection: the chained in-place form is deprecated and, under
# pandas Copy-on-Write, modifies a temporary object and leaves `data` unchanged.
data["Destination"] = data["Destination"].replace("New Delhi", "Delhi")
print("Unique destinations:")
print(data["Destination"].unique())

### 4.7 Drop Unnecessary Columns

In [None]:
# Columns removed before modeling: the raw text/date fields whose derived
# features were created above, the original price, and the remaining
# columns listed here.
cols_to_drop = [
    "Dep_Time",
    "Arrival_Time",
    "Duration",
    "Date_of_Journey",
    "Price",
    "Additional_Info",
    "Route",
    "Journey_year",
    "Duration_hour",
    "Duration_minutes",
]
data = data.drop(columns=cols_to_drop)
print(f"Remaining columns: {list(data.columns)}")

## 5. Encoding Categorical Variables

### 5.1 One-Hot Encode Source

In [None]:
# Build 0/1 indicator columns for the source city and swap them in for the
# original text column (indicators are appended after the existing columns).
source_dummies = pd.get_dummies(data["Source"], prefix="Source").astype(int)
data = pd.concat([data.drop(columns="Source"), source_dummies], axis=1)
print(f"Source columns created: {list(source_dummies.columns)}")

### 5.2 Label Encode Airline

In [None]:
# Rank airlines from lowest to highest mean Price_USD and use the rank
# (0 = lowest mean price) as the numeric code.
# NOTE(review): the ranking is computed from the target over every row
# currently in `data`; rows later held out for testing have therefore
# influenced this encoding.
mean_price_by_airline = data.groupby(["Airline"])["Price_USD"].mean().sort_values()
airlines = mean_price_by_airline.index
airlines_dict = {name: rank for rank, name in enumerate(airlines)}
data["Airline"] = data["Airline"].map(airlines_dict)

print("Airline encoding:")
for airline, code in airlines_dict.items():
    print(f"  {airline}: {code}")

### 5.3 Label Encode Destination

In [None]:
# Rank destinations from lowest to highest mean Price_USD and use the rank
# (0 = lowest mean price) as the numeric code.
# NOTE(review): the ranking is computed from the target over every row
# currently in `data`; rows later held out for testing have therefore
# influenced this encoding.
mean_price_by_destination = data.groupby(["Destination"])["Price_USD"].mean().sort_values()
destinations = mean_price_by_destination.index
dest_dict = {name: rank for rank, name in enumerate(destinations)}
data["Destination"] = data["Destination"].map(dest_dict)

print("Destination encoding:")
for dest, code in dest_dict.items():
    print(f"  {dest}: {code}")

### 5.4 Remove Duplicate Columns

In [None]:
# Keep only the first occurrence of each column label
is_repeated_label = data.columns.duplicated()
data = data.loc[:, ~is_repeated_label]

print(f"Final shape: {data.shape}")
print(f"Final columns: {list(data.columns)}")

## 6. Outlier Detection and Removal

In [None]:
# Two side-by-side panels: the target before and after the IQR filter
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 4))
sns.boxplot(x=data["Price_USD"], ax=ax_before)
ax_before.set_title("Price Distribution - Before Outlier Removal")

# Quartiles and the 1.5 * IQR fences around them
q1, q3 = data["Price_USD"].quantile([0.25, 0.75])
iqr = q3 - q1
minimum = q1 - 1.5 * iqr
maximum = q3 + 1.5 * iqr

print("IQR Method:")
for label, value in (
    ("Q1", q1),
    ("Q3", q3),
    ("IQR", iqr),
    ("Lower bound", minimum),
    ("Upper bound", maximum),
):
    print(f"  {label}: ${value:.2f}")

# Rows strictly outside the fences are the outliers; rows on a fence are kept
price = data["Price_USD"]
outside_fences = (price < minimum) | (price > maximum)
within_fences = price.between(minimum, maximum)

outliers = int(outside_fences.sum())
print(f"\nOutliers found: {outliers} ({outliers/len(data)*100:.2f}%)")

data = data[within_fences]
print(f"Shape after removing outliers: {data.shape}")

# Right-hand panel: the distribution of the rows that remain
sns.boxplot(x=data["Price_USD"], ax=ax_after)
ax_after.set_title("Price Distribution - After Outlier Removal")
fig.tight_layout()
plt.show()

## 7. Prepare Data for Modeling

In [None]:
# The target is the USD price; every other remaining column is a feature
target_column = "Price_USD"
y = data[target_column]
X = data.drop(columns=[target_column])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns: {list(X.columns)}")

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

## 8. Model Training and Evaluation

### 8.1 Random Forest Regressor

In [None]:
# Fit a 100-tree random forest on the training split
rfr_model = RandomForestRegressor(n_estimators=100, random_state=42)
rfr_model.fit(X_train, y_train)

# Predict the held-out rows
y_pred_rfr = rfr_model.predict(X_test)

# Compute each metric once, then report
rfr_train_score = rfr_model.score(X_train, y_train)
rfr_r2 = metrics.r2_score(y_test, y_pred_rfr)
rfr_mae = metrics.mean_absolute_error(y_test, y_pred_rfr)
rfr_mse = metrics.mean_squared_error(y_test, y_pred_rfr)
rfr_mape = metrics.mean_absolute_percentage_error(y_test, y_pred_rfr)

print("Random Forest Regressor Performance:")
print(f"  Training Score: {rfr_train_score:.4f}")
print(f"  Testing Score (R²): {rfr_r2:.4f}")
print(f"  MAE: ${rfr_mae:.2f}")
print(f"  MSE: ${rfr_mse:.2f}")
print(f"  RMSE: ${np.sqrt(rfr_mse):.2f}")
print(f"  MAPE: {rfr_mape:.4f}")

### 8.2 Decision Tree Regressor

In [None]:
# Fit a single decision tree on the training split
dtr_model = DecisionTreeRegressor(random_state=42)
dtr_model.fit(X_train, y_train)

# Predict the held-out rows
y_pred_dtr = dtr_model.predict(X_test)

# Compute each metric once, then report
dtr_train_score = dtr_model.score(X_train, y_train)
dtr_r2 = metrics.r2_score(y_test, y_pred_dtr)
dtr_mae = metrics.mean_absolute_error(y_test, y_pred_dtr)
dtr_mse = metrics.mean_squared_error(y_test, y_pred_dtr)
dtr_mape = metrics.mean_absolute_percentage_error(y_test, y_pred_dtr)

print("Decision Tree Regressor Performance:")
print(f"  Training Score: {dtr_train_score:.4f}")
print(f"  Testing Score (R²): {dtr_r2:.4f}")
print(f"  MAE: ${dtr_mae:.2f}")
print(f"  MSE: ${dtr_mse:.2f}")
print(f"  RMSE: ${np.sqrt(dtr_mse):.2f}")
print(f"  MAPE: {dtr_mape:.4f}")

### 8.3 Feature Importance (Random Forest)

In [None]:
# Pair each feature column with the forest's importance score, highest first
feature_importance = pd.DataFrame(
    {
        "Feature": X.columns,
        "Importance": rfr_model.feature_importances_,
    }
)
feature_importance = feature_importance.sort_values(by="Importance", ascending=False)

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

# Horizontal bar chart of the same ten rows
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance.head(10), x="Importance", y="Feature", ax=ax)
ax.set_title("Top 10 Most Important Features")
fig.tight_layout()
plt.show()

### 8.4 Prediction vs Actual Comparison

In [None]:
# One scatter panel per model; the dashed red diagonal marks perfect prediction
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
diagonal = [y_test.min(), y_test.max()]
panels = (
    ("Random Forest: Actual vs Predicted", y_pred_rfr),
    ("Decision Tree: Actual vs Predicted", y_pred_dtr),
)

for ax, (panel_title, predicted) in zip(axes, panels):
    ax.scatter(y_test, predicted, alpha=0.5)
    ax.plot(diagonal, diagonal, 'r--', lw=2)
    ax.set_xlabel("Actual Price (USD)")
    ax.set_ylabel("Predicted Price (USD)")
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

## 9. Save Models

In [None]:
# Serialize both fitted estimators next to the notebook, one pickle file each
models_to_save = (
    ("rfr.pkl", rfr_model, "Random Forest model saved as rfr.pkl"),
    ("dtr.pkl", dtr_model, "Decision Tree model saved as dtr.pkl"),
)

for pickle_path, fitted_model, saved_message in models_to_save:
    with open(pickle_path, "wb") as handle:
        pickle.dump(fitted_model, handle)
    print(saved_message)

## 10. Model Validation - Load and Test

In [None]:
# Load saved model and verify
# The file read here was written by the save cell of this notebook. Only
# unpickle files you trust: pickle.load can execute arbitrary code.
with open("rfr.pkl", "rb") as file:
    loaded_model = pickle.load(file)

# Score the reloaded model on the held-out rows
y_pred_loaded = loaded_model.predict(X_test)
r2_loaded = metrics.r2_score(y_test, y_pred_loaded)
print(f"Loaded model R² score: {r2_loaded:.4f}")

# Report success only after confirming that the round trip preserved the
# model: the reloaded estimator must reproduce the in-memory predictions.
assert np.allclose(y_pred_loaded, rfr_model.predict(X_test)), (
    "Reloaded model predictions differ from the in-memory model"
)
print("Model successfully saved and loaded!")

## Summary

This notebook demonstrates a complete machine learning pipeline:

1. **Data Loading**: Loaded flight booking data
2. **Data Cleaning**: Handled missing values and duplicates
3. **Feature Engineering**: 
   - Extracted date/time features
   - Processed duration
   - Converted price to USD
   - Extracted number of stops
4. **Encoding**: 
   - One-hot encoded source cities
   - Label encoded airlines and destinations by price
5. **Outlier Removal**: Used IQR method to remove price outliers
6. **Model Training**: Trained Random Forest and Decision Tree models
7. **Evaluation**: Compared model performance using multiple metrics
8. **Model Persistence**: Saved models for deployment

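The saved models can be loaded in a Flask web application for real-time flight price predictions. Note that only the fitted estimators are pickled: the application must apply the same feature engineering, the same airline and destination encodings (`airlines_dict`, `dest_dict`), and the same feature column order used in this notebook before calling `predict`.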