In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
import joblib
import re

# Read the raw flight-price records from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="flight_prices.csv")

# Convert ISO 8601 duration (e.g. PT10H30M or P1DT2H30M) to float hours
def convert_duration(d):
    """Return the length of an ISO 8601 duration string in hours.

    Args:
        d: Duration text such as ``"PT10H30M"``. A day component
            (``"P1DT2H30M"``) counts as 24 hours per day. Components
            that are absent count as zero.

    Returns:
        Total hours as a float rounded to two decimal places.

    Raises:
        TypeError: If ``d`` is not a string (for example a missing value).
    """
    day_match = re.search(r'(\d+)D', d)
    hour_match = re.search(r'(\d+)H', d)
    minute_match = re.search(r'(\d+)M', d)

    days = int(day_match.group(1)) if day_match else 0
    hours = int(hour_match.group(1)) if hour_match else 0
    mins = int(minute_match.group(1)) if minute_match else 0

    # The day part used to be ignored, so "P1DT2H" was reported as 2.0 hours.
    return round(days * 24 + hours + mins / 60, 2)

# Replace the raw duration strings with numeric hours
df["duration"] = df["duration"].map(convert_duration)

# Convert stops to text; any value outside the mapping (including missing
# values) ends up labelled "two_or_more"
stop_labels = {0: "zero", 1: "one", 2: "two_or_more"}
stops_as_text = df["stops"].map(stop_labels)
df["stops"] = stops_as_text.fillna("two_or_more")

# Features
# NOTE(review): flight_date is handed to get_dummies unchanged; if it is stored
# as text this produces one indicator column per distinct date — confirm that
# is intended.
feature_columns = ["origin", "destination", "flight_date", "class", "duration", "stops"]
X = df[feature_columns].copy()

# Whole days between the collection date and the flight date
departure = pd.to_datetime(df["flight_date"])
collected = pd.to_datetime(df["date_collected"])
X["days_left"] = (departure - collected).dt.days

# Target
y = df["price_usd"]

# One-hot encode
X = pd.get_dummies(X)

# Train/test split
# Split BEFORE scaling so the held-out rows never influence the scaler's
# mean/variance; fitting the scaler on the full matrix leaks test-set
# statistics into the training features and into the saved scaler.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from training rows only
X_test = scaler.transform(X_test_raw)  # reuse the training statistics

# Whole-dataset matrix kept under its original name for any code that still
# refers to it; it is not used for fitting or evaluation here.
X_scaled = scaler.transform(X)

# Model: 100-tree random forest with a fixed random_state
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MAE:", mae)
print("R²:", r2)

# Save the fitted model, the scaler, and the encoded column order
feature_names = X.columns.tolist()
joblib.dump(value=model, filename="backpacker_model.pkl")
joblib.dump(value=scaler, filename="backpacker_scaler.pkl")
joblib.dump(value=feature_names, filename="backpacker_features.pkl")


MAE: 132.8290124999999
R²: 0.07126832507115433


['backpacker_features.pkl']