# Web Scraping

In [1]:
from time import sleep
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup
import os
from selenium.webdriver.support.ui import WebDriverWait
import requests
from selenium.common.exceptions import NoSuchElementException
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from selenium.common.exceptions import TimeoutException
import datetime
from datetime import datetime, timedelta
from tqdm import tqdm

In [2]:
def scrape_kayak(from_location, to_location, date):

    formatted_date = date.strftime('%Y-%m-%d')
    url = f'https://www.kayak.com/flights/{from_location}-{to_location}/{formatted_date}?sort=bestflight_a'
    driver.get(url)

    
    wait_time = 20  # increase wait time if needed
    wait = WebDriverWait(driver, wait_time)

    while True:
        try:
            show_more_buttons = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.ULvh-button.show-more-button')))
            num_buttons = len(show_more_buttons)

            for i in range(num_buttons):
                show_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '.ULvh-button.show-more-button')))
                driver.execute_script("arguments[0].click();", show_more_button)
                try:
                    wait.until(EC.staleness_of(show_more_button))
                except:
                    pass

            flight_rows = driver.find_elements(By.XPATH, '//div[@class="nrc6-inner"]')

        except TimeoutException:
            break
    
    return flight_rows

In [3]:
def convert_to_df(flight_rows):
    
    list_prices = []
    list_company_names=[]
    list_stops=[]
    list_duration=[]
    list_dates=[]
    
    for WebElement in flight_rows:
        elementHTML = WebElement.get_attribute('outerHTML')
        elementSoup = BeautifulSoup(elementHTML, 'html.parser')
        # price
        temp_price = elementSoup.find("div", {"class": "nrc6-price-section"})
        price = temp_price.find("div", {"class": "f8F1-price-text"})
        list_prices.append(price.text)

        #company names
        company_names = elementSoup.find("div",{"class":"c_cgF c_cgF-mod-variant-default"}).text
        list_company_names.append(company_names)

        # stops
        temp_stops = elementSoup.find("div", {"class": "vmXl vmXl-mod-variant-default"})
        stops = temp_stops.find("span", {"class": "JWEO-stops-text"})
        list_stops.append(stops.text)

        #durations  
        temp_durations = elementSoup.find("div", {"class": "xdW8"})
        durations = temp_durations.find("div", {"class": "vmXl vmXl-mod-variant-default"}).text
        list_duration.append(durations)
    
    df = pd.DataFrame({
        'Price': list_prices,
        'Company Name': list_company_names,
        'Stops': list_stops,
        'Duration': list_duration,
        })

    df['Destination'] = to_location
    df['From'] = from_location
    df['Date'] = date
    
    return df

In [None]:
driver = webdriver.Chrome()

from_location = 'LAX'
to_location = 'SFO'

date = datetime.strptime('2023-06-27', '%Y-%m-%d')
end_date = datetime.strptime('2023-06-28', '%Y-%m-%d')
total_days = (end_date - date).days + 1

df = pd.DataFrame()
with tqdm(total=total_days) as pbar:

    while date <= end_date:

        df = pd.concat([df, convert_to_df(scrape_kayak(from_location, to_location, date))])
        
        date += timedelta(days=1)
        pbar.update(1)
df

In [None]:
df.to_csv('flight_LAX_SFO_0627_0628_data.csv', index=False)

# 2. Exploratory Data Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
import re
import tensorflow as tf
from tensorflow.keras import layers
import plotly.graph_objects as go
sns.set()
import plotly.express as px


from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
from sklearn.inspection import permutation_importance

2023-05-31 14:29:06.586992: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Explore Data

In [2]:
df1 = pd.read_csv("flight_LAX_ATL_data_0601_0831.csv")
df2 = pd.read_csv("flight_LAX_CHI_data_0601_0831.csv")
df3 = pd.read_csv("flight_LAX_SFO_data_0601_0831.csv")
df4 = pd.read_csv("flight_LAX_DEN_data_0601_0831.csv")
df5 = pd.read_csv("flight_LAX_JFK_data_0601_0831.csv")
df6 = pd.read_csv("flight_LAX_HNL_data_0601_0831.csv")
df7 = pd.read_csv("flight_LAX_DFW_data_0601_0831.csv")

df1,df2,df3,df4,df5,df6,df7

(       Price       Company Name    Stops Duration Destination From     Date
 0        254  American Airlines  nonstop   4h 38m         ATL  LAX   6/1/23
 1         73    Spirit Airlines   1 stop  25h 28m         ATL  LAX   6/1/23
 2        209  American Airlines   1 stop   6h 15m         ATL  LAX   6/1/23
 3        159    United Airlines   1 stop   6h 55m         ATL  LAX   6/1/23
 4        204    United Airlines   1 stop   6h 10m         ATL  LAX   6/1/23
 ...      ...                ...      ...      ...         ...  ...      ...
 34809   1174   Avianca, JetBlue  3 stops  32h 56m         ATL  LAX  8/31/23
 34810   1174   Avianca, JetBlue  3 stops  37h 39m         ATL  LAX  8/31/23
 34811   1201   Avianca, JetBlue  3 stops  37h 39m         ATL  LAX  8/31/23
 34812   1174   Avianca, JetBlue  3 stops  38h 34m         ATL  LAX  8/31/23
 34813   1201   Avianca, JetBlue  3 stops  38h 34m         ATL  LAX  8/31/23
 
 [34814 rows x 7 columns],
        Price       Company Name    Stops Durat

# Data Preprocessing

In [3]:
#remove based on the interquartile range method (IQR)
#remove very high or very low values based on quantile
#def remove_outliers(df):
#    Q1 = df['Price'].quantile(0.25)
#    Q3 = df['Price'].quantile(0.75)
#    IQR = Q3 - Q1
#    lower_lim = Q1 - 1.5 * IQR
#    upper_lim = Q3 + 1.5 * IQR
#    dropped_values = df[(df['Price'] < lower_lim) | (df['Price'] > upper_lim)]   
#    df = df[(df['Price'] >= lower_lim) & (df['Price'] <= upper_lim)]
#    return df

# convert duration to numerical format in minutes
def clean_duration(duration):
    durations = []
    for dur in duration:
        dur_split = dur.split()
        hours = int(dur_split[0].split("h")[0]) if "h" in dur_split[0] else 0
        minutes = int(dur_split[1].split("m")[0]) if "m" in dur_split[1] else 0
        total_minutes = hours * 60 + minutes
        durations.append(total_minutes)
    return durations

# convert it to numerical
def clean_stops(stops):
    if stops == 'nonstop':
        return 0
    elif stops == '1 stop':
        return 1
    elif stops == '2 stops':
        return 2
    elif stops == '3 stops':
        return 3
    else:
        return np.nan

#split the date to day of week (0=Sunday,1=Monday, etc.) and month
def clean_date(date):
    date = pd.to_datetime(date)
    df['Date'] = pd.to_datetime(df['Date'])
    df['DayOfWeek'] = (df['Date'].dt.dayofweek + 1) % 7
    df['Month'] = df['Date'].dt.month
    return date

#remove uncessary space and punctuation
def clean_company_name(df):
    # Remove leading and trailing whitespace
    df['Company Name'] = df['Company Name'].str.strip()
    # Remove punctuation
    df['Company Name'] = df['Company Name'].str.replace('[^\w\s]', '')
    # Remove extra whitespace within the company name
    df['Company Name'] = df['Company Name'].str.replace('\s+', ' ')
    return df

#convert date and company name into numerical representations

def preprocess(df):
    df = clean_company_name(df)  # Clean company names first
    le = LabelEncoder()
    df['Date'] = le.fit_transform(df['Date'])
    df['Company Name'] = le.fit_transform(df['Company Name'])
    return df

#def get_avg_per_airline(x):
#   Average for trips with single airlines
#   single_airlines = x[~x["Company Name"].str.contains(",")]
#   avg_per_airline = single_airlines.groupby("Company Name", as_index=False)["Price"].mean()
#   avg_per_airline = avg_per_airline.rename(columns={"Price": "Average Price"})
#    temp = x.merge(avg_per_airline, on="Company Name", how="left")
    
    # Average for trips with multiple airlines
#    multiple_airlines = x[x["Company Name"].str.contains(",")]
#    avg_price_multiple = multiple_airlines["Price"].mean()
#    multiple_airlines = multiple_airlines.reset_index(drop=True)  # Reset index
#    temp.loc[multiple_airlines.index, "Average Price"] = avg_price_multiple
    
#    return temp

In [4]:
dfs=[df1,df2,df3,df4,df5,df6,df7]

In [5]:
for df in dfs:
    df['Duration'] = clean_duration(df['Duration'])
    df['Stops'] = df['Stops'].apply(clean_stops).astype(float).fillna(-1).astype(int)
    df['Stops'] = df['Stops'].replace(-1, '')
    df = clean_company_name(df)
    df['Date'] = clean_date(df['Date'])
    df = preprocess(df)

dfs

  df['Company Name'] = df['Company Name'].str.replace('[^\w\s]', '')
  df['Company Name'] = df['Company Name'].str.replace('\s+', ' ')
  df['Company Name'] = df['Company Name'].str.replace('[^\w\s]', '')
  df['Company Name'] = df['Company Name'].str.replace('\s+', ' ')
  df['Company Name'] = df['Company Name'].str.replace('[^\w\s]', '')
  df['Company Name'] = df['Company Name'].str.replace('\s+', ' ')
  df['Company Name'] = df['Company Name'].str.replace('[^\w\s]', '')
  df['Company Name'] = df['Company Name'].str.replace('\s+', ' ')
  df['Company Name'] = df['Company Name'].str.replace('[^\w\s]', '')
  df['Company Name'] = df['Company Name'].str.replace('\s+', ' ')
  df['Company Name'] = df['Company Name'].str.replace('[^\w\s]', '')
  df['Company Name'] = df['Company Name'].str.replace('\s+', ' ')
  df['Company Name'] = df['Company Name'].str.replace('[^\w\s]', '')
  df['Company Name'] = df['Company Name'].str.replace('\s+', ' ')
  df['Company Name'] = df['Company Name'].str.replace('

[       Price  Company Name  Stops  Duration Destination From  Date  DayOfWeek  \
 0        254             3      0       278         ATL  LAX     0          4   
 1         73            13      1      1528         ATL  LAX     0          4   
 2        209             3      1       375         ATL  LAX     0          4   
 3        159            18      1       415         ATL  LAX     0          4   
 4        204            18      1       370         ATL  LAX     0          4   
 ...      ...           ...    ...       ...         ...  ...   ...        ...   
 34809   1174             5      3      1976         ATL  LAX    90          4   
 34810   1174             5      3      2259         ATL  LAX    90          4   
 34811   1201             5      3      2259         ATL  LAX    90          4   
 34812   1174             5      3      2314         ATL  LAX    90          4   
 34813   1201             5      3      2314         ATL  LAX    90          4   
 
        Month 

In [10]:
flights = pd.concat(dfs)
flights

Unnamed: 0,Price,Company Name,Stops,Duration,Destination,From,Date,DayOfWeek,Month
0,254,3,0,278,ATL,LAX,0,4,6
1,73,13,1,1528,ATL,LAX,0,4,6
2,209,3,1,375,ATL,LAX,0,4,6
3,159,18,1,415,ATL,LAX,0,4,6
4,204,18,1,370,ATL,LAX,0,4,6
...,...,...,...,...,...,...,...,...,...
11971,254,7,1,402,DFW,LAX,91,4,8
11972,155,20,1,467,DFW,LAX,91,4,8
11973,198,20,1,462,DFW,LAX,91,4,8
11974,314,7,1,380,DFW,LAX,91,4,8


In [11]:
has_missing_values = flights.isnull().any().any()
print(has_missing_values)

False


In [12]:
string_columns = flights.select_dtypes(include=[object]).columns
print(string_columns)

Index(['Stops', 'Destination', 'From'], dtype='object')


In [13]:
flights = flights[flights['Stops'].str.strip().astype(bool)]
destination = flights[['Destination']]
destination = pd.get_dummies(destination, prefix='', prefix_sep='')
flights = pd.concat([flights, destination], axis=1)
flights = flights.drop(["Destination", "From",], axis=1)
flights

Unnamed: 0,Price,Company Name,Stops,Duration,Date,DayOfWeek,Month,ATL,DEN,DFW,HNL,JFK,ORD,SFO
0,254,3,0,278,0,4,6,1,0,0,0,0,0,0
1,73,13,1,1528,0,4,6,1,0,0,0,0,0,0
2,209,3,1,375,0,4,6,1,0,0,0,0,0,0
3,159,18,1,415,0,4,6,1,0,0,0,0,0,0
4,204,18,1,370,0,4,6,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11971,254,7,1,402,91,4,8,0,0,1,0,0,0,0
11972,155,20,1,467,91,4,8,0,0,1,0,0,0,0
11973,198,20,1,462,91,4,8,0,0,1,0,0,0,0
11974,314,7,1,380,91,4,8,0,0,1,0,0,0,0


# Visualizations

#### Average Price by Airlines

In [None]:
import plotly.express as px

# Calculate average price and duration by company
average_price_duration = flights.groupby('Company Name').agg({'Price': 'mean', 'Duration': 'mean'}).reset_index()

# Sort the data by average price in ascending order
average_price_duration = average_price_duration.sort_values(by='Price', ascending=True)

# Create the bar plot using Plotly
fig = px.bar(average_price_duration, x='Company Name', y='Price', color='Duration',
             title='Average Price by Company Name', labels={'Price': 'Average Price', 'Duration': 'Average Duration'})
fig.update_layout(xaxis_tickangle=-45, title_x=0.5)
fig.show()


The higher the duration, the higher the corresponding price experienced.

#### Price Distribution by Month

In [None]:
fig = px.box(flights, x='Month', 
             y='Price', 
             color='Month', 
             title='Price Distribution by Month')
fig.update_layout(xaxis={'categoryorder':'total descending'}, title_x=0.5)
fig.show()

The price is typically lower in August, then June. Higher prices in July.

In [None]:
import plotly.express as px

fig = px.box(flights, x='DayOfWeek', y='Price', color='DayOfWeek', title='Price Distribution by Day of Week')

fig.update_layout(
    title='Price Distribution by Day of Week',
    yaxis_title='Price',  # Set y-axis title
    xaxis_title='Day of Week',  # Set x-axis title
    showlegend=False)

fig.show()


#### Look closer with day of week split into Month

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Group the data by month
grouped_data = flights.groupby('Month')

# Create subplots with shared y-axis
fig = make_subplots(rows=3, cols=1, shared_yaxes=True)

row = 1
for month, data in grouped_data:
    # Iterate over each day of the week
    for day in data['DayOfWeek'].unique():
        # Convert day to string
        day_str = str(day)
        # Create Violin trace for the day
        trace = go.Violin(
            x=data[data['DayOfWeek'] == day]['DayOfWeek'],
            y=data[data['DayOfWeek'] == day]['Price'],
            name=day_str,
            box_visible=True,
            meanline_visible=True
        )
        fig.add_trace(trace, row=row, col=1)
    # Update x-axis title with month
    fig.update_xaxes(title='Day - Month {}'.format(month), row=row, col=1)
    row += 1

# Update y-axis title for all subplots
fig.update_yaxes(title='Price', col=1)

# Update overall layout
fig.update_layout(height=900, 
                  width=600, 
                  showlegend=False, 
                  title='Price Distribution by Day of the Week')

# Show the plot
fig.show()


Prices are typically lower during Monday to Wednesay, then increase from Thursday to Sunday.

In [None]:
import plotly.graph_objects as go
import plotly.figure_factory as ff
import numpy as np

# Select the variables to include in the heatmap
selected_variables = ['Price', 'Company Name', 'Stops', 'Duration', 'DayOfWeek', 'Month', 'ATL', 'DEN', 'DFW', 'HNL', 'JFK', 'ORD', 'SFO']

# Calculate the correlation matrix for the selected variables
correlation_matrix = flights[selected_variables].corr()

# Round the correlation values to two decimal places
correlation_matrix = np.round(correlation_matrix, decimals=2)


# Create the heatmap
fig = ff.create_annotated_heatmap(
    z=correlation_matrix.values,
    x=list(correlation_matrix.columns),
    y=list(correlation_matrix.index),
    showscale=True,
    zmin=-1,
    zmax=1,
    annotation_text=correlation_matrix.values,
    text=correlation_matrix.values,
    hoverinfo='text'
)

# Set the title
fig.update_layout(title='Correlation Heatmap')

# Move x-axis labels to the bottom
fig.update_layout(xaxis=dict(side='bottom'), title_x=0.5)

# Show the plot
fig.show()


# 3 Building Models

## Feature Importance

In [14]:
# Fit the ExtraTreesRegressor model
X_features = flights[['Company Name', 'Stops', 'Duration', 'Date','DayOfWeek', 'Month', 'ATL', 'DEN', 'DFW', 'HNL', 'JFK', 'ORD', 'SFO']]
X_features = np.array(X_features)
y = flights['Price']
selection =ExtraTreesRegressor()
selection.fit(X_features,y)
importances = permutation_importance(selection, X_features, y).importances_mean
importances

array([1.05010047, 0.41133012, 1.2046276 , 0.43998788, 0.33015194,
       0.37570533, 0.11051443, 0.02664814, 0.09563612, 0.89078259,
       0.17823166, 0.03923762, 0.04909862])

In [15]:
X_df = pd.DataFrame(X_features, columns=('Company Name', 'Stops', 'Duration', 'Date', 'DayOfWeek', 'Month', 'ATL', 'DEN', 'DFW', 'HNL', 'JFK', 'ORD', 'SFO'))

# Sort feature importances in descending order
sorted_indices = importances.argsort()[::-1]
sorted_importances = importances[sorted_indices]
sorted_features = X_df.columns[sorted_indices]

# Plotting graph of important features
fig = px.bar(
    x=sorted_importances,
    y=sorted_features,
    orientation='h'
)

fig.update_layout(
    title='Important Features',
    xaxis_title='Importance',
    yaxis_title='Features',
    xaxis=dict(side='bottom'),
    yaxis=dict(autorange="reversed")
)

fig.show()


In [16]:
# Drop the less important features
important_features = sorted_features[:7]  # Select the top 7 features
X_selected = X_df[important_features]
X_selected

Unnamed: 0,Duration,Company Name,HNL,Date,Stops,Month,DayOfWeek
0,278,3,0,0,0,6,4
1,1528,13,0,0,1,6,4
2,375,3,0,0,1,6,4
3,415,18,0,0,1,6,4
4,370,18,0,0,1,6,4
...,...,...,...,...,...,...,...
158519,402,7,0,91,1,8,4
158520,467,20,0,91,1,8,4
158521,462,20,0,91,1,8,4
158522,380,7,0,91,1,8,4


# Machine Leaning

In [17]:
X = flights[['Company Name', 'Stops', 'Duration', 'Date','DayOfWeek', 'Month']]
X = np.array(X)
y = flights['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [18]:
X

array([[3, 0, 278, 0, 4, 6],
       [13, 1, 1528, 0, 4, 6],
       [3, 1, 375, 0, 4, 6],
       ...,
       [20, 1, 462, 91, 4, 8],
       [7, 1, 380, 91, 4, 8],
       [20, 1, 485, 91, 4, 8]], dtype=object)

In [19]:
X_train.shape, y_train.shape

((126819, 6), (126819,))

In [20]:
def get_metrics(model):
    print(f'Train score {model.score(X_train, y_train)}')
    print(f'Val score {model.score(X_val, y_val)}')
    print("MAE:", metrics.mean_absolute_error(y_val, model.predict(X_val)))
    print("MSE:", metrics.mean_squared_error(y_val, model.predict(X_val)))
    print("RMSE:", np.sqrt(metrics.mean_squared_error(y_val, model.predict(X_val))))
    print("R-squared:", r2_score(y_val, model.predict(X_val)))
    print("Explained variance score:", explained_variance_score(y_val, model.predict(X_val)))

In [21]:
print("Random Forest:")
rf = RandomForestRegressor()
rf.fit(X_train,y_train)
get_metrics(rf)

Random Forest:
Train score 0.9227052961979445
Val score 0.5716753965809591
MAE: 75.5541912553924
MSE: 22281.0276447203
RMSE: 149.26830756969244
R-squared: 0.5716753965809591
Explained variance score: 0.5717740704882736


In [51]:
print("Linear Regression:")
lr = LinearRegression()
lr.fit(X_train, y_train)
score = lr.score(X_val, y_val)
get_metrics(lr)

Linear Regression:
Train score 0.06555922856731378
Val score 0.06282171611193088
MAE: 144.28099246965476
MSE: 50742.77383636635
RMSE: 225.26156759724094
R-squared: 0.06282171611193088
Explained variance score: 0.06282270817624147


In [52]:
print("K-Nearest Neighbors:")
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
get_metrics(knn)


K-Nearest Neighbors:
Train score 0.47788811023268063
Val score 0.17252195168684747
MAE: 126.50492035956474
MSE: 44803.141709509546
RMSE: 211.66752634617703
R-squared: 0.17252195168684747
Explained variance score: 0.17270928724611578


In [53]:
#Decision Tree
print("Decision Tree:")
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
get_metrics(dt)

Decision Tree:
Train score 0.9739924645548973
Val score 0.30214975636008157
MAE: 88.1910798613458
MSE: 37784.54718110262
RMSE: 194.38247652785634
R-squared: 0.30214975636008157
Explained variance score: 0.3021581427965254


In [54]:
print("Gradient Boosting Regressor:")
gradient_boosting = GradientBoostingRegressor()
gradient_boosting.fit(X_train, y_train)
get_metrics(gradient_boosting)

Gradient Boosting Regressor:
Train score 0.2610571099360526
Val score 0.2606349742341193
MAE: 119.10307621832361
MSE: 40032.3320866002
RMSE: 200.08081388928875
R-squared: 0.2606349742341193
Explained variance score: 0.2606489642608968


In [None]:
print("Neural Network:")
neural_network = MLPRegressor()
neural_network.fit(X_train, y_train)
get_metrics(neural_network)

# Pick Random Forest and Decision Tree for Testing

## Random Forest

In [30]:
X_train_val = np.concatenate([X_train, X_val])
y_train_val = np.concatenate([y_train, y_val])

rf2 = RandomForestRegressor()
rf2.fit(X_train_val, y_train_val)

print("Random Forest")
print(f'Train score {rf2.score(X_train_val, y_train_val)}')
print(f'Val score {rf2.score(X_val, y_val)}')
print(f'Test score {rf2.score(X_test, y_test)}')
print("MAE:" , metrics.mean_absolute_error(y_test,rf2.predict(X_test)))
print("MSE:" , metrics.mean_squared_error(y_test,rf2.predict(X_test)))
print("RMSE:" , np.sqrt(metrics.mean_squared_error(y_test,rf2.predict(X_test))))
print("R-squared:", metrics.r2_score(y_test, rf2.predict(X_test)))
print("Explained variance score:", metrics.explained_variance_score(y_test, rf2.predict(X_test)))

Random Forest
Train score 0.9221188397283522
Val score 0.9155901405077875
Test score 0.924464679300678
MAE: 30.696683772530708
MSE: 4107.163825565567
RMSE: 64.08715803938857
R-squared: 0.924464679300678
Explained variance score: 0.9244657221557157


In [23]:
y_train_val_pred = rf2.predict(X_train_val)
y_test_pred = rf2.predict(X_test)

In [24]:
test_df = pd.DataFrame({
    "Predicted Price" :rf2.predict(X_test),
    "Actual Price" : y_test,
}).reset_index(drop = True)

test_df

Unnamed: 0,Predicted Price,Actual Price
0,221.2165,244
1,610.3690,586
2,266.1100,199
3,283.1700,263
4,285.5900,294
...,...,...
31700,417.2000,403
31701,626.8200,629
31702,396.4200,411
31703,193.8600,193


In [25]:
fig = px.scatter(
    flights,
    x=y_test_pred,
    y=y_test,
    opacity=0.7,
    color_discrete_sequence=['purple']
)

fig.update_layout(
    title='Actual vs. Predicted Airline Prices',
    xaxis_title='Predicted Airline Prices',
    title_x=0.5,
    yaxis_title='Actual Airline Prices'
)

fig.show()

## Decision Tree

In [31]:
dt2=DecisionTreeRegressor()
dt2.fit(X_train_val, y_train_val)
print("Decision Tree")
print(f'Train score {dt2.score(X_train_val, y_train_val)}')
print(f'Val score {rf2.score(X_val, y_val)}')
print(f'Test score {dt2.score(X_test, y_test)}')
print("MAE:" , metrics.mean_absolute_error(y_test,dt2.predict(X_test)))
print("MSE:" , metrics.mean_squared_error(y_test,dt2.predict(X_test)))
print("RMSE:" , np.sqrt(metrics.mean_squared_error(y_test,dt2.predict(X_test))))
print("R-squared:", metrics.r2_score(y_test, dt2.predict(X_test)))
print("Explained variance score:", metrics.explained_variance_score(y_test, dt2.predict(X_test)))

Decision Tree
Train score 0.9641966282704227
Val score 0.9155901405077875
Test score 0.9695146070735076
MAE: 9.198415220072294
MSE: 1657.6152967464132
RMSE: 40.7138219373521
R-squared: 0.9695146070735076
Explained variance score: 0.9695157291569133


In [27]:
y_train_val_pred = dt2.predict(X_train_val)
y_test_pred = dt2.predict(X_test)

In [28]:
test_df = pd.DataFrame({
    "Predicted Price" : dt2.predict(X_test),
    "Actual Price" : y_test,
}).reset_index(drop = True)

test_df

Unnamed: 0,Predicted Price,Actual Price
0,224.5,244
1,586.0,586
2,199.0,199
3,263.0,263
4,294.0,294
...,...,...
31700,403.0,403
31701,629.0,629
31702,411.0,411
31703,193.0,193


In [29]:
fig = px.scatter(
    flights,
    x=y_test_pred,
    y=y_test,
    opacity=0.7,
    color_discrete_sequence=['purple']
)

fig.update_layout(
    title='Actual vs. Predicted Airline Prices',
    xaxis_title='Predicted Airline Prices',
    title_x=0.5,
    yaxis_title='Actual Airline Prices'
)

fig.show()


# Hyperparameter Tuning

In [35]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the random forest
    'max_depth': [None, 5, 10],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Create the model
model = RandomForestRegressor()

# Create RandomizedSearchCV instance
random_search = RandomizedSearchCV(model, param_grid, cv=3, scoring='r2', n_iter=10) 

# Fit the data to perform random search
random_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Evaluate the model with the best hyperparameters
print("Best Hyperparameters:", best_params)
print("Best R-squared Score:", random_search.best_score_)
print("Test R-squared Score:", best_model.score(X_test, y_test))


Best Hyperparameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}
Best R-squared Score: 0.557795816828435
Test R-squared Score: 0.8041427107352647


In [36]:
model.fit(X_train, y_train)
predicted_price = model.predict(X_test)
predicted_price

array([243.31      , 362.41466667, 284.22      , ..., 195.36      ,
       578.65792857, 753.97032143])

In [37]:
y_train_val_pred2 = model.predict(X_train_val)
y_test_pred2 = model.predict(X_test)

In [38]:
test_df = pd.DataFrame({
    "Predicted Price" : model.predict(X_test),
    "Actual Price" : y_test,
}).reset_index(drop = True)

test_df

Unnamed: 0,Predicted Price,Actual Price
0,243.310000,220
1,362.414667,376
2,284.220000,283
3,327.270000,303
4,290.065833,264
...,...,...
31700,576.731393,588
31701,481.200000,587
31702,195.360000,182
31703,578.657929,778


In [39]:
# Scatter plot
fig = px.scatter(
    x=y_test_pred2,
    y=y_test,
    opacity=0.7,
    color_discrete_sequence=['purple'],
)

# Set plot title and axis labels
fig.update_layout(
    title='Actual vs. Predicted Airline Prices',
    xaxis_title='Predicted Airline Prices',
    title_x=0.5,
    yaxis_title='Actual Airline Prices'
)

# Show the plot
fig.show()

In [40]:
print("MAE:" , metrics.mean_absolute_error(y_test,predicted_price))
print("MSE:" , metrics.mean_squared_error(y_test,predicted_price))
print("RMSE:" , np.sqrt(metrics.mean_squared_error(y_test,predicted_price)))

MAE: 39.7751390435401
MSE: 8086.621830470356
RMSE: 89.92564612206218


Worse than before

# Cross Validation

In [72]:
from sklearn.model_selection import cross_val_score
# Perform k-fold cross-validation
scores = cross_val_score(rf2, X, y, cv=5, scoring='r2')

# Print the cross-validation scores
print("Cross-Validation Scores:", scores)
print("Mean R-squared:", scores.mean())

Cross-Validation Scores: [-0.27934215 -0.22262707 -1.64315278 -3.01958378 -0.23644482]
Mean R-squared: -1.0802301196505106


In [73]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

scoring = {'r2': make_scorer(r2_score), 'mse': 'neg_mean_squared_error'}

cv_results = cross_validate(dt2, X, y, cv=5, scoring=scoring)
scores_r2 = cv_results['test_r2']
scores_mse = cv_results['test_mse']

mean_r2 = np.mean(scores_r2)
mean_mse = -np.mean(scores_mse)

print("Cross-Validation R-squared Scores:", scores_r2)
print("Mean R-squared:", mean_r2)
print("Cross-Validation MSE Scores:", scores_mse)
print("Mean MSE:", mean_mse)


Cross-Validation R-squared Scores: [-0.98841223 -0.83633631 -2.73217012 -4.50496763 -0.38980802]
Mean R-squared: -1.8903388601715672
Cross-Validation MSE Scores: [ -76693.82809827  -55143.06179381 -103841.1454776  -394989.33516759
 -143072.49947432]
Mean MSE: 154747.97400231738


# Saving model

In [None]:
import pickle

# Assuming your trained model instance is named 'model'
# Save the model as a pickle file
with open('rf_flight_prediction.pkl', 'wb') as file:
    pickle.dump(rf2, file)


In [None]:
model = open('rf_flight_prediction.pkl','rb')
rf_flight_prediction = pkl.load(model)

In [None]:
print(f'R2 score {metrics.r2_score(y_test,rf_flight_prediction.predict(X_test))}')
print("MAE:" , metrics.mean_absolute_error(y_test,rf_flight_prediction.predict(X_test)))
print("MSE:" , metrics.mean_squared_error(y_test,rf_flight_prediction.predict(X_test)))
print("RMSE:" , np.sqrt(metrics.mean_squared_error(y_test, rf_flight_prediction.predict(X_test))))

# Python Script

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# User input
date = input("Enter the date (YYYY-MM-DD): ")
company_name = input("Enter the company name: ")
num_stops = int(input("Enter the number of stops: "))
duration = int(input("Enter the duration in minutes: "))
day_of_week = int(input("Enter the day of the week (0-6): "))
month = int(input("Enter the month (1-12): "))

# Convert the date string to a datetime object
date_obj = datetime.strptime(date, "%Y-%m-%d")

# Extract features from the date object
year = date_obj.year
weekday = date_obj.weekday()

# Create a DataFrame with the preprocessed input data
data = pd.DataFrame({
    'Year': [year],
    'Weekday': [weekday],
    'Number of Stops': [num_stops],
    'Duration': [duration],
    'Day of Week': [day_of_week],
    'Month': [month]
})

# Convert categorical features to numerical using one-hot encoding
categorical_cols = ['Company Name']
data_encoded = pd.get_dummies(data.drop(columns=categorical_cols), columns=categorical_cols)

# Get the list of feature names
feature_names = data_encoded.columns.tolist()

# Reorder columns to match the model's input
data_ordered = data_encoded[feature_names]

# Make predictions
predicted_price = model.predict(data_ordered)

# Display the predicted price
print("Predicted price:", predicted_price)
