In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl

sns.set()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics

In [2]:
df1 = pd.read_csv("flight_LAX_ATL_data_0601_0831.csv")
df2 = pd.read_csv("flight_LAX_CHI_data_0601_0831.csv")
df3 = pd.read_csv("flight_LAX_SFO_data_0601_0831.csv")
df4 = pd.read_csv("flight_LAX_JFK_data_0601_0831.csv")
df5 = pd.read_csv("flight_LAX_DFW_data_0601_0831.csv")
df6 = pd.read_csv("flight_LAX_HNL_data_0601_0831.csv")
df7 = pd.read_csv("flight_LAX_DEN_data_0601_0831.csv")

In [3]:
# convert duration to numerical format in minutes
def clean_duration(duration):
    durations = []
    for dur in duration:
        dur_str = str(dur)  # Convert duration to string
        dur_split = dur_str.split()
        hours = int(dur_split[0].split("h")[0]) if "h" in dur_split[0] else 0
        minutes = int(dur_split[1].split("m")[0]) if len(dur_split) > 1 and "m" in dur_split[1] else 0
        total_minutes = hours * 60 + minutes
        durations.append(total_minutes)
    return durations

# convert it to numerical
def clean_stops(stops):
    if stops == 'nonstop':
        return 0
    elif stops == '1 stop':
        return 1
    elif stops == '2 stops':
        return 2
    elif stops == '3 stops':
        return 3
    else:
        return np.nan

#split the date to day of week (0=Sunday,1=Monday, etc.) and month
def clean_date(date):
    date = pd.to_datetime(date)
    df['Date'] = pd.to_datetime(df['Date'])
    df['DayOfWeek'] = (df['Date'].dt.dayofweek + 1) % 7
    df['Month'] = df['Date'].dt.month
    return date

#remove uncessary space and punctuation
def clean_company_name(df):
    # Remove leading and trailing whitespace
    df['Company Name'] = df['Company Name'].str.strip()
    # Remove punctuation
    df['Company Name'] = df['Company Name'].str.replace('[^\w\s]', '')
    # Remove extra whitespace within the company name
    df['Company Name'] = df['Company Name'].str.replace('\s+', ' ')
    return df

#convert date and company name into numerical representations

def preprocess(df):
    df = clean_company_name(df)  # Clean company names first
    le = LabelEncoder()
    df['Date'] = le.fit_transform(df['Date'])
    df['Company Name'] = le.fit_transform(df['Company Name'])
    return df

def clean_destination(df):
    le = LabelEncoder()
    df['Destination'] = le.fit_transform(df['Destination'])
    return df

In [4]:
dfs=[df1,df2,df3,df4,df5,df6,df7]
flights = pd.concat(dfs)
flights

Unnamed: 0,Price,Company Name,Stops,Duration,Destination,From,Date
0,254,American Airlines,nonstop,4h 38m,ATL,LAX,6/1/23
1,73,Spirit Airlines,1 stop,25h 28m,ATL,LAX,6/1/23
2,209,American Airlines,1 stop,6h 15m,ATL,LAX,6/1/23
3,159,United Airlines,1 stop,6h 55m,ATL,LAX,6/1/23
4,204,United Airlines,1 stop,6h 10m,ATL,LAX,6/1/23
...,...,...,...,...,...,...,...
11966,1073,American Airlines,1 stop,8h 57m,DEN,LAX,2023-08-31
11967,240,"Allegiant Air, Frontier",2 stops,34h 48m,DEN,LAX,2023-08-31
11968,788,Alaska Airlines,1 stop,19h 18m,DEN,LAX,2023-08-31
11969,222,"Southern / Mokulele, Frontier",2 stops,41h 20m,DEN,LAX,2023-08-31


In [5]:
for df in dfs:
    df['Duration'] = clean_duration(df['Duration'])
    df['Stops'] = df['Stops'].apply(clean_stops).astype(float).fillna(-1).astype(int)
    df['Stops'] = df['Stops'].replace(-1, '')
    df = clean_company_name(df)
    df['Date'] = clean_date(df['Date'])
    df = preprocess(df)

  df['Company Name'] = df['Company Name'].str.replace('[^\w\s]', '')
  df['Company Name'] = df['Company Name'].str.replace('\s+', ' ')


In [6]:
flights = pd.concat(dfs)
le = LabelEncoder()
flights['Destination'] = le.fit_transform(flights['Destination']) 
flights = flights.drop('From', axis=1)
flights

Unnamed: 0,Price,Company Name,Stops,Duration,Destination,Date,DayOfWeek,Month
0,254,3,0,278,0,0,4,6
1,73,13,1,1528,0,0,4,6
2,209,3,1,375,0,0,4,6
3,159,18,1,415,0,0,4,6
4,204,18,1,370,0,0,4,6
...,...,...,...,...,...,...,...,...
11966,1073,3,1,537,1,91,4,8
11967,240,2,2,2088,1,91,4,8
11968,788,0,1,1158,1,91,4,8
11969,222,9,2,2480,1,91,4,8


In [7]:
flights = flights[flights['Stops'].str.strip().astype(bool)]
flights

Unnamed: 0,Price,Company Name,Stops,Duration,Destination,Date,DayOfWeek,Month
0,254,3,0,278,0,0,4,6
1,73,13,1,1528,0,0,4,6
2,209,3,1,375,0,0,4,6
3,159,18,1,415,0,0,4,6
4,204,18,1,370,0,0,4,6
...,...,...,...,...,...,...,...,...
11966,1073,3,1,537,1,91,4,8
11967,240,2,2,2088,1,91,4,8
11968,788,0,1,1158,1,91,4,8
11969,222,9,2,2480,1,91,4,8


In [8]:
X = flights[['Company Name', 'Stops', 'Duration', 'Destination','Date','DayOfWeek', 'Month']]
X = np.array(X)
y = flights['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

In [9]:
X_train_val = np.concatenate([X_train, X_val])
y_train_val = np.concatenate([y_train, y_val])

rf2 = RandomForestRegressor()
rf2.fit(X_train_val, y_train_val)

print("Random Forest")
print(f'Train score {rf2.score(X_train_val, y_train_val)}')
print(f'Val score {rf2.score(X_val, y_val)}')
print(f'Test score {rf2.score(X_test, y_test)}')
print("MAE:" , metrics.mean_absolute_error(y_test,rf2.predict(X_test)))
print("MSE:" , metrics.mean_squared_error(y_test,rf2.predict(X_test)))
print("RMSE:" , np.sqrt(metrics.mean_squared_error(y_test,rf2.predict(X_test))))
print("R-squared:", metrics.r2_score(y_test, rf2.predict(X_test)))
print("Explained variance score:", metrics.explained_variance_score(y_test, rf2.predict(X_test)))

Random Forest
Train score 0.9468216674215333
Val score 0.9414896536792225
Test score 0.9466986216814559
MAE: 25.883998181148836
MSE: 3002.9732800608467
RMSE: 54.79939123804978
R-squared: 0.9466986216814559
Explained variance score: 0.9466987333731206


In [10]:
y_train_val_pred = rf2.predict(X_train_val)
y_test_pred = rf2.predict(X_test)

In [13]:
with open("model.pkl", "wb") as file:
    pkl.dump(rf2, file)