In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl

sns.set()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics

In [2]:
flights = pd.read_csv('flights.csv')

In [3]:
import preprocessing

In [4]:
flights

Unnamed: 0,Price,Company Name,Stops,Duration,Destination,From,Date
0,254,American Airlines,nonstop,4h 38m,ATL,LAX,6/1/23
1,73,Spirit Airlines,1 stop,25h 28m,ATL,LAX,6/1/23
2,209,American Airlines,1 stop,6h 15m,ATL,LAX,6/1/23
3,159,United Airlines,1 stop,6h 55m,ATL,LAX,6/1/23
4,204,United Airlines,1 stop,6h 10m,ATL,LAX,6/1/23
...,...,...,...,...,...,...,...
158633,982,American Airlines,1 stop,21h 55m,SFO,LAX,8/31/23
158634,712,"Spirit Airlines, Sun Country Air",2 stops,31h 15m,SFO,LAX,8/31/23
158635,702,"Spirit Airlines, Sun Country Air",2 stops,32h 27m,SFO,LAX,8/31/23
158636,737,"Spirit Airlines, Sun Country Air",2 stops,32h 14m,SFO,LAX,8/31/23


In [5]:
flights['Duration'] = preprocessing.clean_duration(flights['Duration'])
flights['Stops'] = flights['Stops'].apply(preprocessing.clean_stops).astype(float).fillna(-1).astype(int)
flights['Stops'] = flights['Stops'].replace(-1, '')
flights = preprocessing.clean_company_name(flights)
flights = preprocessing.clean_date(flights)
flights = preprocessing.preprocess(flights)

  df['Company Name'] = df['Company Name'].str.replace('[^\w\s]', '')
  df['Company Name'] = df['Company Name'].str.replace('\s+', ' ')
  df['Company Name'] = df['Company Name'].str.replace('[^\w\s]', '')
  df['Company Name'] = df['Company Name'].str.replace('\s+', ' ')


In [6]:
le = LabelEncoder()
flights['Destination'] = le.fit_transform(flights['Destination']) 
flights = flights.drop('From', axis=1)
flights

Unnamed: 0,Price,Company Name,Stops,Duration,Destination,Date,DayOfWeek,Month
0,254,7,0,278,0,0,4,6
1,73,24,1,1528,0,0,4,6
2,209,7,1,375,0,0,4,6
3,159,44,1,415,0,0,4,6
4,204,44,1,370,0,0,4,6
...,...,...,...,...,...,...,...,...
158633,982,7,1,1315,6,91,4,8
158634,712,34,2,1875,6,91,4,8
158635,702,34,2,1947,6,91,4,8
158636,737,34,2,1934,6,91,4,8


In [7]:
flights = flights[flights['Stops'].str.strip().astype(bool)]
flights

Unnamed: 0,Price,Company Name,Stops,Duration,Destination,Date,DayOfWeek,Month
0,254,7,0,278,0,0,4,6
1,73,24,1,1528,0,0,4,6
2,209,7,1,375,0,0,4,6
3,159,44,1,415,0,0,4,6
4,204,44,1,370,0,0,4,6
...,...,...,...,...,...,...,...,...
158633,982,7,1,1315,6,91,4,8
158634,712,34,2,1875,6,91,4,8
158635,702,34,2,1947,6,91,4,8
158636,737,34,2,1934,6,91,4,8


In [8]:
X = flights[['Company Name', 'Stops', 'Duration', 'Destination','Date','DayOfWeek', 'Month']]
X = np.array(X)
y = flights['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [9]:
X_train_val = np.concatenate([X_train, X_val])
y_train_val = np.concatenate([y_train, y_val])

rf2 = RandomForestRegressor()
rf2.fit(X_train_val, y_train_val)

print("Random Forest")
print(f'Train score {rf2.score(X_train_val, y_train_val)}')
print(f'Val score {rf2.score(X_val, y_val)}')
print(f'Test score {rf2.score(X_test, y_test)}')
print("MAE:" , metrics.mean_absolute_error(y_test,rf2.predict(X_test)))
print("MSE:" , metrics.mean_squared_error(y_test,rf2.predict(X_test)))
print("RMSE:" , np.sqrt(metrics.mean_squared_error(y_test,rf2.predict(X_test))))
print("R-squared:", metrics.r2_score(y_test, rf2.predict(X_test)))
print("Explained variance score:", metrics.explained_variance_score(y_test, rf2.predict(X_test)))

Random Forest
Train score 0.9468882470611122
Val score 0.9426926558890034
Test score 0.70776453680818
MAE: 63.75882001808205
MSE: 16151.038061679663
RMSE: 127.08673440481371
R-squared: 0.70776453680818
Explained variance score: 0.707764908521413


In [10]:
y_train_val_pred = rf2.predict(X_train_val)
y_test_pred = rf2.predict(X_test)

In [10]:
with open("model.pkl", "wb") as file:
    pkl.dump(rf2, file)