In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import GridSearchCV
from joblib import dump, load
from sklearn.preprocessing import OneHotEncoder

In [3]:
df = pd.read_csv('../data_folder/processed_data.csv')
df.head()

Unnamed: 0,Journey_day,Airline,Flight_code,Class,Source,Departure,Total_stops,Arrival,Destination,Duration_in_hours,Days_left,Fare
0,Monday,SpiceJet,SG-8169,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5335
1,Monday,Indigo,6E-2519,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.3333,1,5899
2,Monday,GO FIRST,G8-354,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.1667,1,5801
3,Monday,SpiceJet,SG-8709,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5794
4,Monday,Air India,AI-805,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.1667,1,5955


In [4]:
# Dropping the Flight_code for now
# There are so many flight codes. Dropping it may or may note negatively impact the performance, but we'll see.
df = df.drop("Flight_code", axis=1)
df.head()

Unnamed: 0,Journey_day,Airline,Class,Source,Departure,Total_stops,Arrival,Destination,Duration_in_hours,Days_left,Fare
0,Monday,SpiceJet,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5335
1,Monday,Indigo,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.3333,1,5899
2,Monday,GO FIRST,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai,2.1667,1,5801
3,Monday,SpiceJet,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.0833,1,5794
4,Monday,Air India,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai,2.1667,1,5955


In [5]:
# Separating the numerical columns first
numeric_cols = df.select_dtypes(include="number")
# Separaating the object/string columns <- These will be encoded as these are categorical
object_cols = df.select_dtypes(include="object")

In [6]:
numeric_cols.head()

Unnamed: 0,Duration_in_hours,Days_left,Fare
0,2.0833,1,5335
1,2.3333,1,5899
2,2.1667,1,5801
3,2.0833,1,5794
4,2.1667,1,5955


In [7]:
object_cols.head()

Unnamed: 0,Journey_day,Airline,Class,Source,Departure,Total_stops,Arrival,Destination
0,Monday,SpiceJet,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai
1,Monday,Indigo,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai
2,Monday,GO FIRST,Economy,Delhi,After 6 PM,non-stop,Before 6 AM,Mumbai
3,Monday,SpiceJet,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai
4,Monday,Air India,Economy,Delhi,After 6 PM,non-stop,After 6 PM,Mumbai


In [8]:
# One hot encoding the object columns
obj_encoded = pd.get_dummies(data=object_cols, dtype=int, drop_first=True)
# Drop the first columns of the encoded dataframe to avoid inverted duplicates.
obj_encoded.head()

Unnamed: 0,Journey_day_Monday,Journey_day_Saturday,Journey_day_Sunday,Journey_day_Thursday,Journey_day_Tuesday,Journey_day_Wednesday,Airline_AirAsia,Airline_AkasaAir,Airline_AllianceAir,Airline_GO FIRST,...,Total_stops_non-stop,Arrival_6 AM - 12 PM,Arrival_After 6 PM,Arrival_Before 6 AM,Destination_Bangalore,Destination_Chennai,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_Mumbai
0,1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
2,1,0,0,0,0,0,0,0,0,1,...,1,0,0,1,0,0,0,0,0,1
3,1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1


In [9]:
# Concatenating encoded and numeric df
concat_df = pd.concat(objs=[numeric_cols, obj_encoded], axis=1) # colums with the columns
concat_df.head()

Unnamed: 0,Duration_in_hours,Days_left,Fare,Journey_day_Monday,Journey_day_Saturday,Journey_day_Sunday,Journey_day_Thursday,Journey_day_Tuesday,Journey_day_Wednesday,Airline_AirAsia,...,Total_stops_non-stop,Arrival_6 AM - 12 PM,Arrival_After 6 PM,Arrival_Before 6 AM,Destination_Bangalore,Destination_Chennai,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_Mumbai
0,2.0833,1,5335,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
1,2.3333,1,5899,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
2,2.1667,1,5801,1,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
3,2.0833,1,5794,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
4,2.1667,1,5955,1,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1


In [10]:
# Correlation of column fare with other columns
concat_df.corr()["Fare"]

Duration_in_hours         0.193277
Days_left                -0.087484
Fare                      1.000000
Journey_day_Monday        0.005998
Journey_day_Saturday      0.002550
Journey_day_Sunday        0.004333
Journey_day_Thursday     -0.012220
Journey_day_Tuesday      -0.003464
Journey_day_Wednesday     0.006274
Airline_AirAsia          -0.148664
Airline_AkasaAir         -0.067306
Airline_AllianceAir      -0.027457
Airline_GO FIRST         -0.115136
Airline_Indigo           -0.300160
Airline_SpiceJet         -0.083874
Airline_StarAir          -0.007595
Airline_Vistara           0.225099
Class_Economy            -0.702140
Class_First               0.033343
Class_Premium Economy    -0.142196
Source_Bangalore          0.006399
Source_Chennai            0.004621
Source_Delhi             -0.055638
Source_Hyderabad         -0.015988
Source_Kolkata            0.051180
Source_Mumbai             0.023188
Departure_6 AM - 12 PM    0.035424
Departure_After 6 PM      0.028727
Departure_Before 6 A

In [11]:
# Get label and features
y = concat_df["Fare"]
X = concat_df.drop("Fare", axis=1)

# Performing a train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
len(X_train), len(X_test)

(311756, 133610)

In [12]:
X_train_numeric = X_train[["Duration_in_hours", "Days_left"]]
X_train_numeric.head()


Unnamed: 0,Duration_in_hours,Days_left
192651,12.5,40
192308,12.25,39
97586,2.3333,47
46344,24.6667,37
175876,11.0,39


In [15]:
scaler = StandardScaler()
scaled_X_train_numeric = scaler.fit_transform(X_train_numeric)
scaled_X_train_numeric = pd.DataFrame(data=scaled_X_train_numeric, 
                                      columns=X_train_numeric.columns,
                                      index=X_train_numeric.index)
# Preserve the index and column while scaling
scaled_X_train_numeric.sample(10)

Unnamed: 0,Duration_in_hours,Days_left
124139,-0.971629,0.726879
22191,-0.960215,-1.091611
136199,-0.720705,0.37717
420520,0.761971,-0.532075
14156,0.294352,0.866763
195205,-1.427834,1.566182
44673,0.271551,0.37717
413235,0.396999,-0.252308
217399,0.054842,0.866763
365894,0.636509,0.167344


In [16]:
one_hot_X_train = X_train.drop(labels=X_train_numeric.columns, axis=1)
one_hot_X_train.head()

Unnamed: 0,Journey_day_Monday,Journey_day_Saturday,Journey_day_Sunday,Journey_day_Thursday,Journey_day_Tuesday,Journey_day_Wednesday,Airline_AirAsia,Airline_AkasaAir,Airline_AllianceAir,Airline_GO FIRST,...,Total_stops_non-stop,Arrival_6 AM - 12 PM,Arrival_After 6 PM,Arrival_Before 6 AM,Destination_Bangalore,Destination_Chennai,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_Mumbai
192651,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
192308,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
97586,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
46344,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
175876,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [17]:
# Repeating the same steps for X_test
X_test_numeric = X_test[["Duration_in_hours", "Days_left"]]
one_hot_X_test = X_test.drop(labels=X_train_numeric.columns, axis=1)
X_test_numeric

Unnamed: 0,Duration_in_hours,Days_left
227718,1.0000,48
358306,9.8333,48
72455,13.1667,47
93485,8.4167,32
442352,11.6667,29
...,...,...
29667,8.9167,31
75854,15.5833,15
228555,22.5833,4
434224,6.7500,14


In [18]:
scaled_X_test_numeric = scaler.transform(X_test_numeric)
scaled_X_test_numeric = pd.DataFrame(data=scaled_X_test_numeric, 
                                     columns=X_test_numeric.columns,
                                     index=X_test.index)
# Preserve the index and column while scaling
scaled_X_test_numeric.sample(10)

Unnamed: 0,Duration_in_hours,Days_left
207293,-0.561041,1.426299
418406,1.389255,-1.44132
182355,-0.74352,-1.161553
401548,1.548933,-1.511262
84182,-0.218884,-1.44132
403789,1.503316,-0.602017
119612,1.720011,-0.392192
102092,-0.98303,-1.021669
5223,-0.675089,-0.811843
64655,-1.290971,-0.462133


In [19]:
# Final scaled X_train and X_test 
X_train_f = pd.concat(objs=[scaled_X_train_numeric, one_hot_X_train], axis=1)
X_test_f = pd.concat(objs=[scaled_X_test_numeric, one_hot_X_test], axis=1)
X_train_f.shape, X_test_f.shape

((311756, 39), (133610, 39))

In [18]:
X_train_f.head()

Unnamed: 0,Duration_in_hours,Days_left,Journey_day_Monday,Journey_day_Saturday,Journey_day_Sunday,Journey_day_Thursday,Journey_day_Tuesday,Journey_day_Wednesday,Airline_AirAsia,Airline_AkasaAir,...,Total_stops_non-stop,Arrival_6 AM - 12 PM,Arrival_After 6 PM,Arrival_Before 6 AM,Destination_Bangalore,Destination_Chennai,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_Mumbai
192651,0.043441,1.006647,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
192308,0.009226,0.936705,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
97586,-1.348002,1.496241,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
46344,1.708611,0.796821,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
175876,-0.161853,0.936705,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [19]:
X_test_f.head()

Unnamed: 0,Duration_in_hours,Days_left,Journey_day_Monday,Journey_day_Saturday,Journey_day_Sunday,Journey_day_Thursday,Journey_day_Tuesday,Journey_day_Wednesday,Airline_AirAsia,Airline_AkasaAir,...,Total_stops_non-stop,Arrival_6 AM - 12 PM,Arrival_After 6 PM,Arrival_Before 6 AM,Destination_Bangalore,Destination_Chennai,Destination_Delhi,Destination_Hyderabad,Destination_Kolkata,Destination_Mumbai
227718,-1.530481,1.566182,0,1,0,0,0,0,0,1,...,1,0,1,0,0,1,0,0,0,0
358306,-0.321531,1.566182,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
72455,0.134688,1.496241,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
93485,-0.515411,0.447112,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
442352,-0.070606,0.237286,1,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [20]:
# All the columns we have in the new dataset
X_train_f.columns

Index(['Duration_in_hours', 'Days_left', 'Journey_day_Monday',
       'Journey_day_Saturday', 'Journey_day_Sunday', 'Journey_day_Thursday',
       'Journey_day_Tuesday', 'Journey_day_Wednesday', 'Airline_AirAsia',
       'Airline_AkasaAir', 'Airline_AllianceAir', 'Airline_GO FIRST',
       'Airline_Indigo', 'Airline_SpiceJet', 'Airline_StarAir',
       'Airline_Vistara', 'Class_Economy', 'Class_First',
       'Class_Premium Economy', 'Source_Bangalore', 'Source_Chennai',
       'Source_Delhi', 'Source_Hyderabad', 'Source_Kolkata', 'Source_Mumbai',
       'Departure_6 AM - 12 PM', 'Departure_After 6 PM',
       'Departure_Before 6 AM', 'Total_stops_2+-stop', 'Total_stops_non-stop',
       'Arrival_6 AM - 12 PM', 'Arrival_After 6 PM', 'Arrival_Before 6 AM',
       'Destination_Bangalore', 'Destination_Chennai', 'Destination_Delhi',
       'Destination_Hyderabad', 'Destination_Kolkata', 'Destination_Mumbai'],
      dtype='object')

## Modeling
The data is ready for training. Here are the steps we followed to prepare the dataset. 
1. Perform train-test split
2. Then separate numeric and one hot encodded columns from both the X_train and X_test
3. Then scale the numeric part using StandardScaler.
4. Do not scale the one hot encoded columns, those values are already either 0 or 1.
5. Then concatenate the scaled numeric part, and the corresponding one hot encoded columns.

Now we will train three models, linear, knn, and svm.
First we will train the base model (completely default params) and check the r2score.

#### Training Linear regression model

In [20]:
lin_model = LinearRegression()
lin_model.fit(X_train_f, y_train)
lin_pred = lin_model.predict(X_test_f)

lin_r2 = r2_score(y_true=y_test, y_pred=lin_pred)
print(f"R2 score of linear regression model: {lin_r2:.2f}")

R2 score of linear regression model: 0.85


In [21]:
mae_lin = mean_absolute_error(y_true=y_test, y_pred=lin_pred)
mae_lin

5157.279022559875

#### Training KNeighborsRegressor model

In [22]:
knn_r = KNeighborsRegressor() # completely default params
knn_r.fit(X_train_f, y_train)
knn_pred = knn_r.predict(X_test_f)

knn_r2 = r2_score(y_true=y_test, y_pred=knn_pred)
print(f"R2 score of the KNN regressor model: {knn_r2:.2f}")

R2 score of the KNN regressor model: 0.90


In [23]:
mean_absolute_error(
    y_test, knn_pred
)

3709.4491310530652

In [24]:
root_mean_squared_error(
    y_test, knn_pred
)

6455.108742871837

It's a very large dataset. So we are not going to train Support Vector Regressor on it. It's computationaly very expensive.
The KNeighborsRegressor is better than linear regression model, so we will go with the KNeighborsRegressor model.

#### Hyper parameter tuning of KNeighborsRegressor model

In [29]:
# Create a base model
knn_regressor = KNeighborsRegressor() # Base model (completely default params)

knn_params = {"n_neighbors":[5, 7],
              "weights":["uniform", "distance"],
            }

knn_tuned = GridSearchCV(
    estimator=knn_regressor,
    param_grid=knn_params,
    scoring="neg_mean_absolute_error",
    cv=5,
    verbose=2,
    n_jobs=-1 # all cpu cores for training
)

# Fit the grid instance
knn_tuned.fit(X_train_f, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [30]:
knn_tuned.best_params_

{'n_neighbors': 5, 'weights': 'distance'}

In [31]:
newknn_preds = knn_tuned.predict(X_test_f)

In [32]:
knn_tuned_mae = mean_absolute_error(
    y_true=y_test, y_pred=newknn_preds
)
knn_tuned_rmse = root_mean_squared_error(
    y_true=y_test, y_pred=newknn_preds
)

print("Performance metrics of the knn model whose hyper parameter are tuned!")
print(f"Mean absolute error: {knn_tuned_mae}")
print(f"Root mean absolute error: {knn_tuned_rmse}")

Performance metrics of the knn model whose hyper parameter are tuned!
Mean absolute error: 3605.6353439206287
Root mean absolute error: 6437.111009627264


Very slightly better than the base model.

**Note:**  
KNN algorithm is not good for very large datasets with so many entries. As there's a lot of computation in this algorithm. We still trained the knn regressor, but this is not recommended. It was done only for practice purpose. SVR is also very expensive in case of computation. So other algorithms, or just linear regression algorithm would be better. Till date, the moment at which I am writing this code, I only know till support vector machines. And I have to learn so much more. That is why I am unable to implement other machine learning algorithms.

In future I might update this project folder/repository!

Well, for now let's go with the knn regressor model and **SAVE IT!**

In [35]:
try:
    dump(value=scaler, filename='../models/scaler_obj.joblib')
    dump(value=knn_tuned, filename='../models/knn_tuned.joblib')
    print(f"Model and the scaler object have been saved!")
except Exception as e:
    print(f"Some exception occured: {e}")

Model and the scaler object have been saved!
