## ***Summary of the notebook***

*In this notebook, I have trained the model to be used for inference.*

***Index***
1. Import libraries and data
2. Data Cleaning
3. Label encoding
4. Training Models
5. Hyperparameter Optimization
6. Training model on the whole dataset

## **<ins>Import libraries and data</ins>**

In [3]:
import re
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    RandomizedSearchCV,
    cross_val_score,
    train_test_split,
)
from sklearn.tree import DecisionTreeRegressor

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')

In [4]:
# Absolute path of the raw flight-fare spreadsheet (judging by the path, a
# Google Drive folder mounted in Colab -- adjust when running elsewhere).
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Projects/Flight price prediction - 3/data.xlsx'

# Read the sheet, then lower-case every column name so later cells can use
# consistent lower-case identifiers.
data = pd.read_excel(DATA_PATH)
data.columns = data.columns.str.lower()
data.head()

Unnamed: 0,airline,date_of_journey,source,destination,route,dep_time,arrival_time,duration,total_stops,additional_info,price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


## **<ins>Data Cleaning</ins>**
This is copied from my [Data Cleaning notebook](https://github.com/abhinavnayak11/flight-prediction/blob/main/notebooks/Data_Cleaning.ipynb)

In [5]:
# NOTE: every row label dropped below refers to the index of the freshly loaded
# frame, so this cell must run exactly once, right after the load cell.
# Re-running it on the already cleaned frame raises a KeyError.
#
# Each step rebinds `data` to a new frame instead of mutating a filtered slice
# in place, so the column assignments further down never write into a view.

# categories that are filtered out as outliers
excluded_airlines = ['Trujet', 'Multiple carriers Premium economy', 'Vistara Premium economy']
excluded_stops = ['4 stops', '3 stops']

# has null values
data = data.drop(index = 9039)

# outliers in 'price'
data = data[data['price'] < 40000]

# outliers in 'airline'
data = data[~data['airline'].isin(excluded_airlines)]
data = data.drop(index = 5136)

# outliers in 'source'
data = data.drop(index = [10052, 268])

# outliers in 'destination'
data = data.drop(index = [2099, 2693, 5719])

# outliers in 'total_stops'
data = data[~data['total_stops'].isin(excluded_stops)]
data = data.drop(index = [1629, 7617, 8598])

# creating new features from 'date_of_journey' (day/month/year strings)
journey_date = pd.to_datetime(data['date_of_journey'], format='%d/%m/%Y')
data['doj_month'] = journey_date.dt.month
data['doj_day'] = journey_date.dt.day
data['doj_dayofweek'] = journey_date.dt.dayofweek
data['doj_weekofyear'] = journey_date.dt.isocalendar().week.astype(int)

# outliers in 'doj_month'
data = data.drop(index = [6991])

# outliers in 'doj_day'
data = data.drop(index = [396, 628, 2108])

# creating new features from 'dep_time': a single vectorised extraction of the
# first "<digits>:<digits>" match gives the hour (column 0) and minute (column 1)
dep_parts = data['dep_time'].str.extract(r'(\d+):(\d+)')
data['dep_hour'] = dep_parts[0].astype(int)
data['dep_min'] = dep_parts[1].astype(int)

# dropping features which are not needed
data = data.drop(columns = ['date_of_journey', 'dep_time', 'route', 'additional_info', 'arrival_time', 'duration'])

data.head()

Unnamed: 0,airline,source,destination,total_stops,price,doj_month,doj_day,doj_dayofweek,doj_weekofyear,dep_hour,dep_min
0,IndiGo,Banglore,New Delhi,non-stop,3897,3,24,6,12,22,20
1,Air India,Kolkata,Banglore,2 stops,7662,5,1,2,18,5,50
2,Jet Airways,Delhi,Cochin,2 stops,13882,6,9,6,23,9,25
3,IndiGo,Kolkata,Banglore,1 stop,6218,5,12,6,19,18,5
4,IndiGo,Banglore,New Delhi,1 stop,13302,3,1,4,9,16,50


## **<ins>Label Encoding categorical features</ins>**

In [6]:
# Number of stops becomes its natural integer count.
stops_mapping = {'1 stop':1, 'non-stop':0, '2 stops':2}
data['total_stops'] = data['total_stops'].map(stops_mapping).astype(int)

# Integer-encode the remaining categorical columns: categories are numbered in
# the order returned by value_counts(), i.e. the most frequent category gets 0.
# Every mapping is kept in `category_mappings` so the exact same codes can be
# applied to new rows when the saved model is used for inference.
category_mappings = {'total_stops': stops_mapping}
for col in ['airline','source', 'destination']:
  cats = list(data[col].value_counts().index)
  mapping = {cat: code for code, cat in enumerate(cats)}
  category_mappings[col] = mapping
  data[col] = data[col].map(mapping)

In [7]:
# Display the encoded frame to check that the categorical columns now hold integer codes.
data

Unnamed: 0,airline,source,destination,total_stops,price,doj_month,doj_day,doj_dayofweek,doj_weekofyear,dep_hour,dep_min
0,1,2,3,0,3897,3,24,6,12,22,20
1,2,1,1,2,7662,5,1,2,18,5,50
2,0,0,0,2,13882,6,9,6,23,9,25
3,1,1,1,1,6218,5,12,6,19,18,5
4,1,2,3,1,13302,3,1,4,9,16,50
...,...,...,...,...,...,...,...,...,...,...,...
10678,6,1,1,0,4107,4,9,1,15,19,55
10679,2,1,1,0,4145,4,27,5,17,20,45
10680,0,2,2,0,7229,4,27,5,17,8,20
10681,5,2,3,0,12648,3,1,4,9,11,30


## **<ins>Training Models</ins>**

### **Split into features and label**

In [9]:
# Feature matrix: every column except the target.
X = data.drop(columns = 'price')

# Target vector ('price') as a plain NumPy array.
y = data['price'].to_numpy()

### **Define all the models**

In [11]:
# Single seed shared by every estimator that accepts one.
SEED = 42

# Candidate regressors, keyed by the short name shown in the comparison table.
models = dict(
    linearreg = LinearRegression(),
    ridge = Ridge(random_state = SEED),
    lasso = Lasso(random_state = SEED),
    decisiontree = DecisionTreeRegressor(random_state = SEED),
    randomforest = RandomForestRegressor(random_state = SEED),
    extratrees = ExtraTreesRegressor(random_state = SEED),
    lightgbm = LGBMRegressor(random_state = SEED),
    catboost = CatBoostRegressor(random_state = SEED, verbose = 0),
    xgboost = XGBRegressor(random_state = SEED, objective = 'reg:squarederror'),
)

### **Comparing performance of different models**

In [13]:
# Seeded, shuffled 5-fold splitter; built once and reused for every model so
# all candidates are scored on the same folds.
cv = KFold(n_splits = 5, shuffle = True, random_state = 42)

# Mean cross-validation score of each model, in the iteration order of `models`.
model_score = []
for model_name, model in models.items():
  print(f"Training {model_name}...", end=" ")
  fold_scores = cross_val_score(model, X, y, cv = cv)
  model_score.append(fold_scores.mean())
  print("Done")

Training linearreg... Done
Training ridge... Done
Training lasso... Done
Training decisiontree... Done
Training randomforest... Done
Training extratrees... Done
Training lightgbm... Done
Training catboost... Done
Training xgboost... Done


In [14]:
# Pair each model name with its mean cross-validation score ...
df = pd.DataFrame({
    'model' : list(models),
    'score' : model_score,
})

# ... and show the table ordered from worst to best.
df.sort_values(by = 'score')

Unnamed: 0,model,score
2,lasso,0.50076
1,ridge,0.50499
0,linearreg,0.510984
3,decisiontree,0.786192
8,xgboost,0.786575
5,extratrees,0.818534
4,randomforest,0.834206
6,lightgbm,0.844629
7,catboost,0.858038


CatBoost gives the best results, so we will tune it first and then save the tuned model for inference.

## **<ins>Hyperparameter Optimization</ins>**

### **GridSearchCV**

In [27]:
# Grid explored exhaustively by GridSearchCV: 4 depths x 3 learning rates.
param_grid = dict(
    depth = [3, 6, 9, 11],
    learning_rate = [0.01, 0.1, 1.0],
)

In [28]:
# Exhaustive search over `param_grid`, scored by R² on a seeded, shuffled
# 5-fold split. n_jobs = -1 runs the candidate fits on all available cores.
model = CatBoostRegressor(random_state = 42, verbose=0)

cv = KFold(n_splits = 5, shuffle = True, random_state = 42)
grid_model = GridSearchCV(estimator = model, param_grid = param_grid, scoring = 'r2', cv = cv, n_jobs = -1)
grid_model.fit(X, y)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             error_score=nan,
             estimator=<catboost.core.CatBoostRegressor object at 0x7f8488dcabd0>,
             iid='deprecated', n_jobs=-1,
             param_grid={'depth': [3, 6, 9, 11],
                         'learning_rate': [0.01, 0.1, 1.0]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='r2', verbose=0)

In [29]:
# Report the best mean cross-validation score and the parameters that achieved it.
best_grid_score = grid_model.best_score_
best_grid_params = grid_model.best_params_
print(f"Best score: {best_grid_score}")
print(f"Best params: {best_grid_params}")

Best score: 0.8593965490268852
Best params: {'depth': 6, 'learning_rate': 0.1}


Tuning raised the cross-validated R² score only marginally, from 0.8580 (default parameters) to 0.8594.

### **RandomizedSearchCV**

In [23]:
# Candidate values sampled by RandomizedSearchCV.
# NOTE(review): the CatBoost docs describe `min_data_in_leaf` as applying only
# to the Lossguide/Depthwise grow policies; with the default policy it may have
# no effect on the fitted model -- confirm before relying on it.
param_distributions = dict(
    depth = [3, 6, 9, 11],
    learning_rate = [0.001, 0.01, 0.1, 1.0],
    min_data_in_leaf = [5, 20, 60, 100],
)

In [25]:
# Randomized search: 15 parameter combinations sampled from
# `param_distributions`, scored by R² on a seeded, shuffled 5-fold split.
model = CatBoostRegressor(random_state = 42, verbose=0)

cv = KFold(n_splits = 5, shuffle = True, random_state = 42)
randomized_model = RandomizedSearchCV(
    estimator = model,
    param_distributions = param_distributions,
    n_iter = 15,
    scoring = 'r2',
    cv = cv,
    n_jobs = -1,
    random_state = 42,   # seed the candidate sampling so a re-run tries the same combinations
)
randomized_model.fit(X, y)

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
                   error_score=nan,
                   estimator=<catboost.core.CatBoostRegressor object at 0x7f8488ee3ad0>,
                   iid='deprecated', n_iter=15, n_jobs=-1,
                   param_distributions={'depth': [3, 6, 9, 11],
                                        'learning_rate': [0.001, 0.01, 0.1,
                                                          1.0],
                                        'min_data_in_leaf': [5, 20, 60, 100]},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False, scoring='r2', verbose=0)

In [26]:
# Report the best mean cross-validation score and the parameters that achieved it.
best_random_score = randomized_model.best_score_
best_random_params = randomized_model.best_params_
print(f"Best score: {best_random_score}")
print(f"Best params: {best_random_params}")

Best score: 0.8593965490268852
Best params: {'min_data_in_leaf': 5, 'learning_rate': 0.1, 'depth': 6}


The model's score has increased from 0.8580 to 0.85939, which is identical to the best score from GridSearchCV: both searches selected `depth = 6` and `learning_rate = 0.1`.

## **<ins>Training model on the whole dataset</ins>**

We will now retrain the best-performing model, with its best parameters, on the whole dataset.   
This generally improves the model's performance, because it is trained on more data than in any single cross-validation fold.

In [31]:
# Final model: CatBoost with the parameters selected by the grid search,
# fitted on every available row. The parameters are read from
# `grid_model.best_params_` instead of being copied by hand, so the saved model
# cannot drift from the search result.
model = CatBoostRegressor(**grid_model.best_params_, random_state = 42, verbose = 0)
model.fit(X, y)

<catboost.core.CatBoostRegressor at 0x7f8488db3750>

### **Save model for inference**

In [32]:
# Serialise the fitted model into the current working directory.
MODEL_PATH = 'model.pkl'
joblib.dump(model, MODEL_PATH)

['model.pkl']

___