In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import *
from sklearn.linear_model import *
from sklearn.metrics import *
from sklearn.preprocessing import *
from sklearn.feature_selection import *
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.svm import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the two raw fare tables; both paths are relative to the notebook's working directory.
business = pd.read_csv("business.csv")
economy = pd.read_csv("economy.csv")

In [3]:
# Tag each fare table with its cabin class, then stack economy on top of business.
business["class"] = "Business"
economy["class"] = "Economy"
df = pd.concat([economy, business], axis=0, ignore_index=True)

# Reduce departure/arrival clock times to a 4-hour bucket index (hour // 4, i.e. 0-5).
for time_col in ("dep_time", "arr_time"):
    df[time_col] = pd.to_datetime(df[time_col]).dt.hour // 4

# ignore_index=True already yields a fresh 0..n-1 index after de-duplication.
df_1 = df.drop_duplicates(ignore_index=True)
df_1

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,class
0,11-02-2022,SpiceJet,SG,8709,4,Delhi,02h 10m,non-stop,5,Mumbai,5953,Economy
1,11-02-2022,SpiceJet,SG,8157,1,Delhi,02h 20m,non-stop,2,Mumbai,5953,Economy
2,11-02-2022,AirAsia,I5,764,1,Delhi,02h 10m,non-stop,1,Mumbai,5956,Economy
3,11-02-2022,Vistara,UK,995,2,Delhi,02h 15m,non-stop,3,Mumbai,5955,Economy
4,11-02-2022,Vistara,UK,963,2,Delhi,02h 20m,non-stop,2,Mumbai,5955,Economy
...,...,...,...,...,...,...,...,...,...,...,...,...
300254,31-03-2022,Vistara,UK,822,2,Chennai,10h 05m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,4,Hyderabad,69265,Business
300255,31-03-2022,Vistara,UK,826,3,Chennai,10h 25m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,5,Hyderabad,77105,Business
300256,31-03-2022,Vistara,UK,832,1,Chennai,13h 50m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,5,Hyderabad,79099,Business
300257,31-03-2022,Vistara,UK,828,1,Chennai,10h 00m,1-stop\n\t\t\t\t\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t...,4,Hyderabad,81585,Business


In [4]:
def split_and_convert_to_float(time_taken):
    """Convert a duration string such as ``"02h 10m"`` into fractional hours.

    The string is split on whitespace. The first token is read as hours and
    the second as minutes, each after dropping its trailing unit character.
    A token whose numeric part is empty (e.g. the ``"m"`` in ``"1.03h m"``)
    counts as zero, and so does a token that is missing altogether
    (e.g. ``"2h"``).

    Parameters
    ----------
    time_taken : str
        Duration text in the form ``"<hours>h <minutes>m"``.

    Returns
    -------
    float
        ``hours + minutes / 60``.

    Raises
    ------
    ValueError
        If the numeric part of a token cannot be parsed by ``float``.
    """
    def _numeric_part(token):
        # Drop the trailing unit letter; an empty remainder means "no value given".
        digits = token[:-1]
        return float(digits) if digits else 0.0

    # split() with no argument tolerates repeated or surrounding whitespace.
    tokens = time_taken.split()
    hours = _numeric_part(tokens[0]) if len(tokens) > 0 else 0.0
    minutes = _numeric_part(tokens[1]) if len(tokens) > 1 else 0.0
    return hours + minutes / 60


In [5]:
def clean_stop_data(dd):
    """Normalise a raw stop label to its bare form.

    Newline and tab characters are removed, anything from the first
    occurrence of ``"Via"`` onwards is cut off, and surrounding whitespace
    is stripped.

    Parameters
    ----------
    dd : str
        Raw stop text, possibly containing layout whitespace and a
        ``"Via ..."`` suffix.

    Returns
    -------
    str
        The cleaned label, e.g. ``"1-stop"``.
    """
    label = dd.replace("\n", "").replace("\t", "").strip()
    via_at = label.find("Via")
    if via_at != -1:
        # Keep only the part before the "Via <airport>" annotation.
        label = label[:via_at]
    return label.strip()

In [6]:
df_2 = df_1.copy()

# Durations become fractional hours; stop labels are reduced to their bare form.
df_2["time_taken"] = [split_and_convert_to_float(raw) for raw in df_2["time_taken"]]
df_2["stop"] = [clean_stop_data(raw) for raw in df_2["stop"]]

# Prices are text at this point: remove comma separators before casting to float.
price_text = df_2["price"].str.replace(",", "")
df_2["price"] = price_text.astype("float64")

# Cleaning can make rows identical, so de-duplicate again with a fresh 0..n-1 index.
df_2 = df_2.drop_duplicates(ignore_index=True)
df_2

Unnamed: 0,date,airline,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,class
0,11-02-2022,SpiceJet,SG,8709,4,Delhi,2.166667,non-stop,5,Mumbai,5953.0,Economy
1,11-02-2022,SpiceJet,SG,8157,1,Delhi,2.333333,non-stop,2,Mumbai,5953.0,Economy
2,11-02-2022,AirAsia,I5,764,1,Delhi,2.166667,non-stop,1,Mumbai,5956.0,Economy
3,11-02-2022,Vistara,UK,995,2,Delhi,2.250000,non-stop,3,Mumbai,5955.0,Economy
4,11-02-2022,Vistara,UK,963,2,Delhi,2.333333,non-stop,2,Mumbai,5955.0,Economy
...,...,...,...,...,...,...,...,...,...,...,...,...
300254,31-03-2022,Vistara,UK,822,2,Chennai,10.083333,1-stop,4,Hyderabad,69265.0,Business
300255,31-03-2022,Vistara,UK,826,3,Chennai,10.416667,1-stop,5,Hyderabad,77105.0,Business
300256,31-03-2022,Vistara,UK,832,1,Chennai,13.833333,1-stop,5,Hyderabad,79099.0,Business
300257,31-03-2022,Vistara,UK,828,1,Chennai,10.000000,1-stop,4,Hyderabad,81585.0,Business


In [7]:
# Distinct (airline, carrier code) pairs, most frequent pair first.
# The pairs are read from the value_counts index rather than by dropping the
# counts column by name, because that column's name differs between pandas versions.
pair_counts = df_2[["airline", "ch_code"]].value_counts()
airlines = pair_counts.index.to_frame(index=False)
airlines

Unnamed: 0,airline,ch_code
0,Vistara,UK
1,Air India,AI
2,Indigo,6E
3,GO FIRST,G8
4,AirAsia,I5
5,SpiceJet,SG
6,StarAir,S5
7,Trujet,2T


In [8]:
df_3 = df_2.drop(columns="airline")

# Build ONE city -> integer table from both endpoint columns. Deriving it from
# "from" alone would turn any destination that never appears as an origin into NaN.
all_cities = np.sort(pd.concat([df_3["from"], df_3["to"]]).unique())
city_unique = {city: code for code, city in enumerate(all_cities)}
df_3["from"] = df_3["from"].map(city_unique)
df_3["to"] = df_3["to"].map(city_unique)

# Classes are numbered in reverse sorted order, so with the labels assigned
# when the tables were combined: "Economy" -> 0, "Business" -> 1.
sorted_classes = np.sort(df_3["class"].unique())
top_code = len(sorted_classes) - 1
class_unique = {label: top_code - pos for pos, label in enumerate(sorted_classes)}

df_3["class"] = df_3["class"].map(class_unique)
df_3

Unnamed: 0,date,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,class
0,11-02-2022,SG,8709,4,2,2.166667,non-stop,5,5,5953.0,0
1,11-02-2022,SG,8157,1,2,2.333333,non-stop,2,5,5953.0,0
2,11-02-2022,I5,764,1,2,2.166667,non-stop,1,5,5956.0,0
3,11-02-2022,UK,995,2,2,2.250000,non-stop,3,5,5955.0,0
4,11-02-2022,UK,963,2,2,2.333333,non-stop,2,5,5955.0,0
...,...,...,...,...,...,...,...,...,...,...,...
300254,31-03-2022,UK,822,2,1,10.083333,1-stop,4,3,69265.0,1
300255,31-03-2022,UK,826,3,1,10.416667,1-stop,5,3,77105.0,1
300256,31-03-2022,UK,832,1,1,13.833333,1-stop,5,3,79099.0,1
300257,31-03-2022,UK,828,1,1,10.000000,1-stop,4,3,81585.0,1


In [9]:
df_4 = df_3.copy()

# First pass: number the sorted stop labels in descending order (first label gets the highest code).
stop_labels = np.sort(df_4["stop"].unique())
highest_code = len(stop_labels) - 1
stop_unique = {label: highest_code - pos for pos, label in enumerate(stop_labels)}

# Second pass: the first two labels in sorted order have their code replaced by 3 - code.
# NOTE(review): the constant 3 looks tuned to exactly three distinct stop labels - confirm.
for label in list(stop_unique)[:2]:
    stop_unique[label] = 3 - stop_unique[label]

df_4["stop"] = df_4["stop"].map(stop_unique)
df_4

Unnamed: 0,date,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,class
0,11-02-2022,SG,8709,4,2,2.166667,0,5,5,5953.0,0
1,11-02-2022,SG,8157,1,2,2.333333,0,2,5,5953.0,0
2,11-02-2022,I5,764,1,2,2.166667,0,1,5,5956.0,0
3,11-02-2022,UK,995,2,2,2.250000,0,3,5,5955.0,0
4,11-02-2022,UK,963,2,2,2.333333,0,2,5,5955.0,0
...,...,...,...,...,...,...,...,...,...,...,...
300254,31-03-2022,UK,822,2,1,10.083333,1,4,3,69265.0,1
300255,31-03-2022,UK,826,3,1,10.416667,1,5,3,77105.0,1
300256,31-03-2022,UK,832,1,1,13.833333,1,5,3,79099.0,1
300257,31-03-2022,UK,828,1,1,10.000000,1,4,3,81585.0,1


In [10]:
df_5 = df_4.copy()

# Number the carrier codes 0..k-1 in sorted order.
sorted_codes = np.sort(df_5["ch_code"].unique())
airline_unique = {code: position for position, code in enumerate(sorted_codes)}

df_5["ch_code"] = df_5["ch_code"].map(airline_unique)
df_5

Unnamed: 0,date,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,class
0,11-02-2022,6,8709,4,2,2.166667,0,5,5,5953.0,0
1,11-02-2022,6,8157,1,2,2.333333,0,2,5,5953.0,0
2,11-02-2022,4,764,1,2,2.166667,0,1,5,5956.0,0
3,11-02-2022,7,995,2,2,2.250000,0,3,5,5955.0,0
4,11-02-2022,7,963,2,2,2.333333,0,2,5,5955.0,0
...,...,...,...,...,...,...,...,...,...,...,...
300254,31-03-2022,7,822,2,1,10.083333,1,4,3,69265.0,1
300255,31-03-2022,7,826,3,1,10.416667,1,5,3,77105.0,1
300256,31-03-2022,7,832,1,1,13.833333,1,5,3,79099.0,1
300257,31-03-2022,7,828,1,1,10.000000,1,4,3,81585.0,1


In [11]:
# Show the carrier-code -> integer mapping so encoded ch_code values can be decoded by eye.
airline_unique

{'2T': 0, '6E': 1, 'AI': 2, 'G8': 3, 'I5': 4, 'S5': 5, 'SG': 6, 'UK': 7}

In [12]:
# De-duplicate the fully encoded frame (fresh 0..n-1 index), then discard the date column.
df_6 = df_5.drop_duplicates(ignore_index=True)
df_7 = df_6.drop(columns="date")
df_7

Unnamed: 0,ch_code,num_code,dep_time,from,time_taken,stop,arr_time,to,price,class
0,6,8709,4,2,2.166667,0,5,5,5953.0,0
1,6,8157,1,2,2.333333,0,2,5,5953.0,0
2,4,764,1,2,2.166667,0,1,5,5956.0,0
3,7,995,2,2,2.250000,0,3,5,5955.0,0
4,7,963,2,2,2.333333,0,2,5,5955.0,0
...,...,...,...,...,...,...,...,...,...,...
300254,7,822,2,1,10.083333,1,4,3,69265.0,1
300255,7,826,3,1,10.416667,1,5,3,77105.0,1
300256,7,832,1,1,13.833333,1,5,3,79099.0,1
300257,7,828,1,1,10.000000,1,4,3,81585.0,1


In [13]:
# Separate the regression target from the predictor columns.
target_col = "price"
y = df_7[target_col]
X = df_7.drop(columns=target_col)

In [14]:
# Univariate F-test of each raw feature against price; element [1] of the result holds the p-values.
f_reg = f_regression(X, y)
p_values = pd.DataFrame(
    {
        "features": X.columns.values,
        "p-values": f_reg[1].round(2),
    }
)
p_values

Unnamed: 0,features,p-values
0,ch_code,0.0
1,num_code,0.0
2,dep_time,0.0
3,from,0.01
4,time_taken,0.0
5,stop,0.0
6,arr_time,0.0
7,to,0.01
8,class,0.0


In [15]:
# Rescale every feature with MinMaxScaler and repeat the univariate F-test on the scaled frame.
# NOTE(review): the scaler is fitted on all rows before the train/test split below,
# so test-set minima/maxima influence the training features - confirm this is acceptable.
mms = MinMaxScaler()
scaled_values = mms.fit_transform(X)
X_res = pd.DataFrame(scaled_values, columns=X.columns)

f_res = f_regression(X_res, y)
p_values_res = pd.DataFrame(
    {
        "features": X_res.columns.values,
        "p-values": f_res[1].round(2),
    }
)
p_values_res

Unnamed: 0,features,p-values
0,ch_code,0.0
1,num_code,0.0
2,dep_time,0.0
3,from,0.01
4,time_taken,0.0
5,stop,0.0
6,arr_time,0.0
7,to,0.01
8,class,0.0


In [16]:
# Hold out 25% of the rows for evaluation. The seed is fixed so the split, and every
# metric computed from it, is identical on each Restart & Run All; a randomly drawn
# seed would make the reported errors impossible to reproduce.
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y, test_size=0.25, random_state=42
)

In [17]:
# Candidate regressors, all with library-default hyperparameters.
regr_linear = LinearRegression()
regr_lasso = Lasso()
regr_enet = ElasticNet()
regr_ridge = Ridge()
# LinearSVR and RandomForestRegressor use randomness while fitting; seed them
# so repeated runs give the same fitted models and the same error figures.
regr_linear_svr = LinearSVR(random_state=42)
regr_svr_rbf = SVR(kernel="rbf")
regr_rfr = RandomForestRegressor(random_state=42)

In [18]:
# Fit the plain linear regression on the training split.
regr_linear.fit(X_train,y_train)

In [19]:
# Fit the Lasso model (default settings) on the training split.
regr_lasso.fit(X_train,y_train)

In [20]:
# Fit the ElasticNet model (default settings) on the training split.
regr_enet.fit(X_train,y_train)

In [21]:
# Fit the Ridge model (default settings) on the training split.
regr_ridge.fit(X_train,y_train)

In [ ]:
# Fit the linear support vector regressor on the training split.
regr_linear_svr.fit(X_train,y_train)

In [ ]:
# Fit the RBF-kernel support vector regressor on the training split.
# NOTE(review): the 75,065-row test set at test_size=0.25 implies roughly 225k training
# rows; kernel SVR may take very long at that size - confirm the runtime is acceptable.
regr_svr_rbf.fit(X_train,y_train)

In [ ]:
# Fit the random forest regressor on the training split.
regr_rfr.fit(X_train,y_train)

In [312]:
# Predict prices for the held-out split with every fitted regressor, in creation order.
(
    y_pred_linear,
    y_pred_lasso,
    y_pred_enet,
    y_pred_ridge,
    y_pred_linear_svr,
    y_pred_svr_rbf,
    y_pred_rfr,
) = (
    fitted.predict(X_test)
    for fitted in (
        regr_linear,
        regr_lasso,
        regr_enet,
        regr_ridge,
        regr_linear_svr,
        regr_svr_rbf,
        regr_rfr,
    )
)

In [313]:
# Mean absolute error of each model's predictions against the true held-out prices.
(
    mae_linear,
    mae_lasso,
    mae_enet,
    mae_ridge,
    mae_linear_svr,
    mae_svr_rbf,
    mae_rfr,
) = (
    mean_absolute_error(y_test, predicted)
    for predicted in (
        y_pred_linear,
        y_pred_lasso,
        y_pred_enet,
        y_pred_ridge,
        y_pred_linear_svr,
        y_pred_svr_rbf,
        y_pred_rfr,
    )
)

2109.3602395362473

In [314]:
# Pair each display name with its error once, then derive the two parallel lists from
# that single table so the names and scores cannot drift out of order.
mae_by_regressor = {
    "Linear Regression": mae_linear,
    "Lasso Regression": mae_lasso,
    "Elastic Net Regression": mae_enet,
    "Ridge Regression": mae_ridge,
    "Linear Support Vector Regression": mae_linear_svr,
    "Support Vector Regression with Radial Basis Function Kernel": mae_svr_rbf,
    "Random Forest Regression": mae_rfr,
}
list_of_regressors = list(mae_by_regressor.keys())
mean_absolute_errors = list(mae_by_regressor.values())

array([ 2969.20859073, 24182.00966412,  7293.56156662, ...,
       63746.36955464,  3918.55574845,  3025.8105    ])

In [315]:
# Display the true held-out prices for an eyeball comparison with the predictions.
y_test

45515      5218.0
220982    24056.0
116157    10893.0
7870       7575.0
80529      3050.0
           ...   
177529     5493.0
135949    13728.0
264433    74273.0
4945       3100.0
127236     3014.0
Name: price, Length: 75065, dtype: float64