In [12]:
from pathlib import Path

import numpy as np
import pandas as pd

In [13]:
# Load the preprocessed dataset (relative path, resolved against the
# current working directory).
df = pd.read_csv("../data/preprocessed_data.csv")

## Developing a model


In [14]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction import DictVectorizer

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

In [15]:
# Show dtypes and non-null counts per column to spot missing values
# before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300259 entries, 0 to 300258
Data columns (total 14 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   airline         300259 non-null  object 
 1   from            300259 non-null  object 
 2   to              300259 non-null  object 
 3   price           300259 non-null  int64  
 4   class           300259 non-null  object 
 5   flight_code     300259 non-null  object 
 6   stops           300259 non-null  object 
 7   departure_time  300259 non-null  object 
 8   arrival_time    300259 non-null  object 
 9   datetime        300259 non-null  object 
 10  dow             300259 non-null  object 
 11  holiday         300259 non-null  int64  
 12  days_until      300259 non-null  int64  
 13  duration        300255 non-null  float64
dtypes: float64(1), int64(3), object(10)
memory usage: 32.1+ MB


In [16]:
# Remove rows with missing values (per df.info(), only `duration` has any),
# then discard the two columns that are not used as model features.
df = df.dropna().drop(columns=["datetime", "flight_code"])

In [17]:
# Peek at a few random rows. The fixed seed keeps the displayed sample
# identical across kernel restarts instead of changing on every run.
df.sample(3, random_state=57)

Unnamed: 0,airline,from,to,price,class,stops,departure_time,arrival_time,dow,holiday,days_until,duration
168173,Air India,Hyderabad,Bangalore,6734,economy,1,morning,evening,Tuesday,0,26,525.0
28331,Air India,Delhi,Kolkata,4748,economy,1,morning,afternoon,Tuesday,0,47,500.0
194083,Vistara,Chennai,Bangalore,12318,economy,1,evening,afternoon,Monday,0,4,1195.0


In [18]:
# Features handled as categories. `holiday` is stored as int64 but is listed
# here, so it is cast to str and encoded like the text columns.
categorical_columns = [
    "airline", "from", "to", "class",
    "departure_time", "arrival_time",
    "dow", "holiday",
]

# Features handled as continuous values.
numerical_columns = ["duration", "days_until"]

# Column the regressors are trained to predict.
target_column = "price"

## Splitting the data into training, validation and testing sets


In [19]:
# 60/20/20 split: hold out 20% of the rows for testing, then take 25% of the
# remaining 80% (= 20% of all rows) for validation. Both splits use seed 57.
df_train_val, df_test = train_test_split(df, test_size=0.2, random_state=57)
df_train, df_val = train_test_split(df_train_val, test_size=0.25, random_state=57)

# Give every subset a fresh 0..n-1 index (the shuffled original labels are dropped).
df_train, df_val, df_test = (
    subset.reset_index(drop=True) for subset in (df_train, df_val, df_test)
)

In [20]:
# Feature frames: categorical columns first, then numerical ones.
X_train, X_val, X_test = (
    subset[categorical_columns + numerical_columns]
    for subset in (df_train, df_val, df_test)
)

# Targets as plain NumPy arrays.
y_train, y_val, y_test = (
    subset[target_column].values for subset in (df_train, df_val, df_test)
)


### Encoding categorical features and scaling numerical features

In [21]:
def treat_categorical_cols(df_train: pd.DataFrame, cols: list, dv: DictVectorizer):
    """Replace the categorical columns of a frame with DictVectorizer features.

    Parameters
    ----------
    df_train : pd.DataFrame
        Frame containing at least the columns named in ``cols``. It is not
        modified.
    cols : list
        Names of the columns to encode. Their values are cast to ``str``
        before vectorizing.
    dv : DictVectorizer or None
        Already-fitted vectorizer to reuse. When ``None``, a new
        ``DictVectorizer(sparse=False)`` is fitted on this frame.

    Returns
    -------
    tuple
        ``(encoded_frame, dv)``: the non-encoded columns followed by the
        vectorizer's feature columns, carrying the same index as
        ``df_train``, plus the vectorizer that was used.
    """
    # Cast to str first so every value, including numeric flags, is handled
    # as a category label rather than as a number.
    records = df_train[cols].astype(str).to_dict(orient="records")

    if dv is None:
        dv = DictVectorizer(sparse=False)
        dv.fit(records)

    # Build the encoded frame on the caller's index. pd.concat(axis=1) aligns
    # on index labels, so a default RangeIndex here would misalign rows and
    # introduce NaNs whenever df_train is not indexed 0..n-1.
    encoded = pd.DataFrame(
        dv.transform(records),
        columns=dv.get_feature_names_out(),
        index=df_train.index,
    )

    untouched = df_train.drop(columns=cols)
    return pd.concat([untouched, encoded], axis=1), dv

In [22]:
# Fit the vectorizer on the training features only, then reuse the fitted
# vectorizer for validation and test.
# Note: this cell rebinds X_train / X_val / X_test, so it cannot be re-run on
# its own; the categorical columns are gone after the first pass.
X_train, dv = treat_categorical_cols(X_train, categorical_columns, dv=None)
X_val = treat_categorical_cols(X_val, categorical_columns, dv=dv)[0]
X_test = treat_categorical_cols(X_test, categorical_columns, dv=dv)[0]


In [23]:
def treat_numerical_columns(df_train: pd.DataFrame, cols: list, scaler: StandardScaler):
    """Return a copy of the frame with ``cols`` transformed by a scaler.

    Parameters
    ----------
    df_train : pd.DataFrame
        Frame containing the columns named in ``cols``. It is not modified.
    cols : list
        Names of the columns to transform.
    scaler : StandardScaler or None
        Already-fitted scaler to reuse. When ``None``, a new
        ``StandardScaler`` is fitted on ``cols`` of this frame.

    Returns
    -------
    tuple
        ``(scaled_frame, scaler)``: the transformed copy and the scaler that
        was used.
    """
    scaled = df_train.copy()

    if scaler is None:
        # fit() returns the estimator itself, so construction and fitting chain.
        scaler = StandardScaler().fit(scaled[cols])

    scaled[cols] = scaler.transform(scaled[cols])
    return scaled, scaler

In [24]:
# Fit the scaler on the training features only, then apply the same fitted
# scaler to validation and test.
X_train, scaler = treat_numerical_columns(X_train, numerical_columns, scaler=None)
X_val = treat_numerical_columns(X_val, numerical_columns, scaler=scaler)[0]
X_test = treat_numerical_columns(X_test, numerical_columns, scaler=scaler)[0]

## Training the model


In [25]:
# Baselines: fit three regressors on the training set and report RMSE / R2
# on the validation set, in the order they are listed below.
LR = LinearRegression()
RF = RandomForestRegressor(random_state=57, n_jobs=-1)
GB = GradientBoostingRegressor(random_state=57)

for position, (label, estimator) in enumerate(
    [("Linear Regression", LR), ("Random Forest", RF), ("Gradient Boosting", GB)]
):
    if position:
        print()  # blank line between reports, none after the last one
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_val)
    print(label)
    print(f"RMSE: {round(np.sqrt(mean_squared_error(y_val, y_pred)), 2)}")
    print(f"R2: {round(r2_score(y_val, y_pred), 2)}")

Linear Regression
RMSE: 7071.62
R2: 0.9

Random Forest
RMSE: 2847.38
R2: 0.98

Gradient Boosting
RMSE: 5022.1
R2: 0.95


In [26]:
# Score the three already-fitted baselines on the held-out test set.
for label, estimator in (
    ("Linear Regression", LR),
    ("Random Forest", RF),
    ("Gradient Boosting", GB),
):
    y_pred = estimator.predict(X_test)
    print(label)
    print(f"RMSE: {round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)}")
    print(f"R2: {round(r2_score(y_test, y_pred), 2)}")

Linear Regression
RMSE: 7091.5
R2: 0.9
Random Forest
RMSE: 2907.46
R2: 0.98
Gradient Boosting
RMSE: 5037.78
R2: 0.95


Comparing the three models, the Random Forest Regressor fits best: it has the lowest RMSE and the highest R2 on both the validation and the test set. We will now fine-tune this model.

In [27]:
from sklearn.model_selection import GridSearchCV

### Grid Search


In [28]:
# Search space: 3 x 2 x 2 x 1 = 12 candidates. `max_features` is pinned to
# "sqrt", so it is the only value the search can pick for that parameter.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[25, 50],
    min_samples_leaf=[1, 2],
    max_features=["sqrt"],
)

RF = RandomForestRegressor(random_state=57, n_jobs=-1)

# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score (R2 for regressors), not by the RMSE printed in
# the other cells. Confirm this is intended.
grid_search = GridSearchCV(
    estimator=RF,
    param_grid=param_grid,
    cv=3,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

grid_search.best_params_

Fitting 3 folds for each of 12 candidates, totalling 36 fits




{'max_depth': 25,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'n_estimators': 200}

In [30]:

# Refit a forest with the hyperparameters chosen by the grid search, then
# report it on validation (RMSE and R2) and on test (RMSE only).
RF = RandomForestRegressor(random_state=57, n_jobs=-1, **grid_search.best_params_)
RF.fit(X_train, y_train)

y_pred = RF.predict(X_val)
print(
    "Random Forest",
    f"RMSE: {round(np.sqrt(mean_squared_error(y_val, y_pred)), 2)}",
    f"R2: {round(r2_score(y_val, y_pred), 2)}",
    sep="\n",
)

y_pred = RF.predict(X_test)
print(
    "Random Forest",
    f"RMSE: {round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)}",
    sep="\n",
)


Random Forest
RMSE: 3446.07
R2: 0.98
Random Forest
RMSE: 3488.77


The tuned model scores worse than the untuned one (validation RMSE 3446.07 vs. 2847.38), so fine-tuning did not help here. We will use the Random Forest with its default hyperparameters.

In [32]:
# Final model: a forest with no tuned hyperparameters (fixed seed, all
# cores), fitted on the training split. The fitted estimator is the cell output.
RF = RandomForestRegressor(random_state=57, n_jobs=-1)
RF.fit(X_train, y_train)

### Saving model, DictVectorizer and StandardScaler

In [33]:
import pickle

In [34]:

# Persist the fitted model together with the fitted DictVectorizer and
# StandardScaler, so the same preprocessing can be replayed at inference.
models_dir = Path("../data/models")

# open(..., "wb") does not create missing parent directories, so make sure
# the target folder exists before writing (no-op when it already does).
models_dir.mkdir(parents=True, exist_ok=True)

for filename, artifact in (("RF.pkl", RF), ("dv.pkl", dv), ("scaler.pkl", scaler)):
    with open(models_dir / filename, "wb") as f:
        pickle.dump(artifact, f)