# Data Modelling

In [1]:
import pandas as pd
import numpy as np
import datetime
import os

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, LogisticRegression, LogisticRegressionCV, Lasso, LassoCV, Ridge
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

### Get the Data

In [2]:
all_nyc_files = []
all_sao_files = []

def get_files(destination: str="nyc", file_names: list = all_nyc_files) -> list:
    """
    This function retrieves the names of all .csv files in a specified directory.

    Args:
    -----
    destination (str): The destination city for which to retrieve the file names. 
    (Default sets to "nyc")
    
    file_names (list): The list to populate with the file names. 
    (Default sets to all_nyc_files)
    
    Return:
    -------
    list: A list of file names for all .csv files in the specified directory.
    """

    file_path = f'..\\webscraping\\bxl_to_{destination}'

    for file_name in os.listdir(file_path):
        # Split the file name into a base name and an extension
        base_name, extension = os.path.splitext(file_name)
        
        # Check if the file has a .csv extension
        if extension == '.csv':
            # Append the file name to the list
            file_names.append(file_name)
        
    return(file_names)
 
all_nyc_files = get_files()
all_sao_files = get_files(destination="sao", file_names=all_sao_files)

In [3]:
def read_csv_files(destination: str = "nyc", file_names: list[str] = all_nyc_files, result = None):
    """
    This function concatenates multiple CSV files into a single DataFrame. 
    If the result is None, it creates a list of DataFrames from the CSV files and concatenates them. 
    If the result is not None, it appends the DataFrames from the CSV files to the result.

    Args:
    -----
    result : The DataFrame to append the data to. If None, a new DataFrame is created.
 
    dates (list): The list of dates to use for the filenames of the CSV files.
    
    destination (str): The destination to use for the filenames of the CSV files.
 
    Return:
    -------
    DataFrame: The concatenated DataFrame.
    """
    if result is None:
        dfs = []
        for name in file_names:
            file_path = f"..\\webscraping\\bxl_to_{destination}\\{name}"
            df = pd.read_csv(file_path)
            dfs.append(df)
        result = pd.concat(dfs, axis=0, ignore_index=True)
    else:
        for name in file_names:
            filename = f"..\\webscraping\\bxl_to_{destination}\\booking_{name}.csv"
            df = pd.read_csv(filename)
            result = pd.concat([result, df], axis=0, ignore_index=True)
    return result


In [4]:
def convert_time(time):
    hour = int(time.split(':')[0])
    if 0 <= hour < 12:
        return 0
    else:
        return 1

In [5]:
def transfomed_df(df):
    """
    This function transforms a DataFrame by splitting and renaming columns, 
    converting data types, and applying functions to columns.

    Args:
    -----
    df : The DataFrame to transform.
 
    Return:
    -------
    DataFrame: The transformed DataFrame.
    """

    air_cols = [col for col in df.columns if col.endswith('_airline_company')]
    df[air_cols] = df[air_cols].astype(str)
    df[air_cols] = df[air_cols].apply(lambda x: x.str.split(",").str[0])
    
    df['out_stop_num'] = df['out_stop_num'].str.split(' ').str[0]
    df['in_stop_num'] = df['in_stop_num'].str.split(' ').str[0]
    df['out_stop_num'] = df['out_stop_num'].astype(float)
    df['in_stop_num'] = df['in_stop_num'].astype(float)

    df["tot_stop"] = df['out_stop_num'] + df['in_stop_num']


    split_df = df.pop('price_ticket').str.rsplit(' ', n=1, expand=True).rename(columns={0: 'ticket_price', 1: 'currency'})
    df = df.join(split_df)
    df['ticket_price'] = df['ticket_price'].str.replace(',', '.').str.replace(' ', '').astype(float)

    date_cols = [col for col in df.columns if col.endswith('_date')]
    year= '2023'
    for col in date_cols:
        df[col] = pd.to_datetime(df[col] + ' ' + year, format='%b %d %Y')
    
    time_cols = [col for col in df.columns if col.endswith('_time')]
    df[time_cols] = df[time_cols].apply(lambda x: pd.to_datetime(x, format='%I:%M %p').dt.strftime('%H:%M'))

    duration_cols = [col for col in df.columns if col.endswith('_duration')]
    df[duration_cols] = df[duration_cols].applymap(lambda x: pd.to_timedelta(x.replace('h', ' hours ').replace('m', ' min')))

    df['tot_duration'] = df["out_duration"] + df["in_duration"]
    df['tot_duration_seconds'] = df['tot_duration'].dt.total_seconds()
    
    
    df["airline_company"] = df["out_airline_company"]
    df['airline_company_dummy'] = (df['airline_company'] == 'Swiss').astype(int)

    df['destination_dummy'] = (df['arr_city'] == 'JFK').astype(int)

    for col in ['out_dep_time', 'out_arr_time', 'in_dep_time', 'in_arr_time']:
        df[col + '_dummy'] = df[col].apply(convert_time)

    year = 2023
    month = "May"
    
    # Construct a date string for each row
    df['date'] = df['day_scrap'].apply(lambda x: f'{year}-{month}-{x:02d}')

    # Convert the date column to a datetime object
    df['date'] = pd.to_datetime(df['date'])

    # Extract the day of the week
    df['day_of_week'] = df['date'].dt.day_name()
    
    return df

In [6]:
# Get the dataframe for nyc
df1 = read_csv_files()
df1 = transfomed_df(df1)
df1 = df1.dropna()
df1 = df1.reset_index(drop=True)

# Get the dataframe for sao
df2 = read_csv_files(destination="sao", file_names=all_sao_files)
df2 = transfomed_df(df2)
df2 = df2.dropna()
df2 = df2.reset_index(drop=True)

# merge dataframe
df = pd.concat([df1, df2])

In [7]:
def get_data(df, var_X, var_y):
    """
    Extracts the independent and dependent variables from a given DataFrame.

    Parameters:
    -----------
    df: A DataFrame containing the data.
    var_X: A list of column names representing the independent variables.
    var_y: The column name representing the dependent variable.
    
    Return:
    ------- 
    A tuple containing a DataFrame of the independent variables and a Series of the dependent variable.
    """
    X = df[var_X]
    y = df[var_y]
    return X, y

In [8]:
# create scatter plots of ticket_price against each independent variable
def scatter_plot(df = df1):
    fig, axs = plt.subplots(2, 3, figsize=(15, 10))
    axs[0, 0].scatter(df['tot_duration'].dt.total_seconds(), df['ticket_price'])
    axs[0, 0].set_xlabel('tot_duration')
    axs[0, 0].set_ylabel('ticket_price')
    axs[0, 1].scatter(df['hour_scrap'], df['ticket_price'])
    axs[0, 1].set_xlabel('hour_scrap')
    axs[0, 1].set_ylabel('ticket_price')
    axs[0, 2].scatter(df['day_scrap'], df['ticket_price'])
    axs[0, 2].set_xlabel('day_scrap')
    axs[0, 2].set_ylabel('ticket_price')
    axs[1, 0].scatter(df['tot_stop'], df['ticket_price'])
    axs[1, 0].set_xlabel('tot_stop')
    axs[1, 0].set_ylabel('ticket_price')
    axs[1, 1].scatter(df['airline_company_dummy'], df['ticket_price'])
    axs[1, 1].set_xlabel('airline_company_dummy')
    axs[1, 1].set_ylabel('ticket_price')

    plt.show()

## Logistic Regression

In [9]:
var_X = ['tot_duration_seconds', 'hour_scrap', 'day_scrap','ticket_price','destination_dummy', 'out_dep_time_dummy', 'in_dep_time_dummy', 'out_dep_time_dummy', 'in_arr_time_dummy']
var_y = ['airline_company_dummy']

X, y = get_data(df, var_X, var_y)

In [10]:
# split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create and fit the model using the training data
model = LogisticRegression()
model.fit(X_train, y_train)

# make predictions on the test data
y_pred = model.predict(X_test)

# calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.64


  y = column_or_1d(y, warn=True)


In [11]:
# create and fit the model using the training data
model = LogisticRegressionCV()
model.fit(X_train, y_train)

# print the best C value found by cross-validation
print(f'Best C: {model.C_[0]:.2f}')

# make predictions on the test data
y_pred = model.predict(X_test)

# calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')

  y = column_or_1d(y, warn=True)


Best C: 10000.00
Accuracy: 0.63


## Linear Regression

### 1. Test

In [12]:
var_X = ['tot_duration_seconds', 'hour_scrap', 'day_scrap','airline_company_dummy', 'destination_dummy','out_dep_time_dummy', 'out_arr_time_dummy', 'in_dep_time_dummy', 'in_arr_time_dummy']
var_y = ['ticket_price']

X_1, y_1 = get_data(df, var_X, var_y)
X_2, y_2 = get_data(df1, var_X, var_y)
X_3, y_3 = get_data(df2, var_X, var_y)

In [13]:
def lin_reg(X=X_1, y=y_1, model=LinearRegression()):
    """
    This function fits a linear regression model to the data and returns a DataFrame with the evaluation metrics.

    Parameters:
    -----------
    X (DataFrame): The independent variables.
                    (Default : X_1)
    y (DataFrame): The dependent variable.
                    (Deafukt : y)
    model (estimator): The model to fit. Default is LinearRegression.

    Returns:
    --------
    DataFrame: A DataFrame with the evaluation metrics.
    """
    # split the data into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model1 = model
    model1.fit(X_train, y_train)

    # Calculate the training error
    y_train_pred = model1.predict(X_train)
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_rmse = np.sqrt(train_mse)

    # Calculate the test error
    y_test_pred = model1.predict(X_test)
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_rmse = np.sqrt(test_mse)

    # Perform 10-fold cross-validation
    scores = cross_val_score(model1, X_train, y_train, scoring='neg_mean_squared_error', cv=10)
    rmse_scores = np.sqrt(-scores)

    # Calculate the mean cross-validation RMSE
    cv_rmse = rmse_scores.mean()

    # Make predictions on the test data
    y_test_pred = model1.predict(X_test)

    # Calculate the evaluation metrics
    mae = mean_absolute_error(y_test, y_test_pred)
    mse = mean_squared_error(y_test, y_test_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_test_pred)

    # Create a dictionary with the results
    results = {'Intercept': [model1.intercept_],
               'Training RMSE': [train_rmse],
               'Test RMSE': [test_rmse],
               'Cross-validation RMSE': [cv_rmse],
               'Mean Absolute Error': [mae],
               'Mean Squared Error': [mse],
               'Root Mean Squared Error': [rmse],
               'R-squared': [r2]}

    # Convert the dictionary into a pandas DataFrame
    results_df = pd.DataFrame(results)
    results_df = results_df.transpose()
    results_df = results_df.rename(columns={0: 'Values'})
    pd.set_option('display.max_colwidth', None)

    return results_df

In [14]:
def ols(X=X_1, y=y_1, model=sm):
    """
    This function fits a linear regression model to the data and returns a DataFrame with the coefficients and p-values of the fitted model.

    Parameters:
    -----------
    X (DataFrame): The independent variables.
                    (Default : X_1)
    y (DataFrame): The dependent variable.
                    (Default : y_1)
    model (module): The statsmodels module to use for fitting the model. Default is statsmodels.api.

    Returns:
    --------
    DataFrame: A DataFrame with the coefficients and p-values of the fitted model.
    """
    # split the data into a training set and a test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Add a constant term to the independent variables to include an intercept in the model
    X_train = model.add_constant(X_train)

    # Fit the OLS model using statsmodels to get the p-values
    ols_model = model.OLS(y_train, X_train)
    ols_results = ols_model.fit()
    p_values = ols_results.pvalues
    coefficients = ols_results.params

    # Create a DataFrame with the coefficients and p-values
    results_df = pd.DataFrame({'Coefficients': coefficients, 'p-values': p_values})

    return results_df

#### 1.1 Model built on the whole DataFrame for both destination (Sao Paulo and New York)

In [15]:
lin_reg()

Unnamed: 0,Values
Intercept,[2097.490971206482]
Training RMSE,522.364425
Test RMSE,524.069444
Cross-validation RMSE,516.31323
Mean Absolute Error,171.879278
Mean Squared Error,274648.782087
Root Mean Squared Error,524.069444
R-squared,0.552468


The lin_reg function was used to fit a linear regression model to the data in X_1 and y_1, and a DataFrame containing the evaluation metrics for the model was obtained. The evaluation metrics show how well the model fits the data and how well it generalizes to new data.

The fitted linear regression model has an intercept of 2097.49. The root mean squared error (RMSE) on the training data is 522.36, indicating that the model fits the training data reasonably well. The RMSE on the test data is 524.07, which is similar to the training RMSE, indicating that the model generalizes well to new data. The mean RMSE obtained by performing 10-fold cross-validation on the training data is 516.31, providing further evidence that the model generalizes well to new data.

The mean absolute error (MAE) on the test data is 171.88, indicating that on average, the predicted ticket prices are off by about $171.88. The mean squared error (MSE) on the test data is 274648.78, and its square root (the RMSE) is 524.07. The coefficient of determination (R-squared) on the test data is 0.55, indicating that the model explains about 55% of the variance in the ticket price.

Previous Result: 

Based on these results, it appears that the model is performing well on the training data, with a training RMSE of 163.96 and an R-squared value of 0.93. This indicates that it is able to explain 93% of the variance in the target variable on the training data.

The test RMSE is lower than the training RMSE, which suggests that the model is generalizing well to new data. However, the mean cross-validation RMSE is higher than both the training and test RMSE. This may indicate that there is some variation in the model’s performance between different folds of the cross-validation.

The coefficients of the model show the relationship between each feature and the target variable. For example, the coefficient for tot_duration_seconds is -1.83e-03, which indicates that an increase in tot_duration_seconds is associated with a small decrease in the target variable.

The intercept of the model is 1824.78, which represents the expected value of the target variable when all the features are equal to zero.

In [16]:
ols()

Unnamed: 0,Coefficients,p-values
tot_duration_seconds,-0.001168,5.714598e-08
hour_scrap,-2.310806,0.05003298
day_scrap,16.999159,2.375499e-24
airline_company_dummy,-287.397109,2.0580490000000003e-128
destination_dummy,-602.639659,0.0
out_dep_time_dummy,-159.070651,1.911618e-33
out_arr_time_dummy,-602.639659,0.0
in_dep_time_dummy,2097.490971,1.613956e-219
in_arr_time_dummy,77.927201,2.13422e-09


Coefficeints of the independent variables and their p-values:

- `tot_duration_seconds`: The coefficient for this variable is -0.001168 and its p-value is very small (5.71e-08), indicating that there is a statistically significant negative relationship between `tot_duration_seconds` and `ticket_price`. This means that as `tot_duration_seconds` increases, `ticket_price` tends to decrease.

- `hour_scrap`: The coefficient for this variable is -2.31 and its p-value is 0.050, indicating that there may be a negative relationship between `hour_scrap` and `ticket_price`, but the evidence for this relationship is not very strong.

- `day_scrap`: The coefficient for this variable is 17.00 and its p-value is very small (2.38e-24), indicating that there is a statistically significant positive relationship between `day_scrap` and `ticket_price`. This means that as `day_scrap` increases (i.e., as the day of the week changes from Monday to Sunday), `ticket_price` tends to increase.

- `airline_company_dummy`: The coefficient for this variable is -287.40 and its p-value is very small (2.06e-128), indicating that there is a statistically significant difference in ticket prices between the two airline companies represented by this variable.

- `destination_dummy`: The coefficient for this variable is -602.64 and its p-value is very small (0.000), indicating that there is a statistically significant difference in ticket prices between the two destinations represented by this variable.

- `out_dep_time_dummy`: The coefficient for this variable is -159.07 and its p-value is very small (1.91e-33), indicating that there is a statistically significant difference in ticket prices between the two departure times from the origin represented by this variable.

- `out_arr_time_dummy`: The coefficient for this variable is -602.64 and its p-value is very small (0.000), indicating that there is a statistically significant difference in ticket prices between the two arrival times at the destination represented by this variable.

- `in_dep_time_dummy`: The coefficient for this variable is 2097.49 and its p-value is very small (1.61e-219), indicating that there is a statistically significant difference in ticket prices between the two departure times from the destination represented by this variable.

- `in_arr_time_dummy`: The coefficient for this variable is 77.93 and its p-value is very small (2.13e-09), indicating that there is a statistically significant difference in ticket prices between the two arrival times at the origin represented by this variable.

#### 1.2 Model built on df1, the DataFrame for the destination New York

In [17]:
lin_reg(X = X_2, y=y_2)

Unnamed: 0,Values
Intercept,[1289.328050695435]
Training RMSE,361.315981
Test RMSE,283.967999
Cross-validation RMSE,343.231675
Mean Absolute Error,95.084857
Mean Squared Error,80637.824737
Root Mean Squared Error,283.967999
R-squared,0.115666


The fitted linear regression model has an intercept of 1289.33. The root mean squared error (RMSE) on the training data is 361.32, indicating that the model fits the training data reasonably well. The RMSE on the test data is 283.97, which is lower than the training RMSE, indicating that the model generalizes well to new data. The mean RMSE obtained by performing 10-fold cross-validation on the training data is 343.23, providing further evidence that the model generalizes well to new data.

The mean absolute error (MAE) on the test data is 95.08, indicating that on average, the predicted values are off by about 95.08 units. The mean squared error (MSE) on the test data is 80637.82, and its square root (the RMSE) is 283.97. The coefficient of determination (R-squared) on the test data is 0.12, indicating that the model explains about 12% of the variance in the dependent variable.

Previous result:

Based on these results, it appears that the model is performing well on the training data, with a training RMSE of 123.12 and an R-squared value of 0.20. This indicates that the model is able to explain 20% of the variance in the target variable on the training data.

However, the test RMSE is much higher than the training RMSE and the mean cross-validation RMSE. This may indicate that the model is not performing as well on the test data as it is on the training data.

The coefficients of the model show the relationship between each feature and the target variable. For example, the coefficient for tot_duration_seconds is -1.99e-03, which indicates that an increase in tot_duration_seconds is associated with a small decrease in the target variable.

The intercept of the model is 1370.54, which represents the expected value of the target variable when all the features are equal to zero.

In [18]:
ols(X=X_2, y=y_2)

Unnamed: 0,Coefficients,p-values
tot_duration_seconds,-0.002042,2.41468e-14
hour_scrap,0.3556,0.758478
day_scrap,2.539276,0.1335158
airline_company_dummy,-158.298614,3.740742e-37
destination_dummy,429.776017,2.18121e-105
out_dep_time_dummy,-35.738531,0.02109392
out_arr_time_dummy,429.776017,2.18121e-105
in_dep_time_dummy,429.776017,2.18121e-105
in_arr_time_dummy,-22.951324,0.05043646


Coefficeints of the independent variables and their p-values:

- `tot_duration_seconds`: The coefficient for this variable is 0.000935 and its p-value is 0.198, indicating that there is no statistically significant relationship between `tot_duration_seconds` and `ticket_price`.

- `hour_scrap`: The coefficient for this variable is -6.26 and its p-value is 0.002, indicating that there is a statistically significant negative relationship between `hour_scrap` and `ticket_price`. This means that as `hour_scrap` increases, `ticket_price` tends to decrease.

- `day_scrap`: The coefficient for this variable is 32.06 and its p-value is very small (5.42e-29), indicating that there is a statistically significant positive relationship between `day_scrap` and `ticket_price`. This means that as `day_scrap` increases (i.e., as the day of the week changes from Monday to Sunday), `ticket_price` tends to increase.

- `airline_company_dummy`: The coefficient for this variable is -484.31 and its p-value is very small (1.72e-119), indicating that there is a statistically significant difference in ticket prices between the two airline companies represented by this variable.

- `destination_dummy`: The coefficient for this variable is -665.45 and its p-value is very small (1.26e-51), indicating that there is a statistically significant difference in ticket prices between the two destinations represented by this variable.

- `out_dep_time_dummy`: The coefficient for this variable is -272.08 and its p-value is very small (3.93e-21), indicating that there is a statistically significant difference in ticket prices between the two departure times from the origin represented by this variable.

- `out_arr_time_dummy`: The coefficient for this variable is -665.45 and its p-value is very small (1.26e-51), indicating that there is a statistically significant difference in ticket prices between the two arrival times at the destination represented by this variable.

- `in_dep_time_dummy`: The coefficient for this variable is 1451.32 and its p-value is very small (2.51e-18), indicating that there is a statistically significant difference in ticket prices between the two departure times from the destination represented by this variable.

- `in_arr_time_dummy`: The coefficient for this variable is 328.40 and its p-value is very small (1.73e-13), indicating that there is a statistically significant difference in ticket prices between the two arrival times at the origin represented by this variable.

#### 1.3 Model built on the df2, the DataFrame for the destination Sao Paulo

In [19]:
lin_reg(X=X_3, y=y_3)

Unnamed: 0,Values
Intercept,[1451.3234684428805]
Training RMSE,622.389593
Test RMSE,663.860724
Cross-validation RMSE,620.788234
Mean Absolute Error,248.041631
Mean Squared Error,440711.060907
Root Mean Squared Error,663.860724
R-squared,0.200667


The fitted linear regression model has an intercept of 1451.32. The root mean squared error (RMSE) on the training data is 622.39, indicating that the model fits the training data reasonably well. The RMSE on the test data is 663.86, which is higher than the training RMSE, indicating that the model may not generalize as well to new data. The mean RMSE obtained by performing 10-fold cross-validation on the training data is 620.79, providing further evidence that the model may not generalize as well to new data.

The mean absolute error (MAE) on the test data is 248.04, indicating that on average, the predicted values are off by about 248.04 units. The mean squared error (MSE) on the test data is 440711.06, and its square root (the RMSE) is 663.86. The coefficient of determination (R-squared) on the test data is 0.20, indicating that the model explains about 20% of the variance in the dependent variable.

Previous result:

Based on the results, it appears that the model is performing well on both the training and test data. The training and test RMSE are similar, and the R-squared value is 0.56, indicating that the model is able to explain 56% of the variance in the target variable.

The mean cross-validation RMSE is also similar to the training and test RMSE, which suggests that the model is generalizing well to new data.

The intercept of the model is 871.25, which represents the expected value of the target variable when all the features are equal to zero.

In [20]:
ols(X=X_3, y=y_3)

Unnamed: 0,Coefficients,p-values
tot_duration_seconds,0.000935,0.1984531
hour_scrap,-6.262701,0.001679519
day_scrap,32.058964,5.421275e-29
airline_company_dummy,-484.31389,1.724131e-119
destination_dummy,-665.449501,1.2584729999999999e-51
out_dep_time_dummy,-272.078254,3.926567e-21
out_arr_time_dummy,-665.449501,1.2584729999999999e-51
in_dep_time_dummy,1451.323468,2.512953e-18
in_arr_time_dummy,328.402755,1.732375e-13


Coefficeints of the independent variables and their p-values:

- `tot_duration_seconds`: The coefficient for this variable is 0.000935 and its p-value is 0.198, indicating that there is no statistically significant relationship between `tot_duration_seconds` and `ticket_price`.

- `hour_scrap`: The coefficient for this variable is -6.26 and its p-value is 0.002, indicating that there is a statistically significant negative relationship between `hour_scrap` and `ticket_price`. This means that as `hour_scrap` increases, `ticket_price` tends to decrease.

- `day_scrap`: The coefficient for this variable is 32.06 and its p-value is very small (5.42e-29), indicating that there is a statistically significant positive relationship between `day_scrap` and `ticket_price`. This means that as `day_scrap` increases (i.e., as the day of the week changes from Monday to Sunday), `ticket_price` tends to increase.

- `airline_company_dummy`: The coefficient for this variable is -484.31 and its p-value is very small (1.72e-119), indicating that there is a statistically significant difference in ticket prices between the two airline companies represented by this variable.

- `destination_dummy`: The coefficient for this variable is -665.45 and its p-value is very small (1.26e-51), indicating that there is a statistically significant difference in ticket prices between the two destinations represented by this variable.

- `out_dep_time_dummy`: The coefficient for this variable is -272.08 and its p-value is very small (3.93e-21), indicating that there is a statistically significant difference in ticket prices between the two departure times from the origin represented by this variable.

- `out_arr_time_dummy`: The coefficient for this variable is -665.45 and its p-value is very small (1.26e-51), indicating that there is a statistically significant difference in ticket prices between the two arrival times at the destination represented by this variable.

- `in_dep_time_dummy`: The coefficient for this variable is 1451.32 and its p-value is very small (2.51e-18), indicating that there is a statistically significant difference in ticket prices between the two departure times from the destination represented by this variable.

- `in_arr_time_dummy`: The coefficient for this variable is 328.40 and its p-value is very small (1.73e-13), indicating that there is a statistically significant difference in ticket prices between the two arrival times at the origin represented by this variable.

### 1.4 Conclusion

Three linear regression models were fitted to different sets of data and their performance was evaluated using several metrics. The first model, with an intercept of 2097.49, had the lowest root mean squared error (RMSE) on both the training data (522.36) and the test data (524.07), indicating good fit and generalization to new data. The mean RMSE obtained by performing 10-fold cross-validation on the training data was also the lowest for this model (516.31), providing further evidence of good generalization.

The second model, with an intercept of 1289.33, had a lower RMSE on the test data (283.97) compared to the third model (663.86), indicating better generalization to new data. However, its RMSE on the training data (361.32) was higher than that of the first model (522.36), suggesting a less optimal fit to the data.

In conclusion, based on the evaluation metrics provided, the first linear regression model appears to have the best performance and would be the recommended choice for use.

## 2. The clean version

Modification of the previous model by removing 2 independent variables (out_arr_time_dummy and in_dep_time_dummy ). The same interpretations can be said here.

In [21]:
new_var_X = ['tot_duration_seconds', 'hour_scrap', 'day_scrap','airline_company_dummy', 'destination_dummy','out_dep_time_dummy', 'in_arr_time_dummy']

X_4, y_4 = get_data(df, new_var_X, var_y)
X_5, y_5 = get_data(df1, new_var_X, var_y)
X_6, y_6 = get_data(df2, new_var_X, var_y)

#### 2.1 Model built on the whole DataFrame

In [22]:
lin_reg(X=X_4, y=y_4)

Unnamed: 0,Values
Intercept,[2097.4909712064773]
Training RMSE,522.364425
Test RMSE,524.069444
Cross-validation RMSE,516.31323
Mean Absolute Error,171.879278
Mean Squared Error,274648.782087
Root Mean Squared Error,524.069444
R-squared,0.552468


The results of a multiple regression model are presented. The model includes an intercept term with a value of 2097.4909712064773. The model’s performance was evaluated using several metrics. The root mean squared error (RMSE) on the training data was 522.364425, while the RMSE on the test data was 524.069444. The RMSE obtained through cross-validation was 516.31323. The mean absolute error (MAE) was 171.879278, the mean squared error (MSE) was 274648.782087, and the R-squared value was 0.552468.

These results indicate that the model is able to explain approximately 55% of the variance in the dependent variable. The RMSE values suggest that the model’s predictions are, on average, approximately 522-524 units away from the true values.

In [23]:
ols(X=X_4, y=y_4)

Unnamed: 0,Coefficients,p-values
const,2097.490971,1.613956e-219
tot_duration_seconds,-0.001168,5.714598e-08
hour_scrap,-2.310806,0.05003298
day_scrap,16.999159,2.375499e-24
airline_company_dummy,-287.397109,2.0580490000000003e-128
destination_dummy,-1205.279318,0.0
out_dep_time_dummy,-159.070651,1.911618e-33
in_arr_time_dummy,77.927201,2.13422e-09


The intercept term (const) has a coefficient of 2097.490971 and a p-value of 1.613956e-219. The tot_duration_seconds variable has a coefficient of -0.001168 and a p-value of 5.714598e-08. The hour_scrap variable has a coefficient of -2.310806 and a p-value of 5.003298e-02. The day_scrap variable has a coefficient of 16.999159 and a p-value of 2.375499e-24. The airline_company_dummy variable has a coefficient of -287.397109 and a p-value of 2.058049e-128. The destination_dummy variable has a coefficient of -1205.279318 and a p-value of 0.000000e+00. The out_dep_time_dummy variable has a coefficient of -159.070651 and a p-value of 1.911618e-33. The in_arr_time_dummy variable has a coefficient of 77.927201 and a p-value of 2.134220e-09.

The coefficients represent the change in the dependent variable associated with a one-unit change in the corresponding independent variable, holding all other independent variables constant. The p-values represent the probability of observing the corresponding coefficients by chance if the true population value was zero (i.e., if there was no relationship between the independent variable and the dependent variable). Small p-values (typically less than 0.05) indicate that the corresponding independent variables are statistically significant predictors in the model. 

#### 2.2 Model built on the df1, DataFrame for the destination New York

In [24]:
lin_reg(X=X_5, y=y_5)

Unnamed: 0,Values
Intercept,[1289.3280506954334]
Training RMSE,361.315981
Test RMSE,283.967999
Cross-validation RMSE,343.231675
Mean Absolute Error,95.084857
Mean Squared Error,80637.824737
Root Mean Squared Error,283.967999
R-squared,0.115666


The results of a multiple regression model are presented. The model includes an intercept term with a value of 1289.3280506954334. The model’s performance was evaluated using several metrics. The root mean squared error (RMSE) on the training data was 361.315981, while the RMSE on the test data was 283.967999. The RMSE obtained through cross-validation was 343.231675. The mean absolute error (MAE) was 95.084857, the mean squared error (MSE) was 80637.824737, and the R-squared value was 0.115666.

These results indicate that the model is able to explain approximately 11% of the variance in the dependent variable. The RMSE values suggest that the model’s predictions are, on average, approximately 283-361 units away from the true values.

In [25]:
ols(X=X_5, y=y_5)

Unnamed: 0,Coefficients,p-values
tot_duration_seconds,-0.002042,2.41468e-14
hour_scrap,0.3556,0.758478
day_scrap,2.539276,0.1335158
airline_company_dummy,-158.298614,3.740742e-37
destination_dummy,1289.328051,2.18121e-105
out_dep_time_dummy,-35.738531,0.02109392
in_arr_time_dummy,-22.951324,0.05043646


The tot_duration_seconds variable has a coefficient of -0.002042 and a p-value of 2.414680e-14. The hour_scrap variable has a coefficient of 0.355600 and a p-value of 7.584780e-01. The day_scrap variable has a coefficient of 2.539276 and a p-value of 1.335158e-01. The airline_company_dummy variable has a coefficient of -158.298614 and a p-value of 3.740742e-37. The destination_dummy variable has a coefficient of 1289.328051 and a p-value of 2.181210e-105. The out_dep_time_dummy variable has a coefficient of -35.738531 and a p-value of 2.109392e-02. The in_arr_time_dummy variable has a coefficient of -22.951324 and a p-value of 5.043646e-02.

#### 2.3 Model built on the df2, DataFrame for the destination Sao Paulo

In [26]:
lin_reg(X=X_6, y=y_6)

Unnamed: 0,Values
Intercept,[1451.3234684428835]
Training RMSE,622.389593
Test RMSE,663.860724
Cross-validation RMSE,620.788234
Mean Absolute Error,248.041631
Mean Squared Error,440711.060907
Root Mean Squared Error,663.860724
R-squared,0.200667


The results of a multiple regression model are presented. The model includes an intercept term with a value of 1451.3234684428835. The model’s performance was evaluated using several metrics. The root mean squared error (RMSE) on the training data was 622.389593, while the RMSE on the test data was 663.860724. The RMSE obtained through cross-validation was 620.788234. The mean absolute error (MAE) was 248.041631, the mean squared error (MSE) was 440711.060907, and the R-squared value was 0.200667.

These results indicate that the model is able to explain approximately 20% of the variance in the dependent variable. The RMSE values suggest that the model’s predictions are, on average, approximately 622-663 units away from the true values.

In [27]:
ols(X=X_6, y=y_6)

Unnamed: 0,Coefficients,p-values
const,1451.323468,2.512953e-18
tot_duration_seconds,0.000935,0.1984531
hour_scrap,-6.262701,0.001679519
day_scrap,32.058964,5.421275e-29
airline_company_dummy,-484.31389,1.724131e-119
destination_dummy,-1330.899002,1.2584729999999999e-51
out_dep_time_dummy,-272.078254,3.926567e-21
in_arr_time_dummy,328.402755,1.732375e-13


The intercept term (const) has a coefficient of 1451.323468 and a p-value of 2.512953e-18. The tot_duration_seconds variable has a coefficient of 0.000935 and a p-value of 1.984531e-01. The hour_scrap variable has a coefficient of -6.262701 and a p-value of 1.679519e-03. The day_scrap variable has a coefficient of 32.058964 and a p-value of 5.421275e-29. The airline_company_dummy variable has a coefficient of -484.313890 and a p-value of 1.724131e-119. The destination_dummy variable has a coefficient of -1330.899002 and a p-value of 1.258473e-51. The out_dep_time_dummy variable has a coefficient of -272.078254 and a p-value of 3.926567e-21. The in_arr_time_dummy variable has a coefficient of 328.402755 and a p-value of 1.732375e-13.

#### 2.4. Observation regarding the collinearity

Compare the collinearity between the first model using X_1 and y_1 and the fourth one using X_4 and y_4.

##### 2.4.1 Observation for X_1 and y_1

In [28]:
# Fit a linear regression model
X_1 = sm.add_constant(X_1)
model = sm.OLS(y_1, X_1).fit()

# Check the p-values of the coefficients
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           ticket_price   R-squared:                       0.546
Model:                            OLS   Adj. R-squared:                  0.546
Method:                 Least Squares   F-statistic:                     1995.
Date:                Mon, 05 Jun 2023   Prob (F-statistic):               0.00
Time:                        07:29:47   Log-Likelihood:                -89209.
No. Observations:               11619   AIC:                         1.784e+05
Df Residuals:                   11611   BIC:                         1.785e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
tot_duration_seconds     -0.00

In [29]:
# Calculate the VIF for each independent variable
vif_data = pd.DataFrame()
vif_data["feature"] = X_1.columns
vif_data["VIF"] = [variance_inflation_factor(X_1.values, i) for i in range(len(X_1.columns))]

# Check the VIF values
print(vif_data)

                 feature         VIF
0   tot_duration_seconds    2.393894
1             hour_scrap    1.008921
2              day_scrap    1.054136
3  airline_company_dummy    1.175422
4      destination_dummy         inf
5     out_dep_time_dummy    1.460508
6     out_arr_time_dummy         inf
7      in_dep_time_dummy  142.799773
8      in_arr_time_dummy    1.249996


  vif = 1. / (1. - r_squared_i)


The results of an ordinary least squares (OLS) regression model are presented. The model includes several independent variables and their corresponding coefficients, standard errors, t-values, p-values, and confidence intervals. The dependent variable is ticket_price. The model has an R-squared value of 0.546 and an adjusted R-squared value of 0.546, indicating that the model is able to explain approximately 54% of the variance in the dependent variable.

The tot_duration_seconds variable has a coefficient of -0.0011 and a p-value of 0.000, indicating that it is a statistically significant predictor in the model. The hour_scrap variable has a coefficient of -2.8844 and a p-value of 0.006, indicating that it is also a statistically significant predictor in the model. The day_scrap variable has a coefficient of 16.5914 and a p-value of 0.000, indicating that it is a statistically significant predictor in the model. The airline_company_dummy variable has a coefficient of -291.0034 and a p-value of 0.000, indicating that it is a statistically significant predictor in the model. The destination_dummy variable has a coefficient of -603.8690 and a p-value of 0.000, indicating that it is a statistically significant predictor in the model. The out_dep_time_dummy variable has a coefficient of -159.8531 and a p-value of 0.000, indicating that it is a statistically significant predictor in the model. The out_arr_time_dummy variable has a coefficient of -603.8690 and a p-value of 0.000, indicating that it is a statistically significant predictor in the model. The in_dep_time_dummy variable has a coefficient of 2103.6217 and a p-value of 0.000, indicating that it is a statistically significant predictor in the model. The in_arr_time_dummy variable has a coefficient of 89.8229 and a p-value of 0.000, indicating that it is also a statistically significant predictor in the model.

The variance inflation factor (VIF) was calculated for each independent variable in the model to assess the degree of multicollinearity among the independent variables. Most of the VIF values were below 5, indicating that there was not a high degree of multicollinearity among these variables. However, the VIF values for destination_dummy and out_arr_time_dummy were infinite, suggesting that these variables were perfectly collinear with one or more of the other independent variables in the model.

##### 2.4.2 Observation for X_4 and y_4

In [30]:
# Fit a linear regression model
X_4 = sm.add_constant(X_4)
model = sm.OLS(y_4, X_4).fit()

# Check the p-values of the coefficients
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:           ticket_price   R-squared:                       0.546
Model:                            OLS   Adj. R-squared:                  0.546
Method:                 Least Squares   F-statistic:                     1995.
Date:                Mon, 05 Jun 2023   Prob (F-statistic):               0.00
Time:                        07:29:47   Log-Likelihood:                -89209.
No. Observations:               11619   AIC:                         1.784e+05
Df Residuals:                   11611   BIC:                         1.785e+05
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                  2103.62

In [31]:
# Calculate the VIF for each independent variable
vif_data = pd.DataFrame()
vif_data["feature"] = X_4.columns
vif_data["VIF"] = [variance_inflation_factor(X_4.values, i) for i in range(len(X_4.columns))]

# Check the VIF values
print(vif_data)

                 feature         VIF
0                  const  142.799773
1   tot_duration_seconds    2.393894
2             hour_scrap    1.008921
3              day_scrap    1.054136
4  airline_company_dummy    1.175422
5      destination_dummy    2.411166
6     out_dep_time_dummy    1.460508
7      in_arr_time_dummy    1.249996


The results of an ordinary least squares (OLS) regression model are presented. The dependent variable is ticket_price. The model has an R-squared value of 0.546 and an adjusted R-squared value of 0.546, indicating that the model is able to explain approximately 54% of the variance in the dependent variable.

The intercept term (const) has a coefficient of 2103.6217 and a p-value of 0.000, indicating that it is a statistically significant predictor in the model. The tot_duration_seconds variable has a coefficient of -0.0011 and a p-value of 0.000, indicating that it is also a statistically significant predictor in the model. The hour_scrap variable has a coefficient of -2.8844 and a p-value of 0.006, indicating that it is a statistically significant predictor in the model. The day_scrap variable has a coefficient of 16.5914 and a p-value of 0.000, indicating that it is a statistically significant predictor in the model. The airline_company_dummy variable has a coefficient of -291.0034 and a p-value of 0.000, indicating that it is a statistically significant predictor in the model. The destination_dummy variable has a coefficient of -1207.7380 and a p-value of 0.000, indicating that it is a statistically significant predictor in the model. The out_dep_time_dummy variable has a coefficient of -159.8531 and a p-value of 0.000, indicating that it is a statistically significant predictor in the model. The in_arr_time_dummy variable has a coefficient of 89.8229 and a p-value of 0.000, indicating that it is also a statistically significant predictor in the model.

All VIF values were below 5, indicating that there was not a high degree of multicollinearity among these variables.

### 2.4 Conclusion

A multiple regression model was developed to predict the dependent variable ticket_price using several independent variables. The model was modified by removing two independent variables (out_arr_time_dummy and in_dep_time_dummy) and it was found that the values of several performance metrics (such as RMSE, MAE, MSE, and R-squared) remained constant. This suggests that the removed variables were not significant predictors in the model and did not contribute much to the model’s ability to explain the variance in the dependent variable.

VIF was calculated for the independent variables in both models. In the first model, most of the VIF values were below 5, indicating that there was not a high degree of multicollinearity among these variables. However, the VIF values for destination_dummy and out_arr_time_dummy were infinite, suggesting that these variables were perfectly collinear with one or more of the other independent variables in the model. In the fourth model, after removing the out_arr_time_dummy and in_dep_time_dummy variables, all remaining VIF values were below 5. This suggests that there was not a high degree of multicollinearity among these variables.

# Different X and Y

### Test predict lowest price

In [32]:
df_test = df[['ticket_price','day_of_week', 'hour_scrap']]

# One-hot encode the day_of_week column
df_encoded = pd.get_dummies(df_test, columns=['day_of_week'])

# Select the independent and dependent variables
X = df_encoded.drop('ticket_price', axis=1)
y = df_encoded['ticket_price']

def predict_lowest_price(X= X, y=y, model = LinearRegression()):
    """
    Fits a given model to the data in `X` and `y`, and predicts the lowest ticket price for all
    combinations of day of week and hour_scrap.

    Parameters:
    -----------
    X: A DataFrame containing the independent variables.
    y: A Series containing the dependent variable (ticket_price).
    model: The model used to make the prediction.
    
    Return:
    ------- 
    A string indicating the day of week and hour_scrap with the lowest predicted ticket price.
    """
    
    # Fit the model to the data
    model.fit(X, y)

    # Get all unique values in the day_of_week column
    days = X['day_of_week'].unique()

    # Create an array of all possible combinations of days and hours
    hours = np.arange(8, 22, 2)
    X_new = pd.DataFrame(np.array(np.meshgrid(days, hours)).T.reshape(-1, 2), columns=['day_of_week', 'hour_scrap'])

    # Save a copy of X_new before one-hot encoding
    X_new_original = X_new.copy()

    # One-hot encode the day_of_week column in X_new
    X_new = pd.get_dummies(X_new, columns=['day_of_week'])

    # Make predictions for all combinations
    predictions_new = model.predict(X_new)

    # Find the index of the minimum predicted price
    min_index = np.argmin(predictions_new)

    # Get the corresponding day and hour from X_new_original
    day = X_new_original.iloc[min_index]['day_of_week']
    hour = X_new_original.iloc[min_index]['hour_scrap']

    return f'The lowest predicted ticket price is on {day} at hour {hour}.'

## Predict Lowest Price

Given the first model above (cfr. Linear Regerssion: 2. The clean version), in this section, we try to predict the lowest price.

### Dependent variable

- y: A Series containing the dependent variable (ticket_price). This variable represents the ticket price for a flight. It can take any non-negative numerical value.

### Independent variables
- tot_duration_seconds: The total duration of the flight in seconds. This variable can take any non-negative numerical value.

- hour_scrap: The hour at which the data was scraped. This variable can take integer values between 0 and 23, representing the hour of the day.

- airline_company_dummy: A dummy variable indicating the airline company. This variable can take binary values 0 or 1, representing two different airline companies.

- destination_dummy: A dummy variable indicating the destination. This variable can take binary values 0 or 1, representing two different destinations.

- out_dep_time_dummy:  A dummy variable indicating the departure time from the origin. This variable can take binary values 0 or 1, 
representing two different departure times.

- in_arr_time_dummy: A dummy variable indicating the arrival time at the destination. This variable can take binary values 0 or 1,  
representing two different arrival times.

- day_of_week_*: Dummy variables indicating the day of the week. Each of these variables can take binary values 0 or 1,
representing day of the week on which the data was scraped. 

### 1. Set up

In [33]:
new = df[['tot_duration_seconds','ticket_price', 'hour_scrap','day_of_week','airline_company_dummy', 'destination_dummy','out_dep_time_dummy', 'in_arr_time_dummy']]
# One-hot encode the day_of_week column
df_encoded = pd.get_dummies(new, columns=['day_of_week'])

# Select the independent and dependent variables
X = df_encoded.drop('ticket_price', axis=1)
y = df_encoded['ticket_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

### 2. Which will be the predicted lowest ticket price for each combination of destination, airline and day of the week ?

In [34]:
def predict_price_df(X, model):
    """
    This function takes in a DataFrame `X` containing the independent variables and a fitted model `model`,
    and returns a pivoted DataFrame containing the predicted ticket price for all combinations of destination,
    airline company, day of week, and hour_scrap.

    Parameters:
    -----------
    X: A DataFrame containing the independent variables
    model: A fitted model

    Return:
    ------- 
    A pivoted DataFrame containing the predicted ticket price for all combinations of destination, airline company, 
    day of week, and hour_scrap.
    """
    # Create a new data frame with all features set to their average or median values
    new_df = pd.DataFrame(columns=X.columns)
    for col in X.columns:
        if col.startswith('day_of_week') or col == 'destination_dummy' or col == 'airline_company_dummy':
            new_df[col] = [0]
        else:
            new_df[col] = [X[col].median()]

    # Create an empty DataFrame to store the results
    results = pd.DataFrame(columns=['Destination', 'Airline', 'Day', 'Hour', 'Predicted Price'])

    # Vary the day_of_week, destination_dummy, airline_company_dummy and hour_scrap columns
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    destinations = ['Sao Paulo', 'New York']
    airlines = ['Lufthansa', 'Swiss']
    hours = [8, 10, 12, 14, 16, 18, 20, 22]
    for dest in [0, 1]:
        new_df['destination_dummy'] = [dest]
        for airline in [0, 1]:
            new_df['airline_company_dummy'] = [airline]
            for day in days:
                new_df[f'day_of_week_{day}'] = [1]
                for hour in hours:
                    new_df['hour_scrap'] = [hour]
                    pred = model.predict(new_df)
                    results = results.append({'Destination': destinations[dest], 'Airline': airlines[airline], 'Day': day, 'Hour': hour, 'Predicted Price': pred[0]}, ignore_index=True)
                new_df[f'day_of_week_{day}'] = [0]

    # Pivot the resulting DataFrame
    results_pivoted = results.pivot_table(index=['Destination', 'Airline', 'Hour'], columns='Day', values='Predicted Price')[days]

    return results_pivoted

In [35]:
predict_price_df(X, model)

  results = results.append({'Destination': destinations[dest], 'Airline': airlines[airline], 'Day': day, 'Hour': hour, 'Predicted Price': pred[0]}, ignore_index=True)
  results = results.append({'Destination': destinations[dest], 'Airline': airlines[airline], 'Day': day, 'Hour': hour, 'Predicted Price': pred[0]}, ignore_index=True)
  results = results.append({'Destination': destinations[dest], 'Airline': airlines[airline], 'Day': day, 'Hour': hour, 'Predicted Price': pred[0]}, ignore_index=True)
  results = results.append({'Destination': destinations[dest], 'Airline': airlines[airline], 'Day': day, 'Hour': hour, 'Predicted Price': pred[0]}, ignore_index=True)
  results = results.append({'Destination': destinations[dest], 'Airline': airlines[airline], 'Day': day, 'Hour': hour, 'Predicted Price': pred[0]}, ignore_index=True)
  results = results.append({'Destination': destinations[dest], 'Airline': airlines[airline], 'Day': day, 'Hour': hour, 'Predicted Price': pred[0]}, ignore_index=True

Unnamed: 0_level_0,Unnamed: 1_level_0,Day,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
Destination,Airline,Hour,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
New York,Lufthansa,8,1237.323617,1269.5494,1385.506762,1227.894439,1220.521089,1208.478191,1205.679103
New York,Lufthansa,10,1231.426113,1263.651896,1379.609258,1221.996935,1214.623585,1202.580687,1199.781599
New York,Lufthansa,12,1225.528609,1257.754391,1373.711754,1216.099431,1208.726081,1196.683183,1193.884095
New York,Lufthansa,14,1219.631104,1251.856887,1367.81425,1210.201926,1202.828576,1190.785679,1187.98659
New York,Lufthansa,16,1213.7336,1245.959383,1361.916745,1204.304422,1196.931072,1184.888174,1182.089086
New York,Lufthansa,18,1207.836096,1240.061879,1356.019241,1198.406918,1191.033568,1178.99067,1176.191582
New York,Lufthansa,20,1201.938592,1234.164375,1350.121737,1192.509414,1185.136064,1173.093166,1170.294078
New York,Lufthansa,22,1196.041088,1228.26687,1344.224233,1186.61191,1179.23856,1167.195662,1164.396574
New York,Swiss,8,950.777771,983.003554,1098.960916,941.348593,933.975243,921.932345,919.133257
New York,Swiss,10,944.880267,977.10605,1093.063412,935.451089,928.077739,916.034841,913.235753


### 3. Which will be the prdicted lowest price for a given destination and a particular day of the week (as inputs) ? 

In [36]:
def predict_lowest_price(destination, day, model):
    """
    Predicts the lowest price for a given destination and day of the week using a given model.

    Parameters:
    -----------
    destination: The destination as an integer representing the destination_dummy column value.
    day: The day of the week as strings ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday').
    model: The model used to make the prediction.
    
    Return:
    ------- 
    A tuple containing the predicted lowest price and the airline company that offers it (0 or 1).
    """
    # Create a new data frame with all features set to their average or median values
    new_df = pd.DataFrame(columns=X.columns)
    for col in X.columns:
        if col.startswith('day_of_week') or col == 'destination_dummy' or col == 'airline_company_dummy':
            new_df[col] = [0]
        else:
            new_df[col] = [X[col].median()]

    # Set the destination_dummy and day_of_week columns to the given values
    new_df['destination_dummy'] = [destination]
    new_df[f'day_of_week_{day}'] = [1]

    # Vary the airline_company_dummy column
    min_price = float('inf')
    best_airline = None
    for airline in [0, 1]:
        new_df['airline_company_dummy'] = [airline]
        pred = model.predict(new_df)
        if pred[0] < min_price:
            min_price = pred[0]
            best_airline = airline

    return min_price, best_airline

In [37]:
#  destination representing the destination_dummy, i.e., 1 = New York (or JFK)
destination = 1    
day = 'Friday'
min_price, best_airline = predict_lowest_price(destination, day, model)
print(f'For destination {destination} and day {day}, the lowest predicted ticket price is {min_price} with airline {best_airline}')

For destination 1 and day Friday, the lowest predicted ticket price is 910.3852260051267 with airline 1


### 4. Which will be the predicted the predicted lowest price and the best hour of the day to buy the ticket given a particular destination and day ? 

In [38]:
def predict_lowest_price(destination, day, model):
    """
    Predicts the lowest price for a given destination and day of the week using a given model.

    Parameters:
    -----------
    destination: The destination as an integer representing the destination_dummy column value.
    day: The day of the week as strings ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday').
    model: The model used to make the prediction.
    
    Return:
    ------- 
    A tuple containing the predicted lowest price, the airline company that offers it (0 or 1),
    and the best hour to scrap (8, 10, 12, 14, 16, 18, 20 or 22).
    """
    # Create a new data frame with all features set to their average or median values
    new_df = pd.DataFrame(columns=X.columns)
    for col in X.columns:
        if col.startswith('day_of_week') or col == 'destination_dummy' or col == 'airline_company_dummy' or col == 'hour_scrap':
            new_df[col] = [0]
        else:
            new_df[col] = [X[col].median()]

    # Set the destination_dummy and day_of_week columns to the given values
    new_df['destination_dummy'] = [destination]
    new_df[f'day_of_week_{day}'] = [1]

    # Vary the hour_scrap and airline_company_dummy columns
    min_price = float('inf')
    best_airline = None
    best_hour = None
    for hour in [8, 10, 12, 14, 16, 18, 20, 22]:
        new_df['hour_scrap'] = [hour]
        for airline in [0, 1]:
            new_df['airline_company_dummy'] = [airline]
            pred = model.predict(new_df)
            if pred[0] < min_price:
                min_price = pred[0]
                best_airline = airline
                best_hour = hour

    return min_price, best_airline, best_hour

In [39]:
destination = 1     #  1 = New York (or JFK)
day = 'Monday'
min_price, best_airline, best_hour = predict_lowest_price(destination, day, model)
print(f'For destination {destination} and day {day}, the lowest predicted ticket price is {min_price} with airline {best_airline} at hour {best_hour}')

For destination 1 and day Monday, the lowest predicted ticket price is 909.4952414989305 with airline 1 at hour 22
