# Data Modelling

In [1]:
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

### Get the Data

In [2]:
all_nyc_files = []
all_sao_files = []

def get_files(destination: str="nyc", file_names: list = all_nyc_files) -> list:
    """
    This function retrieves the names of all .csv files in a specified directory.

    Args:
    -----
    destination (str): The destination city for which to retrieve the file names. 
    (Default sets to "nyc")
    
    file_names (list): The list to populate with the file names. 
    (Default sets to all_nyc_files)
    
    Return:
    -------
    list: A list of file names for all .csv files in the specified directory.
    """

    file_path = f'..\\webscraping\\bxl_to_{destination}'

    for file_name in os.listdir(file_path):
        # Split the file name into a base name and an extension
        base_name, extension = os.path.splitext(file_name)
        
        # Check if the file has a .csv extension
        if extension == '.csv':
            # Append the file name to the list
            file_names.append(file_name)
        
    return(file_names)
 
all_nyc_files = get_files()
all_sao_files = get_files(destination="sao", file_names=all_sao_files)

In [3]:
def read_csv_files(destination: str = "nyc", file_names: list[str] = all_nyc_files, result = None):
    """
    This function concatenates multiple CSV files into a single DataFrame. 
    If the result is None, it creates a list of DataFrames from the CSV files and concatenates them. 
    If the result is not None, it appends the DataFrames from the CSV files to the result.

    Args:
    -----
    result : The DataFrame to append the data to. If None, a new DataFrame is created.
 
    dates (list): The list of dates to use for the filenames of the CSV files.
    
    destination (str): The destination to use for the filenames of the CSV files.
 
    Return:
    -------
    DataFrame: The concatenated DataFrame.
    """
    if result is None:
        dfs = []
        for name in file_names:
            file_path = f"..\\webscraping\\bxl_to_{destination}\\{name}"
            df = pd.read_csv(file_path)
            dfs.append(df)
        result = pd.concat(dfs, axis=0, ignore_index=True)
    else:
        for name in file_names:
            filename = f"..\\webscraping\\bxl_to_{destination}\\booking_{name}.csv"
            df = pd.read_csv(filename)
            result = pd.concat([result, df], axis=0, ignore_index=True)
    return result


In [4]:
def transfomed_df(df):
    """
    This function transforms a DataFrame by splitting and renaming columns, 
    converting data types, and applying functions to columns.

    Args:
    -----
    df : The DataFrame to transform.
 
    Return:
    -------
    DataFrame: The transformed DataFrame.
    """

    air_cols = [col for col in df.columns if col.endswith('_airline_company')]
    df[air_cols] = df[air_cols].astype(str)
    df[air_cols] = df[air_cols].apply(lambda x: x.str.split(",").str[0])
    
    df['out_stop_num'] = df['out_stop_num'].str.split(' ').str[0]
    df['in_stop_num'] = df['in_stop_num'].str.split(' ').str[0]
    df['out_stop_num'] = df['out_stop_num'].astype(float)
    df['in_stop_num'] = df['in_stop_num'].astype(float)

    df["tot_stop"] = df['out_stop_num'] + df['in_stop_num']


    split_df = df.pop('price_ticket').str.rsplit(' ', n=1, expand=True).rename(columns={0: 'ticket_price', 1: 'currency'})
    df = df.join(split_df)
    df['ticket_price'] = df['ticket_price'].str.replace(',', '.').str.replace(' ', '').astype(float)

    date_cols = [col for col in df.columns if col.endswith('_date')]
    year= '2023'
    for col in date_cols:
        df[col] = pd.to_datetime(df[col] + ' ' + year, format='%b %d %Y')
    
    time_cols = [col for col in df.columns if col.endswith('_time')]
    df[time_cols] = df[time_cols].apply(lambda x: pd.to_datetime(x, format='%I:%M %p').dt.strftime('%H:%M'))

    duration_cols = [col for col in df.columns if col.endswith('_duration')]
    df[duration_cols] = df[duration_cols].applymap(lambda x: pd.to_timedelta(x.replace('h', ' hours ').replace('m', ' min')))

    df['tot_duration'] = df["out_duration"] + df["in_duration"]
    
    df["airline_company"] = df["out_airline_company"]
    return df

In [5]:
# Get the dataframe for nyc
df1 = read_csv_files()
df1 = transfomed_df(df1)
df1 = df1.dropna()
df1 = df1.reset_index(drop=True)

# Get the dataframe for sao
df2 = read_csv_files(destination="sao", file_names=all_sao_files)
df2 = transfomed_df(df2)
df2 = df2.dropna()
df2 = df2.reset_index(drop=True)

# merge dataframe
df = pd.concat([df1, df2])

In [6]:
df

Unnamed: 0,out_airline_company,in_airline_company,dep_city,arr_city,out_dep_date,out_dep_time,out_duration,out_stop_num,out_arr_date,out_arr_time,...,in_stop_num,in_arr_date,in_arr_time,hour_scrap,day_scrap,tot_stop,ticket_price,currency,tot_duration,airline_company
0,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,09:35,0 days 11:25:00,1.0,2023-08-01,15:00,...,1.0,2023-08-16,07:35,16,20,2.0,1176.01,€,0 days 21:05:00,Lufthansa
1,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,13:15,0 days 12:50:00,1.0,2023-08-01,20:05,...,1.0,2023-08-16,15:50,16,20,2.0,923.39,€,1 days 05:10:00,Lufthansa
2,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,08:50,0 days 12:10:00,1.0,2023-08-01,15:00,...,1.0,2023-08-16,07:35,16,20,2.0,1176.01,€,0 days 21:50:00,Lufthansa
3,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,09:35,0 days 11:25:00,1.0,2023-08-01,15:00,...,1.0,2023-08-16,08:35,16,20,2.0,1176.01,€,0 days 22:05:00,Lufthansa
4,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,09:35,0 days 11:25:00,1.0,2023-08-01,15:00,...,1.0,2023-08-16,10:20,16,20,2.0,1176.01,€,0 days 22:15:00,Lufthansa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2151,Swiss,Swiss,BRU,GRU,2023-08-01,15:00,0 days 19:25:00,1.0,2023-08-02,05:25,...,1.0,2023-08-17,10:15,12,25,2.0,1799.69,€,2 days 05:55:00,Swiss
2152,Swiss,Swiss,BRU,GRU,2023-08-01,06:55,1 days 03:30:00,1.0,2023-08-02,05:25,...,1.0,2023-08-16,19:30,12,25,2.0,2049.17,€,1 days 23:15:00,Swiss
2153,Swiss,Swiss,BRU,GRU,2023-08-01,09:45,1 days 00:40:00,1.0,2023-08-02,05:25,...,1.0,2023-08-17,08:45,12,25,2.0,1799.69,€,2 days 09:40:00,Swiss
2154,Swiss,Swiss,BRU,GRU,2023-08-01,09:45,1 days 00:40:00,1.0,2023-08-02,05:25,...,1.0,2023-08-17,10:15,12,25,2.0,1799.69,€,2 days 11:10:00,Swiss


In [7]:
df1['airline_company_dummy'] = (df1['airline_company'] == 'Swiss').astype(int)

df1['tot_duration_seconds'] = df1['tot_duration'].dt.total_seconds()
# select the independent and dependent variables
X = df1[['tot_duration_seconds', 'hour_scrap', 'hour_scrap','ticket_price'  ]]
# X = df1[['tot_duration_seconds', 'airline_company_dummy']]
# y = df1['ticket_price']
y = df1[ 'airline_company']

# split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# create and fit the model using the training data
model = LogisticRegression()
model.fit(X_train, y_train)

# make predictions on the test data
y_pred = model.predict(X_test)

# calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.80


In [9]:
from sklearn.linear_model import LogisticRegressionCV

# create and fit the model using the training data
model = LogisticRegressionCV(Cs=10, cv=5)
model.fit(X_train, y_train)

# print the best C value found by cross-validation
print(f'Best C: {model.C_[0]:.2f}')

# make predictions on the test data
y_pred = model.predict(X_test)

# calculate the accuracy of the predictions
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')

Best C: 0.00
Accuracy: 0.82


In [10]:
df1['tot_duration_seconds'] = df1['tot_duration'].dt.total_seconds()
# select the independent and dependent variables
X = df1[['tot_duration_seconds', 'hour_scrap', 'hour_scrap','airline_company_dummy']]
# X = df1[['tot_duration_seconds', 'airline_company_dummy']]
y = df1['ticket_price']

# split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
model1 = LinearRegression()
model1.fit(X_train, y_train)

# make predictions on the test data
y_pred = model1.predict(X_test)

# calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error: {rmse:.2f}')

Root Mean Squared Error: 126.39


In [12]:
# create and fit the model using the training data
model2 = Lasso(alpha=1.0)
model2.fit(X_train, y_train)

# make predictions on the test data
y_pred = model2.predict(X_test)

# calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error: {rmse:.2f}')

Root Mean Squared Error: 126.41


In [13]:
# create and fit the model using the training data
model3 = LassoCV(cv=5)
model3.fit(X_train, y_train)

# print the best alpha value found by cross-validation
print(f'Best alpha: {model3.alpha_:.2f}')

# make predictions on the test data
y_pred = model3.predict(X_test)

# calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error: {rmse:.2f}')

Best alpha: 5744.97
Root Mean Squared Error: 151.46


In [14]:
# create and fit the model using the training data
model4 = Ridge(alpha=5.0)
model4.fit(X_train, y_train)

# make predictions on the test data
y_pred = model4.predict(X_test)

# calculate the mean squared error of the predictions
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error: {rmse:.2f}')

Root Mean Squared Error: 126.40


In [15]:
# create scatter plots of ticket_price against each independent variable
# df1['airline_company_dummy'] = (df1['airline_company'] == 'Swiss').astype(int)

# fig, axs = plt.subplots(2, 3, figsize=(15, 10))
# axs[0, 0].scatter(df1['tot_duration'].dt.total_seconds(), df1['ticket_price'])
# axs[0, 0].set_xlabel('tot_duration')
# axs[0, 0].set_ylabel('ticket_price')
# axs[0, 1].scatter(df1['hour_scrap'], df1['ticket_price'])
# axs[0, 1].set_xlabel('hour_scrap')
# axs[0, 1].set_ylabel('ticket_price')
# axs[0, 2].scatter(df1['day_scrap'], df1['ticket_price'])
# axs[0, 2].set_xlabel('day_scrap')
# axs[0, 2].set_ylabel('ticket_price')
# axs[1, 0].scatter(df1['tot_stop'], df1['ticket_price'])
# axs[1, 0].set_xlabel('tot_stop')
# axs[1, 0].set_ylabel('ticket_price')
# axs[1, 1].scatter(df1['airline_company_dummy'], df1['ticket_price'])
# axs[1, 1].set_xlabel('airline_company_dummy')
# axs[1, 1].set_ylabel('ticket_price')

# plt.show()