# DATA CLEANING

## Setting-up

In [1]:
import pandas as pd
import datetime

import os

## Functions

In [2]:
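# Not really efficient: hard-coded list of scrape timestamps, kept for reference only (file names are now discovered automatically by get_files()).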
 
# dates = ["20_may_04_PM", "20_may_06_PM", "20_may_08_PM", "20_may_10_PM"]
# dates.extend(["21_may_08_AM", "21_may_10_AM", "21_may_12_PM", "21_may_02_PM", "21_may_04_PM", "21_may_06_PM", "21_may_08_PM", "21_may_10_PM"])
# dates.extend(["22_may_08_AM", "22_may_10_AM", "22_may_12_PM", "22_may_02_PM", "22_may_04_PM", "22_may_06_PM", "22_may_08_PM", "22_may_10_PM"])
# dates.extend(["23_may_08_AM", "23_may_10_AM", "23_may_12_PM", "23_may_02_PM",  "23_may_06_PM", "23_may_08_PM", "23_may_10_PM"])
# dates.extend(["24_may_08_AM", "24_may_10_AM", "24_may_12_PM", "24_may_02_PM", "24_may_04_PM", "24_may_06_PM", "24_may_08_PM", "24_may_10_PM"])
# dates.extend(["25_may_08_AM", "25_may_10_AM", "25_may_12_PM"])

In [3]:
# Get the names of all files in a specific folder automatically on each run

# Module-level accumulators for the CSV file names of each route
# (two distinct, initially empty lists).
all_nyc_files, all_sao_files = [], []

def get_files(destination: str = "nyc", file_names: "list[str] | None" = None) -> list:
    """
    This function retrieves the names of all .csv files in the
    ../webscraping/bxl_to_<destination> directory (relative to the current
    working directory).

    Args:
    -----
    destination (str): The destination city for which to retrieve the file names.
    (Default sets to "nyc")

    file_names (list): An existing list to populate with the file names; the
    names found are appended to it and the same list object is returned.
    (Default sets to None, in which case a new empty list is created)

    Return:
    -------
    list: The list holding the names (not full paths) of all .csv files found,
    in the order reported by os.listdir.

    Raises:
    -------
    FileNotFoundError: If the destination directory does not exist.
    """
    # A fresh list per call: using a shared list as the default would make
    # every default call append to the same object and accumulate duplicates.
    if file_names is None:
        file_names = []

    # os.path.join keeps the path valid on every platform; on Windows it
    # produces the same '..\\webscraping\\bxl_to_<destination>' string.
    file_path = os.path.join('..', 'webscraping', f'bxl_to_{destination}')

    # Keep only entries whose extension is exactly '.csv'.
    file_names.extend(
        file_name
        for file_name in os.listdir(file_path)
        if os.path.splitext(file_name)[1] == '.csv'
    )

    return file_names
 
# Discover the scraped .csv files for both routes.
all_nyc_files = get_files(destination="nyc")
all_sao_files = get_files("sao", all_sao_files)

In [4]:
def read_csv_files(destination: str = "nyc", file_names: "list[str] | None" = None, result=None):
    """
    This function concatenates multiple CSV files from the
    ../webscraping/bxl_to_<destination> directory into a single DataFrame.
    If the result is None, the CSV files are concatenated on their own.
    If the result is not None, the rows of the CSV files are appended after
    the rows of result. The returned index is always renumbered from 0.

    Args:
    -----
    destination (str): The destination to use for the folder of the CSV files.
    (Default sets to "nyc")

    file_names (list): The names of the CSV files to read. Each entry is used
    as a file name inside the destination folder; if no such file exists but
    "booking_<entry>.csv" does, that file is read instead (this keeps bare
    stems such as "20_may_04_PM" working).
    (Default sets to None, which uses the module-level all_nyc_files list as
    it is at call time)

    result : The DataFrame to append the data to. If None, a new DataFrame is created.

    Return:
    -------
    DataFrame: The concatenated DataFrame. If file_names is empty and result
    is given, result itself is returned unchanged.

    Raises:
    -------
    ValueError: If file_names is empty and result is None.
    FileNotFoundError: If a listed file cannot be found.
    """
    if file_names is None:
        # Looked up at call time so that re-running the cell which rebuilds
        # all_nyc_files is picked up without redefining this function.
        file_names = all_nyc_files

    folder = os.path.join('..', 'webscraping', f'bxl_to_{destination}')

    frames = []
    for name in file_names:
        file_path = os.path.join(folder, name)
        if not os.path.isfile(file_path):
            # Fall back to the "booking_<stem>.csv" naming that the append
            # path used before; otherwise keep the direct path so read_csv
            # reports the name the caller actually passed.
            legacy_path = os.path.join(folder, f'booking_{name}.csv')
            if os.path.isfile(legacy_path):
                file_path = legacy_path
        frames.append(pd.read_csv(file_path))

    if not frames:
        if result is None:
            raise ValueError(
                f"No CSV files to read for destination {destination!r} "
                "and no existing result to return."
            )
        return result

    if result is not None:
        frames.insert(0, result)

    # A single concat instead of one per file avoids re-copying the
    # accumulated rows on every iteration.
    return pd.concat(frames, axis=0, ignore_index=True)


In [5]:
# Snapshot of the local clock when this cell runs, plus the pieces derived
# from it: day of month, zero-padded 12-hour clock hour, and AM/PM marker.
cur_time = datetime.datetime.now()
day, hour, hour_spe = cur_time.day, f"{cur_time:%I}", f"{cur_time:%p}"

In [6]:
def transfomed_df(df: pd.DataFrame, year: "str | int" = '2023') -> pd.DataFrame:
    """
    This function transforms a DataFrame by splitting and renaming columns,
    converting data types, and applying functions to columns.
    The input DataFrame is not modified; a transformed copy is returned.

    Transformations applied:
    - columns ending in '_airline_company': cast to str, keep the text before the first ','
    - 'out_stop_num' / 'in_stop_num': keep the text before the first ' ', cast to int
    - 'price_ticket': removed and replaced by a float 'ticket_price' column (appended last),
      built from the text before the last ' ', with ',' -> '.' and remaining spaces removed
    - columns ending in '_date': year appended, parsed with format '%b %d %Y'
    - columns ending in '_time': parsed with format '%I:%M %p', re-formatted as 'HH:MM' strings
    - columns ending in '_duration': 'h' / 'm' expanded to ' hours ' / ' min', converted to timedelta
    - 'hour_scrap': value 19 replaced by 18

    Args:
    -----
    df : The DataFrame to transform. Must contain the columns 'out_stop_num',
    'in_stop_num', 'price_ticket' and 'hour_scrap'.

    year : The year appended to every '_date' column before parsing.
    (Default sets to '2023')

    Return:
    -------
    DataFrame: The transformed DataFrame.
    """
    # Work on a copy: the steps below overwrite columns and pop 'price_ticket',
    # which would otherwise leave the caller's frame half-transformed.
    df = df.copy()

    # NOTE(review): astype(str) turns missing values into the string 'nan',
    # so a later dropna() will not remove rows lacking an airline — confirm
    # whether that is intended.
    for col in [c for c in df.columns if c.endswith('_airline_company')]:
        df[col] = df[col].astype(str).str.split(",").str[0]

    for col in ('out_stop_num', 'in_stop_num'):
        df[col] = df[col].str.split(' ').str[0].astype(int)

    # Drop the trailing token after the last space, then normalise the
    # decimal comma and strip remaining spaces inside the number.
    price = df.pop('price_ticket').str.rsplit(' ', n=1).str[0]
    df['ticket_price'] = (
        price.str.replace(',', '.', regex=False)
        .str.replace(' ', '', regex=False)
        .astype(float)
    )

    year = str(year)
    for col in [c for c in df.columns if c.endswith('_date')]:
        df[col] = pd.to_datetime(df[col] + ' ' + year, format='%b %d %Y')

    for col in [c for c in df.columns if c.endswith('_time')]:
        df[col] = pd.to_datetime(df[col], format='%I:%M %p').dt.strftime('%H:%M')

    # Column-wise string ops instead of DataFrame.applymap (deprecated in
    # recent pandas); missing durations become NaT instead of raising.
    for col in [c for c in df.columns if c.endswith('_duration')]:
        spelled_out = (
            df[col].str.replace('h', ' hours ', regex=False)
            .str.replace('m', ' min', regex=False)
        )
        df[col] = pd.to_timedelta(spelled_out)

    # Rows labelled with scrape hour 19 are relabelled 18
    # (NOTE(review): the reason is not stated in this file).
    df['hour_scrap'] = df['hour_scrap'].replace(19, 18)

    return df

## Clean data from bxl_to_nyc folder

In [7]:
# Load every bxl_to_nyc CSV, transform it, discard rows with missing values
# and renumber the index from 0. (The drop_duplicates() step is currently disabled.)
df1 = (
    transfomed_df(read_csv_files())
    .dropna()
    .reset_index(drop=True)
)

## Clean data from bxl_to_sao folder

In [8]:
# Load every bxl_to_sao CSV, transform it, discard rows with missing values
# and renumber the index from 0. (The drop_duplicates() step is currently disabled.)
df2 = (
    transfomed_df(read_csv_files(destination="sao", file_names=all_sao_files))
    .dropna()
    .reset_index(drop=True)
)