# DATA CLEANING

## Setup

In [1]:
import pandas as pd
import datetime

## Functions

In [2]:
# Pieces of the current timestamp, shaped like the labels stored in `dates`.
cur_time = datetime.datetime.now()
day = cur_time.day
hour = cur_time.strftime("%I")      # hour on the 12-hour clock, zero-padded
hour_spe = cur_time.strftime("%p")  # AM / PM marker

# Scrape runs to load; each entry is the <date> part of "booking_<date>.csv".
dates = ["20_may_04_PM", "20_may_06_PM"]
# To also load a run scraped right now (the month is hard-coded to "may"):
# date = f'{day}_may_{hour}_{hour_spe}'
# dates.append(date)

def read_csv_files(destination: str = "nyc", date_file: list[str] = dates, result=None):
    """
    Read one CSV file per scrape run and stack them into a single DataFrame.

    Every file is read from "../webscraping/bxl_to_<destination>/booking_<date>.csv",
    relative to the current working directory. Forward slashes are used so the
    path resolves on Windows as well as on POSIX systems.

    Args:
    -----
    destination (str): Destination code used in the folder name ("bxl_to_<destination>").

    date_file (list[str]): Scrape-run labels used in the file names. The default is the
        module-level `dates` list object as it existed when this function was defined,
        so items appended to that list later are picked up.

    result : DataFrame to extend. If None, only the CSV files are concatenated;
        otherwise the CSV rows are appended after the rows of `result`.

    Return:
    -------
    DataFrame: The concatenated DataFrame with a fresh 0..n-1 index. If `date_file`
        is empty and `result` is given, `result` itself is returned unchanged.

    Raises:
    -------
    ValueError: If `date_file` is empty and `result` is None (nothing to concatenate).
    """
    frames = [
        pd.read_csv(f"../webscraping/bxl_to_{destination}/booking_{date}.csv")
        for date in date_file
    ]

    if not frames:
        if result is None:
            raise ValueError(
                "date_file is empty and no result was given: nothing to concatenate"
            )
        return result

    if result is not None:
        # Existing rows come first, exactly as when appending file by file.
        frames.insert(0, result)

    # A single concat avoids re-copying the accumulated frame for every file.
    return pd.concat(frames, axis=0, ignore_index=True)

In [3]:
# Display the scrape-run labels that read_csv_files uses by default.
dates

['20_may_04_PM', '20_may_06_PM']

In [4]:
def transfomed_df(df, year: str = '2023'):
    """
    Clean a raw scraped flights DataFrame and return the result as a new frame.

    The input frame is left untouched, so the function can be re-run on the
    same raw data. Steps, in order:

    * '*_airline_company' columns keep only the text before the first comma;
      missing values stay missing instead of becoming the string 'nan'.
    * 'out_stop_num' / 'in_stop_num' keep only the text before the first space
      (still as strings).
    * 'price_ticket' is split on its last space into 'ticket_price' (float,
      decimal comma and inner spaces removed) and 'currency'.
    * '*_date' columns (expected as '%b %d' text) are completed with `year`
      and parsed to datetimes.
    * '*_time' columns (expected as '%I:%M %p' text) become 'HH:MM' 24-hour strings.
    * '*_duration' columns have 'h' / 'm' spelled out as hours / minutes and
      are parsed to timedeltas; missing durations become NaT.

    Args:
    -----
    df : The DataFrame to transform. Must contain 'out_stop_num', 'in_stop_num'
        and 'price_ticket'.

    year (str): Year appended to the '*_date' columns before parsing, since the
        raw values carry no year. An int is accepted as well. Defaults to '2023'.

    Return:
    -------
    DataFrame: The transformed DataFrame (a new object).
    """
    # Work on a copy: the steps below pop and overwrite columns.
    df = df.copy()
    year = str(year)

    air_cols = [col for col in df.columns if col.endswith('_airline_company')]
    for col in air_cols:
        raw = df[col]
        first_airline = raw.astype(str).str.split(",").str[0]
        # astype(str) turns NaN into 'nan'; mask those back to missing so a
        # later dropna() still sees them.
        df[col] = first_airline.where(raw.notna())

    for col in ('out_stop_num', 'in_stop_num'):
        df[col] = df[col].str.split(' ').str[0]

    # rsplit on the LAST space: the amount itself may contain spaces.
    split_df = (
        df.pop('price_ticket')
        .str.rsplit(' ', n=1, expand=True)
        .rename(columns={0: 'ticket_price', 1: 'currency'})
    )
    df = df.join(split_df)
    df['ticket_price'] = (
        df['ticket_price']
        .str.replace(',', '.', regex=False)
        .str.replace(' ', '', regex=False)
        .astype(float)
    )

    date_cols = [col for col in df.columns if col.endswith('_date')]
    for col in date_cols:
        df[col] = pd.to_datetime(df[col] + ' ' + year, format='%b %d %Y')

    time_cols = [col for col in df.columns if col.endswith('_time')]
    for col in time_cols:
        df[col] = pd.to_datetime(df[col], format='%I:%M %p').dt.strftime('%H:%M')

    duration_cols = [col for col in df.columns if col.endswith('_duration')]
    for col in duration_cols:
        # Vectorised replacement for the deprecated DataFrame.applymap.
        spelled_out = (
            df[col]
            .str.replace('h', ' hours ', regex=False)
            .str.replace('m', ' min', regex=False)
        )
        df[col] = pd.to_timedelta(spelled_out)

    return df

## Clean data from bxl_to_nyc folder

In [5]:
# Load the default (NYC) scrape runs and clean them in one step.
df1 = transfomed_df(read_csv_files())
df1

Unnamed: 0,out_airline_company,in_airline_company,dep_city,arr_city,out_dep_date,out_dep_time,out_duration,out_stop_num,out_arr_date,out_arr_time,in_dep_date,in_dep_time,in_duration,in_stop_num,in_arr_date,in_arr_time,hour_scrap,day_scrap,ticket_price,currency
0,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,09:35,0 days 11:25:00,1,2023-08-01,15:00,2023-08-15,15:55,0 days 09:40:00,1,2023-08-16,07:35,16,20,1176.01,€
1,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,13:15,0 days 12:50:00,1,2023-08-01,20:05,2023-08-15,17:30,0 days 16:20:00,1,2023-08-16,15:50,16,20,923.39,€
2,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,08:50,0 days 12:10:00,1,2023-08-01,15:00,2023-08-15,15:55,0 days 09:40:00,1,2023-08-16,07:35,16,20,1176.01,€
3,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,09:35,0 days 11:25:00,1,2023-08-01,15:00,2023-08-15,15:55,0 days 10:40:00,1,2023-08-16,08:35,16,20,1176.01,€
4,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,09:35,0 days 11:25:00,1,2023-08-01,15:00,2023-08-15,17:30,0 days 10:50:00,1,2023-08-16,10:20,16,20,1176.01,€
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,Swiss,Swiss,BRU,JFK,2023-08-01,20:15,1 days 02:00:00,1,2023-08-02,16:15,2023-08-15,19:25,0 days 21:05:00,1,2023-08-16,22:30,18,20,780.72,€
116,Swiss,Swiss,BRU,JFK,2023-08-01,20:15,1 days 02:00:00,1,2023-08-02,16:15,2023-08-15,16:30,0 days 21:00:00,1,2023-08-16,19:30,18,20,826.28,€
117,Swiss,Swiss,BRU,JFK,2023-08-01,20:15,1 days 02:00:00,1,2023-08-02,16:15,2023-08-15,19:25,1 days 06:55:00,1,2023-08-17,08:20,18,20,786.88,€
118,Swiss,Swiss,BRU,JFK,2023-08-01,20:15,1 days 02:00:00,1,2023-08-02,16:15,2023-08-15,19:25,1 days 08:10:00,1,2023-08-17,09:35,18,20,786.88,€


In [9]:
# Drop repeated rows, then rows with missing values, and renumber the index.
df1 = (
    df1
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
df1

Unnamed: 0,out_airline_company,in_airline_company,dep_city,arr_city,out_dep_date,out_dep_time,out_duration,out_stop_num,out_arr_date,out_arr_time,in_dep_date,in_dep_time,in_duration,in_stop_num,in_arr_date,in_arr_time,hour_scrap,day_scrap,ticket_price,currency
0,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,09:35,0 days 11:25:00,1,2023-08-01,15:00,2023-08-15,15:55,0 days 09:40:00,1,2023-08-16,07:35,16,20,1176.01,€
1,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,13:15,0 days 12:50:00,1,2023-08-01,20:05,2023-08-15,17:30,0 days 16:20:00,1,2023-08-16,15:50,16,20,923.39,€
2,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,08:50,0 days 12:10:00,1,2023-08-01,15:00,2023-08-15,15:55,0 days 09:40:00,1,2023-08-16,07:35,16,20,1176.01,€
3,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,09:35,0 days 11:25:00,1,2023-08-01,15:00,2023-08-15,15:55,0 days 10:40:00,1,2023-08-16,08:35,16,20,1176.01,€
4,Lufthansa,Lufthansa,BRU,JFK,2023-08-01,09:35,0 days 11:25:00,1,2023-08-01,15:00,2023-08-15,17:30,0 days 10:50:00,1,2023-08-16,10:20,16,20,1176.01,€
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,Swiss,Swiss,BRU,JFK,2023-08-01,20:15,1 days 02:00:00,1,2023-08-02,16:15,2023-08-15,20:55,1 days 07:20:00,1,2023-08-17,10:15,18,20,786.88,€
90,Swiss,Swiss,BRU,JFK,2023-08-01,20:15,1 days 02:00:00,1,2023-08-02,16:15,2023-08-15,19:25,0 days 17:00:00,1,2023-08-16,18:25,18,20,780.72,€
91,Swiss,Swiss,BRU,JFK,2023-08-01,09:45,0 days 12:30:00,1,2023-08-01,16:15,2023-08-15,16:30,0 days 15:45:00,1,2023-08-16,14:15,18,20,780.72,€
92,Swiss,Swiss,BRU,JFK,2023-08-01,09:45,0 days 12:30:00,1,2023-08-01,16:15,2023-08-15,20:55,0 days 16:35:00,1,2023-08-16,19:30,18,20,1074.28,€


## Clean data from bxl_to_sao folder

In [8]:
# Same pipeline as for NYC, applied to the "sao" destination folder:
# load, clean, drop duplicates and incomplete rows, renumber the index.
df2 = (
    transfomed_df(read_csv_files(destination="sao"))
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
df2

Unnamed: 0,out_airline_company,in_airline_company,dep_city,arr_city,out_dep_date,out_dep_time,out_duration,out_stop_num,out_arr_date,out_arr_time,in_dep_date,in_dep_time,in_duration,in_stop_num,in_arr_date,in_arr_time,hour_scrap,day_scrap,ticket_price,currency
0,Lufthansa,Lufthansa,BRU,GRU,2023-08-01,20:05,0 days 13:45:00,1,2023-08-02,04:50,2023-08-15,18:15,0 days 14:40:00,1,2023-08-16,13:55,16,20,2115.75,€
1,Lufthansa,Lufthansa,BRU,GRU,2023-08-01,20:05,0 days 13:45:00,1,2023-08-02,04:50,2023-08-15,18:15,0 days 18:05:00,1,2023-08-16,17:20,16,20,1948.60,€
2,Lufthansa,Lufthansa,BRU,GRU,2023-08-01,19:15,0 days 14:35:00,1,2023-08-02,04:50,2023-08-15,18:15,0 days 18:05:00,1,2023-08-16,17:20,16,20,1948.60,€
3,Lufthansa,Lufthansa,BRU,GRU,2023-08-01,20:05,0 days 13:45:00,1,2023-08-02,04:50,2023-08-15,18:15,0 days 20:05:00,1,2023-08-16,19:20,16,20,1948.60,€
4,Lufthansa,Lufthansa,BRU,GRU,2023-08-01,18:05,0 days 15:45:00,1,2023-08-02,04:50,2023-08-15,18:15,0 days 18:05:00,1,2023-08-16,17:20,16,20,1948.60,€
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,Swiss,Swiss,BRU,GRU,2023-08-01,15:00,0 days 19:25:00,1,2023-08-02,05:25,2023-08-15,18:45,0 days 22:45:00,2,2023-08-16,22:30,18,20,2120.29,€
87,Swiss,Swiss,BRU,GRU,2023-08-01,09:45,1 days 00:40:00,1,2023-08-02,05:25,2023-08-15,18:45,0 days 18:40:00,2,2023-08-16,18:25,18,20,2094.42,€
88,Swiss,Swiss,BRU,GRU,2023-08-01,09:45,1 days 00:40:00,1,2023-08-02,05:25,2023-08-15,18:45,1 days 09:00:00,1,2023-08-17,08:45,18,20,1799.72,€
89,Swiss,Swiss,BRU,GRU,2023-08-01,15:00,0 days 19:25:00,1,2023-08-02,05:25,2023-08-15,18:45,1 days 10:30:00,1,2023-08-17,10:15,18,20,1799.72,€
