# Transformation

## Configuration

This notebook applies several transformations to the datasets in order to clean them.

In [1]:
import os
from google.colab import drive
# Mount Google Drive only when nothing is mounted at the expected location yet
drive_mount_point = '/content/drive'
drive_is_mounted = os.path.ismount(drive_mount_point)

if drive_is_mounted:
    print("Google Drive is already mounted.")
else:
    print("Google Drive is not mounted yet. Mounting...")
    drive.mount(drive_mount_point)

Google Drive is not mounted yet. Mounting...
Mounted at /content/drive


In [2]:
import sys
sys.path.append('/content/drive/MyDrive/DataLife/modules')
import utils

In [3]:
import re
import glob
import pandas as pd
from datetime import datetime

In [4]:
# Input (bronze) and output (silver) data folders on Google Drive
bronze_path = "/content/drive/MyDrive/DataLife/data/bronze"
silver_path = "/content/drive/MyDrive/DataLife/data/silver"

# One bronze sub-folder per source dataset
airport_bronze_path = bronze_path + "/airport_traffic"
gdp_bronze_path = bronze_path + "/gdp_worldbank"

## Transformation Airport Traffic

### Search Updated Files

In [5]:
# Collect every CSV in the airport-traffic bronze folder. The pattern has no
# "**" component, so the search only covers that single directory.
all_files = glob.glob(f"{airport_bronze_path}/*.csv")

# File names are expected to be exactly "<4-digit year>_<8 digits>_<6 digits>.csv"
# (presumably <year>_<YYYYMMDD>_<HHMMSS>.csv - TODO confirm with the extraction job).
# The whole name must match because the selected names are rebuilt from the
# captured groups below.
file_pattern = re.compile(r"(\d{4})_(\d{8}_\d{6})\.csv")

# Newest timestamp seen for each year. Keeping this per year (instead of one
# global "latest" timestamp) guarantees every selected file name really exists,
# even when only some of the years were re-extracted in the most recent run.
latest_by_year = {}
timestamps = set()

for file in all_files:
    filename = os.path.basename(file)

    match = file_pattern.fullmatch(filename)
    if not match:
        continue

    year, timestamp = match.groups()
    timestamps.add(timestamp)
    # Fixed-width digit strings: lexicographic order equals numeric order
    if timestamp > latest_by_year.get(year, ""):
        latest_by_year[year] = timestamp

timestamps = sorted(timestamps)
years = sorted(latest_by_year)

# One file per year, ordered by year: the one carrying that year's newest timestamp
updated_files = [f"{year}_{latest_by_year[year]}.csv" for year in years]

### Concatenate Updated Files into One CSV

In [6]:
# Read each selected file once and concatenate a single time at the end.
# Concatenating inside the loop would copy the accumulated frame on every
# iteration.
airport_frames = []

for file in updated_files:
    print(f"Processing file: {file}")
    airport_frames.append(pd.read_csv(f"{airport_bronze_path}/{file}"))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# file was selected.
if airport_frames:
    df_airport_cleaned = pd.concat(airport_frames, ignore_index=True)
else:
    df_airport_cleaned = pd.DataFrame()
print("Process terminated successfully.")

Processing file: 2016_20260116_174025.csv
Processing file: 2017_20260116_174025.csv
Processing file: 2018_20260116_174025.csv
Processing file: 2019_20260116_174025.csv
Processing file: 2020_20260116_174025.csv
Processing file: 2021_20260116_174025.csv
Processing file: 2022_20260116_174025.csv
Processing file: 2023_20260116_174025.csv
Processing file: 2024_20260116_174025.csv
Processing file: 2025_20260116_174025.csv
Process terminated successfully.


In [7]:
# Write the concatenated airport-traffic table into the silver folder
airport_output_file = "fact_airport_traffic.csv"
utils.df_to_csv(df_airport_cleaned, airport_output_file, silver_path)

File correctly saved at: /content/drive/MyDrive/DataLife/data/silver/fact_airport_traffic.csv


## Transformation GDP WorldBank

GDP data for the 1960-2015 period is removed because no flight data was found for those years.

In [8]:
# List the GDP CSV files in the bronze folder. The pattern has no "**"
# component, so the search only covers that single directory.
all_files = sorted(glob.glob(f"{gdp_bronze_path}/*.csv"))
if not all_files:
    # Fail with an explicit message instead of an IndexError on all_files[-1]
    raise FileNotFoundError(f"No CSV files found in {gdp_bronze_path}")

# The alphabetically last path is loaded; presumably the file names embed a
# timestamp so this is the most recent extract - TODO confirm naming convention.
filename = all_files[-1]

df_gdp = pd.read_csv(filename)

In [9]:
# Descriptive columns removed from the GDP table.
# NOTE(review): "Unnamed: 69" looks like the positional name pandas assigns to a
# column with an empty header; it would change if the source file gains or
# loses columns - confirm against the raw CSV.
metadata_columns = ["Country Code", "Indicator Name", "Indicator Code", "Unnamed: 69"]

# Yearly columns for 1960-2015 (the upper bound of range() is exclusive)
dropped_year_columns = [str(year) for year in range(1960, 2016)]

# Single drop call: a missing column raises KeyError before anything is removed,
# so df_gdp is never left partially modified.
df_gdp = df_gdp.drop(columns=metadata_columns + dropped_year_columns)

In [10]:
# Write the trimmed GDP table into the silver folder
gdp_output_file = "fact_gdp_worldbank.csv"
utils.df_to_csv(df_gdp, gdp_output_file, silver_path)

File correctly saved at: /content/drive/MyDrive/DataLife/data/silver/fact_gdp_worldbank.csv
