# Extraction
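This notebook downloads both datasets (EUROCONTROL airport traffic and World Bank GDP) directly from their source endpoints and saves timestamped CSV copies under the `bronze` data folder, for reproducibility and future scalability.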

In [1]:
import os
from google.colab import drive
# Mount Google Drive only when needed; an existing mount is left untouched.
if os.path.ismount('/content/drive'):
    print("Google Drive is already mounted.")
else:
    print("Google Drive is not mounted yet. Mounting...")
    drive.mount('/content/drive')

Google Drive is already mounted.


In [2]:
import sys

# Make the project's shared modules on Google Drive importable. The membership
# check keeps re-running this cell from adding duplicate sys.path entries.
modules_path = '/content/drive/MyDrive/DataLife/modules'
if modules_path not in sys.path:
    sys.path.append(modules_path)

import utils

In [3]:
import io
import zipfile
import requests
import pandas as pd
from datetime import datetime

In [4]:
# Single run timestamp (YYYYMMDD_HHMMSS) reused in every file name written below.
timestamp_str = f"{datetime.now():%Y%m%d_%H%M%S}"

# Root output folder on Google Drive, with one subfolder per data source.
bronze_path = "/content/drive/MyDrive/DataLife/data/bronze"

airport_traffic_path = bronze_path + "/airport_traffic"
gdp_worldbank_path = bronze_path + "/gdp_worldbank"

## Extraction Airport Traffic

In [5]:
print("Downloading data...")

# One CSV per year, from 2016 through the current year (inclusive).
for year in range(2016, datetime.now().year + 1):
    url = f"https://www.eurocontrol.int/performance/data/download/csv/airport_traffic_{year}.csv"
    file = f"{year}_{timestamp_str}.csv"

    print(f"  Proccesing year {year} file ...")

    try:
        # Explicit (connect, read) timeout in seconds: requests waits forever
        # by default, so a single stalled connection would hang the whole loop.
        response = requests.get(url, timeout=(10, 120))

        if response.status_code == 200:
            # Parse the downloaded bytes in memory, then persist via the shared helper.
            df_year = pd.read_csv(io.BytesIO(response.content))
            utils.df_to_csv(df_year, file, airport_traffic_path)

        elif response.status_code == 404:
            print(f"File from year {year} not available.")
        else:
            print(f"Error {response.status_code}")

    except Exception as e:
        # Best-effort: report the failure (including timeouts) and continue
        # with the next year instead of aborting the whole download.
        print(f"Error: {e}")

print("Download finished.")

Downloading data...
  Proccesing year 2016 file ...
File correctly saved at: /content/drive/MyDrive/DataLife/data/bronze/airport_traffic/2016_20260116_174025.csv
  Proccesing year 2017 file ...
File correctly saved at: /content/drive/MyDrive/DataLife/data/bronze/airport_traffic/2017_20260116_174025.csv
  Proccesing year 2018 file ...
File correctly saved at: /content/drive/MyDrive/DataLife/data/bronze/airport_traffic/2018_20260116_174025.csv
  Proccesing year 2019 file ...
File correctly saved at: /content/drive/MyDrive/DataLife/data/bronze/airport_traffic/2019_20260116_174025.csv
  Proccesing year 2020 file ...
File correctly saved at: /content/drive/MyDrive/DataLife/data/bronze/airport_traffic/2020_20260116_174025.csv
  Proccesing year 2021 file ...
File correctly saved at: /content/drive/MyDrive/DataLife/data/bronze/airport_traffic/2021_20260116_174025.csv
  Proccesing year 2022 file ...
File correctly saved at: /content/drive/MyDrive/DataLife/data/bronze/airport_traffic/2022_202601

## Extraction GDP

In [6]:
# World Bank download URL for indicator NY.GDP.MKTP.CD, requested with downloadformat=csv.
url2 = (
    "https://api.worldbank.org/v2/en/indicator/NY.GDP.MKTP.CD"
    "?downloadformat=csv"
)

In [8]:
print("Downloading data...")

try:
    # Explicit (connect, read) timeout in seconds: requests waits forever by
    # default, which would hang the notebook if the API stalls.
    response2 = requests.get(url2, timeout=(10, 120))

    if response2.status_code == 200:
        # The response body is a ZIP archive; open it from memory.
        with zipfile.ZipFile(io.BytesIO(response2.content)) as z:
            # Pick the first entry whose name starts with the indicator prefix;
            # any other entries in the archive are ignored.
            nombre_archivo_csv = next(
                (f for f in z.namelist() if f.startswith("API_NY.GDP.MKTP.CD")),
                None,
            )

            if nombre_archivo_csv is None:
                # Report a clear message instead of a bare IndexError.
                print("      Error: no file starting with 'API_NY.GDP.MKTP.CD' found inside ZIP.")
            else:
                print(f"File found inside ZIP: {nombre_archivo_csv}")

                with z.open(nombre_archivo_csv) as f:
                    # skiprows=4 drops the first four lines of the file
                    # (presumably a preamble before the header row; confirm
                    # against the downloaded file).
                    df = pd.read_csv(f, skiprows=4)

                utils.df_to_csv(df, f"{timestamp_str}.csv", gdp_worldbank_path)
    else:
        # Surface non-200 responses instead of finishing silently with no file saved.
        print(f"Error {response2.status_code}")

except Exception as e:
    # Best-effort: report the failure (including timeouts) without raising.
    print(f"      Error: {e}")

print("Download finished.")

Downloading data...
File found inside ZIP: API_NY.GDP.MKTP.CD_DS2_en_csv_v2_174428.csv
File correctly saved at: /content/drive/MyDrive/DataLife/data/bronze/gdp_worldbank/20260116_174025.csv
Download finished.
