In [4]:
import requests
import re
import json
import pandas as pd
import time
from urllib.parse import urlparse
from bs4 import BeautifulSoup

In [5]:
# 1. Obtener el HTML de la página del ranking
url = "https://www.theglobaleconomy.com/rankings/gdp_current_local_currency/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36"
}

response = requests.get(url, headers=headers)
response.raise_for_status()  # Lanza error si la petición falla

In [6]:
# 2. Parsear HTML con BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

In [7]:
# 3. Buscar la tabla principal y extraer los links
table = soup.find("table")
country_links = []

if table:
    rows = table.find_all("tr")
    for row in rows:
        cell = row.find("td")
        if cell and cell.a:
            relative_link = cell.a["href"]
            full_url = f"https://www.theglobaleconomy.com{relative_link}"
            country_links.append(full_url)

In [8]:
# 4. Mostrar resultado
print("Links encontrados:", len(country_links))
for link in country_links:
    print(link)

Links encontrados: 125
https://www.theglobaleconomy.com/Afghanistan/gdp_current_local_currency/
https://www.theglobaleconomy.com/Albania/gdp_current_local_currency/
https://www.theglobaleconomy.com/Algeria/gdp_current_local_currency/
https://www.theglobaleconomy.com/Argentina/gdp_current_local_currency/
https://www.theglobaleconomy.com/Armenia/gdp_current_local_currency/
https://www.theglobaleconomy.com/Australia/gdp_current_local_currency/
https://www.theglobaleconomy.com/Austria/gdp_current_local_currency/
https://www.theglobaleconomy.com/Azerbaijan/gdp_current_local_currency/
https://www.theglobaleconomy.com/Bahamas/gdp_current_local_currency/
https://www.theglobaleconomy.com/Bahrain/gdp_current_local_currency/
https://www.theglobaleconomy.com/Bangladesh/gdp_current_local_currency/
https://www.theglobaleconomy.com/Belarus/gdp_current_local_currency/
https://www.theglobaleconomy.com/Belgium/gdp_current_local_currency/
https://www.theglobaleconomy.com/Bermuda/gdp_current_local_currenc

In [9]:
# DataFrame maestro
master_df = pd.DataFrame()

In [10]:
pattern = r'arrayToDataTable\((\[\[.*?\]\])\)'

In [11]:
# Cargar los países válidos desde el archivo parquet
df_countries = pd.read_parquet("countries_with_hpi.parquet")
valid_countries = set(df_countries["country"].unique())

In [14]:
print(len(valid_countries))

57


In [15]:
# other config

master_df = pd.DataFrame()

pattern = r'arrayToDataTable\((\[\[.*?\]\])\)'

sleep_duration = 5

In [16]:
start_time = time.time()
for url in country_links:
    # Extraer país desde el URL
    country = urlparse(url).path.strip("/").split("/")[0]

    # Verificar si el país está en la lista válida
    if country not in valid_countries:
        print(f"⏭️ País {country} no está en la lista, se omite.")
        continue

    try:
        response = requests.get(url, headers=headers)
        html = response.text

        match = re.search(pattern, html, re.DOTALL)
        if match:
            data_str = match.group(1)
            data = json.loads(data_str)

            df = pd.DataFrame(data[1:], columns=data[0])
            df.columns = ["Period", "Value"]
            df["Country"] = country

            df_pivot = df.pivot(index="Period", columns="Country", values="Value")
            master_df = pd.merge(master_df, df_pivot, how="outer", left_index=True, right_index=True)

            print(f"✅ Datos extraídos correctamente para {country}")
        else:
            print(f"⚠️ No se encontraron datos en {url}")
    except Exception as e:
        print(f"❌ Error procesando {url}: {e}")
    
    time.sleep(sleep_duration)

end_time = time.time()
total_sleep_time = len(country_links) * sleep_duration
effective_time = end_time - start_time - total_sleep_time

print(f"⏳ Tiempo total (incluyendo esperas): {end_time - start_time:.2f} s")
print(f"😴 Tiempo en espera: {total_sleep_time:.2f} s")
print(f"⚡ Tiempo efectivo de ejecución: {effective_time:.2f} s")

⏭️ País Afghanistan no está en la lista, se omite.
⏭️ País Albania no está en la lista, se omite.
⏭️ País Algeria no está en la lista, se omite.
⏭️ País Argentina no está en la lista, se omite.
⏭️ País Armenia no está en la lista, se omite.
✅ Datos extraídos correctamente para Australia
✅ Datos extraídos correctamente para Austria
⏭️ País Azerbaijan no está en la lista, se omite.
⏭️ País Bahamas no está en la lista, se omite.
⏭️ País Bahrain no está en la lista, se omite.
⏭️ País Bangladesh no está en la lista, se omite.
⏭️ País Belarus no está en la lista, se omite.
✅ Datos extraídos correctamente para Belgium
⏭️ País Bermuda no está en la lista, se omite.
⏭️ País Bolivia no está en la lista, se omite.
⏭️ País Bosnia-and-Herzegovina no está en la lista, se omite.
⏭️ País Botswana no está en la lista, se omite.
✅ Datos extraídos correctamente para Brazil
⏭️ País Brunei no está en la lista, se omite.
✅ Datos extraídos correctamente para Bulgaria
⏭️ País Burkina-Faso no está en la lista,

In [18]:
# include paraguay

response = requests.get("https://www.theglobaleconomy.com/Paraguay/gdp_current_local_currency/", headers=headers)
html = response.text

match = re.search(pattern, html, re.DOTALL)
if match:
    data_str = match.group(1)
    data = json.loads(data_str)

    df = pd.DataFrame(data[1:], columns=data[0])
    df.columns = ["Period", "Value"]
    df["Country"] = country

    df_pivot = df.pivot(index="Period", columns="Country", values="Value")
    master_df = pd.merge(master_df, df_pivot, how="outer", left_index=True, right_index=True)

    print(f"✅ Datos extraídos correctamente para {country}")

✅ Datos extraídos correctamente para Zambia


In [19]:
# Resetear el índice para separar "Period"
master_df = master_df.reset_index()

In [20]:
# Separar "Q1 1990" en "Quarter" y "Year"
master_df[["Quarter", "Year"]] = master_df["Period"].str.extract(r"Q(\d)\s+(\d{4})").astype(int)

In [21]:
# Reordenar las columnas: primero Quarter y Year
cols = ["Quarter", "Year"] + [col for col in master_df.columns if col not in ["Period", "Quarter", "Year"]]
master_df = master_df[cols]

In [22]:
# Ordenar por Year y Quarter
master_df = master_df.sort_values(by=["Year", "Quarter"]).reset_index(drop=True)

In [23]:
master_df.head()

Unnamed: 0,Quarter,Year,Australia,Austria,Belgium,Brazil,Bulgaria,Canada,Chile,China,...,South-Africa,South-Korea,Spain,Sweden,Switzerland,Thailand,Turkey,USA,United-Kingdom,Zambia
0,1,1960,3.93,,,,,,,,...,,49.06,,,,,,135.83,6.43,
1,2,1960,4.18,,,,,,,,...,,67.98,,,,,,135.68,6.48,
2,3,1960,4.34,,,,,,,,...,,61.0,,,,,,136.5,6.58,
3,4,1960,4.87,,,,,,,,...,,71.79,,,,,,135.28,6.71,
4,1,1961,4.23,,,,,9.51,,,...,,61.95,,,,,,136.48,6.88,


In [24]:
master_df.tail()

Unnamed: 0,Quarter,Year,Australia,Austria,Belgium,Brazil,Bulgaria,Canada,Chile,China,...,South-Africa,South-Korea,Spain,Sweden,Switzerland,Thailand,Turkey,USA,United-Kingdom,Zambia
257,2,2024,678.35,120.93,154.65,2921.23,48.14,753.92,75091.54,32883.76,...,1827.69,636205.4,400.31,1618.85,206.5,4520.55,9920.84,7254.18,699.17,81953.81
258,3,2024,676.79,119.0,147.88,2989.91,53.77,800.82,75344.2,34175.8,...,1859.09,643531.6,393.31,1548.65,206.6,4615.84,11915.59,7343.73,716.05,80770.75
259,4,2024,715.14,126.77,163.78,3080.37,57.59,798.84,84662.46,37372.62,...,1928.18,668933.8,420.44,1698.81,210.09,4823.23,12704.05,7430.98,735.14,89086.69
260,1,2025,675.42,119.38,152.24,3019.58,45.62,757.58,82042.34,31875.8,...,1798.8,622771.1,397.66,1552.81,207.83,4744.03,12125.17,7490.5,744.73,92478.27
261,2,2025,,,,,,,,,...,,,423.04,,,,,7582.78,,


In [25]:
master_df.shape

(262, 60)

In [26]:
master_df.dtypes

Quarter             int64
Year                int64
Australia         float64
Austria           float64
Belgium           float64
Brazil            float64
Bulgaria          float64
Canada            float64
Chile             float64
China             float64
Colombia          float64
Croatia           float64
Cyprus            float64
Czech-Republic    float64
Denmark           float64
Estonia           float64
Finland           float64
France            float64
Germany           float64
Greece            float64
Hong-Kong         float64
Hungary           float64
Iceland           float64
India             float64
Indonesia         float64
Ireland           float64
Israel            float64
Italy             float64
Japan             float64
Latvia            float64
Lithuania         float64
Luxembourg        float64
Macedonia         float64
Malaysia          float64
Malta             float64
Mexico            float64
Morocco           float64
Netherlands       float64
New-Zealand 

In [27]:
master_df.to_parquet("gdp_values.parquet", index=False)