In [7]:
import requests
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
from datetime import datetime, timedelta
import time

# Load clean match dataset
df = pd.read_csv('../data-processed/matches_clean.csv')

# Repair specific city names that are stored mis-encoded in the CSV (the
# patterns look like UTF-8 text that was decoded as Windows-1252).
# The broken spellings are written with escapes because one of them contains
# a soft hyphen (U+00AD) that is invisible in most editors and easily lost
# when the notebook is copied or exported; both variants are accepted.
city_name_fixes = {
    '\u00c5\u0152ita': 'Ōita',            # "Å" + "Œ" + "ita"
    'Bras\u00c3\u00adlia': 'Brasília',    # "Ã" + soft hyphen
    'Bras\u00c3lia': 'Brasília',          # same, soft hyphen already stripped
    'Cuiab\u00c3\u00a1': 'Cuiabá',        # "Ã" + "¡"
    'S\u00c3\u00a3o Paulo': 'São Paulo',  # "Ã" + "£"
}
# A mapping passed to Series.replace swaps whole cell values only; names that
# are not listed are left untouched.
df['city_name'] = df['city_name'].replace(city_name_fixes)

In [9]:
# ------------------------------------------------------
# 2. Automated Geocoding
# ------------------------------------------------------
# Identify all the cities and fetch their coordinates and time zones.
# Result: city_coords maps city name -> {'lat', 'lon', 'timezone'}.
unique_locations = df[['city_name', 'country_name']].drop_duplicates()
city_coords = {}

# Force the correct data for cities the API misses or gets wrong
manual_overrides = {
    'Lusail': {'lat': 25.4180, 'lon': 51.4902, 'timezone': 'Asia/Qatar'},
    'Miyagi': {'lat': 38.3291, 'lon': 140.9825, 'timezone': 'Asia/Tokyo'},
}

print("Fetching coordinates for cities...")

# Setup for geocoding
geocoding_url = "https://geocoding-api.open-meteo.com/v1/search"
# Seconds to wait for the server; without a timeout a stalled connection
# would hang the cell indefinitely.
geocoding_timeout_s = 10


def _pick_geocoding_match(results, country):
    """Choose the geocoding hit to use for one city.

    Parameters
    ----------
    results : list of dict
        Non-empty list of candidates returned by the geocoding API.
    country : object
        Country name from the match data for this city.

    Returns
    -------
    dict
        The first candidate whose 'country' field equals ``country``
        (case-insensitive, surrounding whitespace ignored). If none matches,
        for example because the dataset spells the country differently, the
        first candidate is returned.
    """
    wanted = str(country).strip().casefold()
    for candidate in results:
        if str(candidate.get('country', '')).strip().casefold() == wanted:
            return candidate
    return results[0]


for index, row in unique_locations.iterrows():
    city = row['city_name']

    # 1. Check if manual override first
    if city in manual_overrides:
        city_coords[city] = manual_overrides[city]
        print(f"Found {city} (Manual): {city_coords[city]}")
        continue

    # 2. If not, ask API (once per city name)
    if city in city_coords:
        continue

    try:
        params = {"name": city, "count": 5, "language": "en", "format": "json"}
        response = requests.get(geocoding_url, params=params, timeout=geocoding_timeout_s)
        # Surface HTTP errors instead of treating an error payload as "no results"
        response.raise_for_status()
        results = response.json().get("results", [])

        if results:
            # NOTE(review): the country check only guards against same-named
            # places abroad. A name can still resolve to the wrong place inside
            # the right country, so spot-check the printed coordinates and
            # extend manual_overrides where needed.
            best_match = _pick_geocoding_match(results, row['country_name'])
            city_coords[city] = {
                'lat': best_match['latitude'],
                'lon': best_match['longitude'],
                'timezone': best_match['timezone']
            }
            print(f"Found {city}: {city_coords[city]}")
        else:
            print(f"Could not find coordinates for {city}")

    except Exception as e:
        # Best effort: report the failure and carry on with the next city
        print(f"Error Geocoding {city}: {e}")

    # Small pause between API calls
    time.sleep(0.2)

Fetching coordinates for cities...
Found Seoul: {'lat': 37.566, 'lon': 126.9784, 'timezone': 'Asia/Seoul'}
Found Niigata: {'lat': 37.92259, 'lon': 139.04124, 'timezone': 'Asia/Tokyo'}
Found Ulsan: {'lat': 35.53722, 'lon': 129.31667, 'timezone': 'Asia/Seoul'}
Found Sapporo: {'lat': 43.06667, 'lon': 141.35, 'timezone': 'Asia/Tokyo'}
Found Ibaraki: {'lat': 34.81641, 'lon': 135.56828, 'timezone': 'Asia/Tokyo'}
Found Busan: {'lat': 35.10168, 'lon': 129.03004, 'timezone': 'Asia/Seoul'}
Found Saitama: {'lat': 35.90807, 'lon': 139.65657, 'timezone': 'Asia/Tokyo'}
Found Gwangju: {'lat': 35.15472, 'lon': 126.91556, 'timezone': 'Asia/Seoul'}
Found Kobe: {'lat': 34.6913, 'lon': 135.183, 'timezone': 'Asia/Tokyo'}
Found Suwon: {'lat': 37.29111, 'lon': 127.00889, 'timezone': 'Asia/Seoul'}
Found Daegu: {'lat': 35.87028, 'lon': 128.59111, 'timezone': 'Asia/Seoul'}
Found Jeonju: {'lat': 35.82194, 'lon': 127.14889, 'timezone': 'Asia/Seoul'}
Found Seogwipo: {'lat': 33.25333, 'lon': 126.56181, 'timezone': 

In [10]:
# ------------------------------------------------------
# 3. Setup Open-Meteo API
# ------------------------------------------------------
# Endpoint queried for every match
url = "https://archive-api.open-meteo.com/v1/archive"

# HTTP session backed by the on-disk '.cache' store; expire_after=-1 is
# passed so stored responses are kept rather than timed out
cache_session = requests_cache.CachedSession(
    '.cache',
    expire_after=-1,
)

# Same session wrapped with retry behaviour (5 retries, 0.2 backoff factor)
retry_session = retry(
    cache_session,
    retries=5,
    backoff_factor=0.2,
)

# Client used by the fetch loop
openmeteo = openmeteo_requests.Client(session=retry_session)

In [12]:
# ------------------------------------------------------
# 4. Lists to store new data
# ------------------------------------------------------
# One list per new column: <measure><offset>, where the offset 0/1/2 is the
# number of hours after the kickoff hour. Each key gets its own empty list.
weather_data = {
    f"{measure}{offset}": []
    for measure in ('temp', 'atemp', 'humid', 'wind')
    for offset in range(3)
}

In [13]:
# ------------------------------------------------------
# 5. Iterate through matches and fetch data
# ------------------------------------------------------
# For every match, request hourly weather for the match day plus the next day
# and keep the readings at the kickoff hour, +1 h and +2 h. Exactly one value
# (or None) is appended to every list in weather_data per match, so the lists
# stay aligned with the rows of df.
print(f"Processing {len(df)} matches...")

# Start from empty lists so that re-running this cell cannot stack a second
# set of rows on top of an earlier run.
for stored_values in weather_data.values():
    stored_values.clear()

# Requested hourly variables and the column prefix each one feeds.
# The API returns variables in the order they are requested.
hourly_variables = ["temperature_2m", "relative_humidity_2m", "apparent_temperature", "wind_speed_10m"]
variable_prefixes = ['temp', 'humid', 'atemp', 'wind']

for index, row in df.iterrows():
    city = row['city_name']
    match_date = row['match_date'] # Format YYYY-MM-DD
    match_time = row['match_time'] # Format HH:MM

    # Progress indicator
    if index % 10 == 0:
        print(f"Processing row {index}/{len(df)}: {city} on {match_date}")

    # Default for this match: None in every column. It is only replaced once
    # ALL twelve readings have been extracted, so a failure halfway through
    # can never leave the lists with different lengths.
    row_values = dict.fromkeys(weather_data)

    if city not in city_coords:
        print(f"Warning: Coordinates for {city} not found.")
    else:
        loc = city_coords[city]

        try:
            # Request the match day and the next day to handle matches near
            # midnight. Parsing happens inside the try so that one malformed
            # date is reported and skipped instead of aborting the whole run.
            date_obj = datetime.strptime(match_date, '%Y-%m-%d')
            end_date_obj = date_obj + timedelta(days=1)

            params = {
                "latitude": loc['lat'],
                "longitude": loc['lon'],
                "start_date": match_date,
                "end_date": end_date_obj.strftime('%Y-%m-%d'),
                "hourly": hourly_variables,
                "timezone": loc['timezone']
            }

            responses = openmeteo.weather_api(url, params=params)
            hourly = responses[0].Hourly()

            # API returns data starting at 00:00 local time of start_date.
            # Thus, index 0 = 00:00, index 10 = 10:00. Kickoff minutes are
            # dropped: a 20:30 match uses the 20:00 reading as hour 0.
            match_hour = int(match_time.split(':')[0])

            fetched = {}
            for position, prefix in enumerate(variable_prefixes):
                series = hourly.Variables(position).ValuesAsNumpy()
                # Want hour, hour+1, hour+2
                for offset in range(3):
                    fetched[f"{prefix}{offset}"] = series[match_hour + offset]

            row_values.update(fetched)

            # Small pause after a successful request
            time.sleep(0.2)

        except Exception as e:
            # Best effort: report the problem and keep the None placeholders
            print(f"Error fetching data for {city} on {match_date}: {e}")

    for key in weather_data:
        weather_data[key].append(row_values[key])

Processing 384 matches...
Processing row 0/384: Seoul on 2002-05-31
Processing row 10/384: Ulsan on 2002-06-03
Processing row 20/384: Kobe on 2002-06-07
Processing row 30/384: Ōita on 2002-06-10
Processing row 40/384: Suwon on 2002-06-13
Processing row 50/384: Ōita on 2002-06-16
Processing row 60/384: Seoul on 2002-06-25
Processing row 70/384: Nuremberg on 2006-06-11
Processing row 80/384: Leipzig on 2006-06-14
Processing row 90/384: Munich on 2006-06-18
Processing row 100/384: Frankfurt on 2006-06-21
Processing row 110/384: Kaiserslautern on 2006-06-23
Processing row 120/384: Hamburg on 2006-06-30
Processing row 130/384: Port Elizabeth on 2010-06-12
Processing row 140/384: Rustenburg on 2010-06-15
Processing row 150/384: Port Elizabeth on 2010-06-18
Processing row 160/384: Bloemfontein on 2010-06-22
Processing row 170/384: Polokwane on 2010-06-24
Processing row 180/384: Durban on 2010-06-28
Processing row 190/384: Port Elizabeth on 2010-07-10
Processing row 200/384: Brasília on 2014-0

In [14]:
# ------------------------------------------------------
# 6. Add columns to df and save
# ------------------------------------------------------
output_filename = '../data-processed/matches_clean_weather.csv'

# Attach each collected list to df as a column of the same name
for column_name in weather_data:
    df[column_name] = weather_data[column_name]

# Write the enriched table without the index column
df.to_csv(output_filename, index=False)
print(f"Done! Updated file saved as {output_filename}")

# Short preview of the first rows as a sanity check
preview_columns = ['city_name', 'match_date', 'match_time', 'temp0', 'humid0']
print(df[preview_columns].head())

Done! Updated file saved as ../data-processed/matches_clean_weather.csv
  city_name  match_date match_time      temp0     humid0
0     Seoul  2002-05-31      20:30  17.914000  90.946243
1   Niigata  2002-06-01      15:30  22.896000  65.320168
2     Ulsan  2002-06-01      18:00  22.579498  76.212860
3   Sapporo  2002-06-01      20:30  14.346000  87.780296
4   Ibaraki  2002-06-02      14:30  26.892500  41.430252
