In [1]:
pip install pandas openmeteo-requests requests-cache retry-requests geopy

Collecting openmeteo-requests
  Downloading openmeteo_requests-1.7.5-py3-none-any.whl.metadata (11 kB)
Collecting requests-cache
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting retry-requests
  Downloading retry_requests-2.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting niquests>=3.15.2 (from openmeteo-requests)
  Downloading niquests-3.17.0-py3-none-any.whl.metadata (17 kB)
Collecting openmeteo-sdk>=1.22.0 (from openmeteo-requests)
  Downloading openmeteo_sdk-1.25.0-py3-none-any.whl.metadata (935 bytes)
Collecting cattrs>=22.2 (from requests-cache)
  Downloading cattrs-25.3.0-py3-none-any.whl.metadata (8.4 kB)
Collecting url-normalize>=1.4 (from requests-cache)
  Downloading url_normalize-2.2.1-py3-none-any.whl.metadata (5.6 kB)
Collecting attrs>=21.2 (from requests-cache)
  Using cached attrs-25.4.0-py3-none-any.whl.metadata (10 kB)
Collecting typing-extensions>=4.14.0 (from cattrs>=22.2->requests-cache)
  Using cached typing_extensions-4.15.0-py3-n

In [2]:
# =====================================================
# IMPORTS
# =====================================================
import pandas as pd
import requests_cache
import openmeteo_requests

from retry_requests import retry
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

import numpy as np
import time

In [11]:



# =====================================================
# CONSTANTS
# =====================================================
DATA_PATH = "C:\\Users\\ePadd\\OneDrive - University of Ghana\\Documents\\DATA SCIENCE\\DATA BLOOM\\ghana-maize-yield-prediction\\data\\raw\\maize_yield.csv"
RAW_WEATHER_PATH = "C:\\Users\\ePadd\\OneDrive - University of Ghana\\Documents\\DATA SCIENCE\\DATA BLOOM\\ghana-maize-yield-prediction\\data\\raw\\ghana_weather_raw.csv"
FINAL_OUTPUT_PATH = "C:\\Users\\ePadd\\OneDrive - University of Ghana\\Documents\\DATA SCIENCE\\DATA BLOOM\\ghana-maize-yield-prediction\\data\\processed\\maize_yield_with_weather.csv"

SEASON_MONTHS = [4, 5, 6, 7]  # Major season (Apr–Jul)
COUNTRY = "Ghana"


# =====================================================
# 1. LOAD & CLEAN DATA
# =====================================================
def clean_district_name(name: str) -> str:
    """Standardize district names for geocoding."""
    name = name.strip()

    if "(" in name:
        name = name.split("(")[0].strip()

    suffixes = ["Municipal", "Muni", "Metropolitan", "Metro", "District"]
    for suffix in suffixes:
        if name.lower().endswith(suffix.lower()):
            name = name[:-len(suffix)].strip()

    return name


def load_and_clean_data(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    df["Cleaned_District"] = df["DISTRICT"].apply(clean_district_name)
    return df




In [4]:
# =====================================================
# 2. GEOCODING
# =====================================================
def geocode_districts(districts: list[str]) -> dict:
    geolocator = Nominatim(user_agent="ghana_maize_project_student")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.5)

    coords = {}

    for district in districts:
        try:
            location = geocode(f"{district}, {COUNTRY}")
            if location:
                coords[district] = {
                    "lat": location.latitude,
                    "lon": location.longitude
                }
            else:
                coords[district] = {"lat": None, "lon": None}

        except Exception:
            coords[district] = {"lat": None, "lon": None}

    return coords




In [5]:
# =====================================================
# 3. FETCH WEATHER DATA
# =====================================================
def setup_weather_client():
    cache_session = requests_cache.CachedSession(".cache", expire_after=-1)
    retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
    return openmeteo_requests.Client(session=retry_session)


def fetch_weather_data(coords: dict, start_year: int, end_year: int) -> pd.DataFrame:
    client = setup_weather_client()
    weather_frames = []

    for district, c in coords.items():
        if c["lat"] is None:
            continue

        params = {
            "latitude": c["lat"],
            "longitude": c["lon"],
            "start_date": f"{start_year}-01-01",
            "end_date": f"{end_year}-12-31",
            "daily": [
                "temperature_2m_mean",
                "rain_sum",
                "shortwave_radiation_sum",
                "soil_moisture_0_to_7cm_mean"
            ],
            "timezone": "GMT"
        }

        try:
            response = client.weather_api(
                "https://archive-api.open-meteo.com/v1/archive",
                params=params
            )[0]

            daily = response.Daily()
            dates = pd.date_range(
                start=pd.to_datetime(daily.Time(), unit="s", utc=True),
                end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
                freq=pd.Timedelta(seconds=daily.Interval()),
                inclusive="left"
            )

            weather_frames.append(pd.DataFrame({
                "Date": dates,
                "Cleaned_District": district,
                "Temperature": daily.Variables(0).ValuesAsNumpy(),
                "Rainfall": daily.Variables(1).ValuesAsNumpy(),
                "Sunlight": daily.Variables(2).ValuesAsNumpy(),
                "Soil_Moisture": daily.Variables(3).ValuesAsNumpy()
            }))

        except Exception:
            continue

    return pd.concat(weather_frames, ignore_index=True) if weather_frames else pd.DataFrame()




In [6]:
# =====================================================
# 4. FEATURE ENGINEERING & MERGE
# =====================================================
def aggregate_seasonal_weather(weather_df: pd.DataFrame) -> pd.DataFrame:
    weather_df["Year"] = weather_df["Date"].dt.year
    weather_df["Month"] = weather_df["Date"].dt.month

    season_df = weather_df[weather_df["Month"].isin(SEASON_MONTHS)].copy()
    season_df["Water_Avail_Index"] = (
        season_df["Rainfall"] * season_df["Soil_Moisture"]
    )

    return (
        season_df
        .groupby(["Cleaned_District", "Year"])
        .agg({
            "Temperature": "mean",
            "Rainfall": "sum",
            "Sunlight": "sum",
            "Soil_Moisture": "mean",
            "Water_Avail_Index": "mean"
        })
        .reset_index()
    )




In [12]:
# =====================================================
# MAIN PIPELINE
# =====================================================
def main():
    df = load_and_clean_data(DATA_PATH)

    districts = df["Cleaned_District"].unique()
    coords = geocode_districts(districts)

    start_year, end_year = df["YEAR"].min(), df["YEAR"].max()
    weather_df = fetch_weather_data(coords, start_year, end_year)

    if weather_df.empty:
        print("No weather data retrieved.")
        return

    weather_df.to_csv(RAW_WEATHER_PATH, index=False)

    weather_summary = aggregate_seasonal_weather(weather_df)

    final_df = df.merge(
        weather_summary,
        left_on=["Cleaned_District", "YEAR"],
        right_on=["Cleaned_District", "Year"],
        how="left"
    )

    final_df.to_csv(FINAL_OUTPUT_PATH, index=False)
    print("Pipeline completed successfully.")


if __name__ == "__main__":
    main()


Pipeline completed successfully.


In [15]:


# ==========================================
# 1. ADVANCED CLEANING CONFIGURATION
# ==========================================

# MANUAL OVERRIDES: These are the ones that confuse the Geocoder.
# I have mapped your dataset's weird names to real Ghana cities.
manual_fixes = {
    "k. m. a.": "Kumasi",
    "a.m.a": "Accra",
    "s.t.m.a": "Sekondi-Takoradi",
    "sekondi-takoradi municipal (stma)": "Sekondi-Takoradi",
    "bolga": "Bolgatanga",
    "bolga municipal": "Bolgatanga",
    "bawku municipal": "Bawku",
    "kassena nankana muni.": "Navrongo", # Capital of Kassena Nankana
    "kassena nankana east": "Navrongo",
    "kassena nankana west": "Paga",
    "komenda-edina-eguafo-abirem": "Elmina", # Capital
    "suhum kraboa coaltar": "Suhum",
    "twifo-herman/l. denkyira": "Twifo Hemang",
    "fanteakwa": "Begoro", # Capital
    "asikuma-odoben-brakwa": "Breman Asikuma",
    "ajumako-enyan-esiam": "Ajumako",
    "ajumako-essiam-enyana": "Ajumako",
    "abura-asebu-kwamankese": "Abura Dunkwa",
    "ga east": "Abokobi",
    "ga west": "Amasaman",
    "ga south": "Weija",
    "ledzokuku-krowor": "Teshie",
    "dangme east": "Ada Foah",
    "dangme west": "Dodowa",
    "akuapim north": "Akropong",
    "akuapim south": "Aburi",
    "yilo krobo": "Somanya",
    "manya krobo": "Odumase Krobo",
    "savelugu/nanton": "Savelugu",
    "tolon/kumbungu": "Tolon",
    "zabzugu/tatale": "Zabzugu",
    "bunkpurugu/yunyoo": "Bunkpurugu",
    "sawla/tuna/kalba": "Sawla",
    "central gonja": "Buipe",
    "west gonja": "Damongo",
    "east gonja": "Salaga",
    "north tongu": "Battor Dugame",
    "south tongu": "Sogakope",
    "biakoye": "Nkonya",
    "jasikan": "Jasikan",
    "kadjebi": "Kadjebi",
    "krachi east": "Dambai",
    "krachi west": "Kete Krachi",
    "nkwanta": "Nkwanta",
    "bosumtwe-atwima-kwanwoma": "Kuntanase",
    "ahafo ano north": "Tepa",
    "ahafo ano south": "Mankranso",
    "atwima mponua": "Nyinahin",
    "atwima nwabiagya": "Nkawie",
    "ejusu juaben": "Ejisu",
    "offinso": "Offinso",
    "adansi north": "Fomena",
    "adansi south": "New Edubiase",
    "afigya sekyere": "Agona Ashanti",
    "sekyere east": "Effiduase",
    "sekyere west": "Mampong",
    "asante akim north": "Konongo",
    "asante akim south": "Juaso"
}

def robust_clean_district(name):
    # 0. Basic cleaning
    name = str(name).strip().lower()
    
    # 1. Check Manual Dictionary first (Fastest fix)
    if name in manual_fixes:
        return manual_fixes[name]
    
    # 2. Handle Slashes (Take the first town: "Tolon/Kumbungu" -> "Tolon")
    if '/' in name:
        name = name.split('/')[0].strip()
        
    # 3. Handle Parentheses (Remove them: "Bekwai (Amansie)" -> "Bekwai")
    if '(' in name:
        name = name.split('(')[0].strip()
        
    # 4. Remove Administrative Junk Words
    junk_words = ['municipal', 'muni', 'muni.', 'metropolitan', 'metro', 'district']
    for word in junk_words:
        if name.endswith(word):
            name = name.replace(word, '').strip()
            
    return name.title() # Return "Tolon" instead of "tolon"

# ==========================================
# 2. MAIN PROCESS
# ==========================================
print("Loading dataset...")
df = pd.read_csv(DATA_PATH)

print("Applying 'Super Cleaner' logic...")
df['Cleaned_District'] = df['DISTRICT'].apply(robust_clean_district)

unique_districts = df['Cleaned_District'].unique()
print(f"Cleaned down to {len(unique_districts)} unique locations.")

# ==========================================
# 3. GEOCODING WITH FALLBACK
# ==========================================
print("\nStarting Geocoding...")
geolocator = Nominatim(user_agent="ghana_maize_thesis_fixer")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1.5)

district_coords = {}
failed_districts = []

for district in unique_districts:
    # Append ", Ghana" to help the geocoder context
    query = f"{district}, Ghana"
    
    try:
        location = geocode(query)
        if location:
            district_coords[district] = {'lat': location.latitude, 'lon': location.longitude}
            print(f"  ✓ Found: {district} ({location.latitude:.3f}, {location.longitude:.3f})")
        else:
            print(f"  ✗ FAILED: {district} (Will be skipped)")
            failed_districts.append(district)
            district_coords[district] = {'lat': None, 'lon': None}
            
    except Exception as e:
        print(f"  ! Error: {district} - {e}")
        district_coords[district] = {'lat': None, 'lon': None}

if failed_districts:
    print(f"\nWARNING: {len(failed_districts)} districts still failed geocoding:")
    print(failed_districts)
    print("You may need to add these to the 'manual_fixes' dictionary at the top of the script.")

# ==========================================
# 4. WEATHER FETCH (Same as before)
# ==========================================
print("\nFetching Weather Data (Open-Meteo)...")
# [This part is identical to the previous script, using district_coords]
# ... (Use the previous weather fetching code here)

Loading dataset...
Applying 'Super Cleaner' logic...
Cleaned down to 197 unique locations.

Starting Geocoding...
  ✓ Found: Bekwai (6.457, -1.584)
  ✓ Found: Amansie Central (6.207, -1.828)
  ✓ Found: Amansie West (6.440, -1.852)
  ✓ Found: Ejura Sekyedumase (7.746, -0.987)
  ✓ Found: Mampong (7.056, -1.404)
  ✓ Found: Effiduase (6.851, -1.395)
  ✓ Found: Agona Ashanti (6.934, -1.489)
  ✓ Found: Tepa (7.005, -2.164)
  ✓ Found: Mankranso (6.818, -1.863)
  ✓ Found: Nyinahin (6.599, -2.117)
  ✓ Found: Nkawie (6.668, -1.810)
  ✓ Found: Ejisu (6.714, -1.467)
  ✓ Found: Kuntanase (6.541, -1.477)
  ✓ Found: Kwabre (6.790, -1.538)
  ✓ Found: Offinso (6.931, -1.667)
  ✓ Found: Fomena (6.281, -1.512)
  ✓ Found: Obuasi (6.212, -1.689)
  ✓ Found: Adansi  South (6.064, -1.396)
  ✓ Found: Konongo (6.623, -1.215)
  ✓ Found: Juaso (6.583, -1.119)
  ✓ Found: Kumasi (6.699, -1.623)
  ✓ Found: Bosome Freho (6.368, -1.366)
  ✓ Found: Sekyere Central (6.991, -1.367)
  ✓ Found: Sekyere Afram Plains (7.074,