<a href="https://colab.research.google.com/github/ccspen21/greenland-fishery-nowcast-2025/blob/main/periodic_update.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests pandas pyjstat
import os
import sqlite3
import pandas as pd
import requests
from pyjstat import pyjstat
from urllib.parse import quote
from io import StringIO
from IPython.display import display
from google.colab import drive
import time
from datetime import datetime

# Mount Google Drive at /content/drive (Colab-only: `drive` is imported from
# google.colab above).
drive.mount('/content/drive')

# Define database path: the SQLite file lives under the mounted Drive and is
# opened with sqlite3.connect(DB_PATH) when the connection is created.
DB_PATH = '/content/drive/MyDrive/greenland_fishery.db'

# validate DataFrame
def validate_dataframe(df, expected_columns, dtypes):
    """Check *df* against a simple schema and cast its columns in place.

    Args:
        df: DataFrame to check; its columns are overwritten with the cast values.
        expected_columns: Column names that must all be present in *df*.
        dtypes: Mapping of column name to target dtype. Columns mapped to
            ``int`` are parsed numerically and unparseable or missing values
            become 0; every other column is cast with ``astype``.

    Raises:
        ValueError: If *df* has no rows, lacks an expected column, or still
            contains NaN after casting.
    """
    if df.empty:
        raise ValueError("DataFrame is empty, no rows found.")

    absent = [name for name in expected_columns if name not in df.columns]
    if absent:
        raise ValueError(f"DataFrame missing expected columns: {expected_columns}")

    for name, target in dtypes.items():
        if name not in df.columns:
            continue
        if target == int:
            # Coerce first so text such as "n/a" turns into NaN, then 0.
            as_numbers = pd.to_numeric(df[name], errors='coerce')
            df[name] = as_numbers.fillna(0).astype(int)
        else:
            df[name] = df[name].astype(target)

    has_nan = df.isnull().any().any()
    if has_nan:
        raise ValueError(f"DataFrame contains NaN values: {df.head()}")



In [12]:
# Connect to SQLite database.
# `conn` and `cursor` are module-level globals: the helper functions defined
# below (execute_sql_script, check_existing_data, get_latest_db_data) read them
# directly instead of taking a connection argument, so this must run first.
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
print(f"Connected to SQLite database at {DB_PATH}")

# execute SQL scripts
def execute_sql_script(file_path):
    """Run every statement of the SQL file at *file_path* and commit.

    Uses the module-level ``cursor`` and ``conn``. Any failure (unreadable
    file, SQL error) is reported on stdout and then re-raised to the caller.
    """
    try:
        with open(file_path, 'r') as script_file:
            statements = script_file.read()
        cursor.executescript(statements)
        conn.commit()
        print(f"Successfully executed SQL script: {file_path}")
    except Exception as exc:
        print(f"Error executing SQL script {file_path}: {exc}")
        raise

# check for existing data
def check_existing_data(table_name, year, quarter):
    """Tell whether *table_name* already holds a row for the given period.

    Uses the module-level ``cursor``. Returns False (after printing a notice)
    when the table does not exist, otherwise True if at least one row matches
    both ``Year`` and ``Quarter``.
    """
    cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
        (table_name,),
    )
    table_entry = cursor.fetchone()
    if not table_entry:
        print(f"Table {table_name} does not exist.")
        return False

    # Table names cannot be bound as parameters, hence the f-string.
    count_sql = f"SELECT COUNT(*) FROM {table_name} WHERE Year = ? AND Quarter = ?"
    cursor.execute(count_sql, (year, quarter))
    matching_rows = cursor.fetchone()[0]
    return matching_rows > 0

# Function to get latest data point from table
def get_latest_db_data(table_name):
    """Return the most recent ``(Year, Quarter)`` stored in *table_name*.

    Uses the module-level ``cursor``. Falls back to ``(2010, "Q4")`` when the
    table does not exist or holds no usable row.

    The row is selected with an explicit ordering. Combining ``MAX(Year)``
    with a bare ``Quarter`` column lets SQLite take the quarter from any row
    of the latest year, so the result was not guaranteed to be the latest
    quarter. Quarter labels "Q1".."Q4" sort correctly as text.
    """
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?", (table_name,))
    if not cursor.fetchone():
        print(f"Table {table_name} does not exist, assuming no data.")
        return 2010, "Q4"
    # Table names cannot be bound as parameters, hence the f-string.
    query = f"SELECT Year, Quarter FROM {table_name} ORDER BY Year DESC, Quarter DESC LIMIT 1"
    cursor.execute(query)
    result = cursor.fetchone()
    # fetchone() is None for an empty table; a NULL year is treated the same way.
    return result if result and result[0] else (2010, "Q4")

Connected to SQLite database at greenland_fishery.db


In [13]:
# Helper function to validate DataFrame against schema
def validate_dataframe(df, expected_columns, dtypes):
    """Check *df* against a simple schema and cast its columns in place.

    This definition replaces the earlier ``validate_dataframe`` once the cell
    runs, so it performs the same checks as that one: an empty frame is
    rejected, and ``int`` columns are parsed numerically instead of being
    cast directly (a direct cast raises on NaN or non-numeric text).

    Args:
        df: DataFrame to check; its columns are overwritten with the cast values.
        expected_columns: Column names that must all be present in *df*.
        dtypes: Mapping of column name to target dtype. Columns mapped to
            ``int`` are parsed with ``pd.to_numeric`` and unparseable or
            missing values become 0; other columns are cast with ``astype``.

    Raises:
        ValueError: If *df* has no rows, lacks an expected column, or still
            contains NaN after casting.
    """
    if df.empty:
        raise ValueError("DataFrame is empty, no rows found.")
    if not all(col in df.columns for col in expected_columns):
        raise ValueError(f"DataFrame missing expected columns: {expected_columns}")
    for col, dtype in dtypes.items():
        if col in df.columns:
            if dtype == int:
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
            else:
                df[col] = df[col].astype(dtype)
    if df.isnull().any().any():
        raise ValueError(f"DataFrame contains NaN values: {df.head()}")

In [17]:
# CELL 3: UPDATE TOTAL CATCH
# NOTE: this cell calls get_latest_quarter_from_api() and fetch_with_retries(),
# which are defined in a helper cell further down this notebook; that cell has
# to be run before this one.

# Define quarter order
quarter_order = ["Q1", "Q2", "Q3", "Q4"]

# Get latest data point from database
latest_year, latest_quarter = get_latest_db_data('total_catch')
print(f"Latest database data point: {latest_year} {latest_quarter}")

# Check API for latest available data
url = "https://bank.stat.gl:443/api/v1/en/Greenland/FI/FI10/FIX008.px"
api_year, api_quarter = get_latest_quarter_from_api(url, nation="GRL")
print(f"Latest API data point: {api_year} {api_quarter}")

# Determine if update is needed. The target period defaults to the newest one
# already stored, so the display step at the bottom always has a defined
# period to query, even when nothing is updated.
update_needed = False
next_year, next_quarter = latest_year, latest_quarter
if api_year and api_quarter:
    db_idx = latest_year * 4 + quarter_order.index(latest_quarter)
    api_idx = int(api_year) * 4 + quarter_order.index(api_quarter)
    if api_idx > db_idx:
        update_needed = True
        # int(): the API helper derives the year from a pandas column, and
        # sqlite3 cannot bind numpy integers as query parameters.
        next_year, next_quarter = int(api_year), api_quarter
    else:
        print("No new data available for total_catch.")
else:
    print("Could not fetch latest quarter from API, skipping total_catch update.")

if update_needed and not check_existing_data('total_catch', next_year, next_quarter):
    # Fetch new data
    query = {
        "query": [
            {"code": "nation", "selection": {"filter": "item", "values": ["GRL"]}},
            {"code": "unit", "selection": {"filter": "item", "values": ["Ton"]}},
            {"code": "time", "selection": {"filter": "item", "values": [str(next_year)]}},
            {"code": "quarter", "selection": {"filter": "item", "values": [str(quarter_order.index(next_quarter) + 1)]}}
        ],
        "response": {"format": "json-stat2"}
    }
    try:
        response = fetch_with_retries(url, max_retries=3, timeout=60, method='post', json=query)
        dataset = pyjstat.Dataset.read(response.text)
        df = dataset.write('dataframe')
        print("Data successfully retrieved and converted to DataFrame!")

        # Clean DataFrame
        df_new = df.drop(columns=['nation']).rename(columns={
            "time": "Year",
            "quarter": "Quarter",
            "unit": "Unit",
            "value": "Total_Catch"
        })
        df_new["Quarter"] = df_new["Quarter"].str.replace("Quarter ", "Q")
        df_new["Quarter"] = pd.Categorical(df_new["Quarter"], categories=quarter_order, ordered=True)
        # .copy() so the casts below act on an independent frame, not a slice
        df_new = df_new[["Year", "Quarter", "Unit", "Total_Catch"]].copy()
        df_new["Year"] = df_new["Year"].astype(int)

        # Validate
        expected_columns = ["Year", "Quarter", "Unit", "Total_Catch"]
        dtypes = {"Year": int, "Quarter": str, "Unit": str, "Total_Catch": int}
        validate_dataframe(df_new, expected_columns, dtypes)

        # Plain Python values: sqlite3 cannot bind numpy scalars
        rows = [
            (int(row["Year"]), str(row["Quarter"]), str(row["Unit"]), int(row["Total_Catch"]))
            for _, row in df_new.iterrows()
        ]

        # Insert only the new rows, with bound parameters. Executing the whole
        # dml_populate.sql again would replay every INSERT already in it.
        cursor.executemany(
            "INSERT INTO total_catch (Year, Quarter, Unit, Total_Catch) VALUES (?, ?, ?, ?)",
            rows,
        )
        conn.commit()

        # Record the same inserts in the DML script, one terminated statement
        # per row so the script stays valid when several rows are written.
        dml_statements = [
            f"INSERT INTO total_catch (Year, Quarter, Unit, Total_Catch) VALUES "
            f"({year}, '{quarter}', '{unit}', {total});"
            for year, quarter, unit, total in rows
        ]
        with open('/content/drive/MyDrive/dml_populate.sql', 'a') as f:
            f.write("\n-- Update for total_catch\n")
            f.write("\n".join(dml_statements) + "\n")
        print("Updated total_catch table with new data")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching new Total Catch data: {e}")
    except (ValueError, sqlite3.Error) as e:
        # Validation failure or rejected insert: leave the table untouched
        conn.rollback()
        print(f"Could not update total_catch: {e}")

# Display updated data (only when the table exists and holds the period)
print("Updated Total Catch DataFrame:")
if check_existing_data('total_catch', next_year, next_quarter):
    df_updated = pd.read_sql_query("SELECT * FROM total_catch WHERE Year = ? AND Quarter = ?", conn, params=(next_year, next_quarter))
    display(df_updated)
else:
    print(f"No total_catch rows stored for {next_year} {next_quarter}.")

OperationalError: no such table: total

In [None]:
# CELL 5: UPDATE FISH EXPORTS
# NOTE: relies on quarter_order from the total catch cell and on
# get_latest_quarter_from_api() / fetch_with_retries() from the helper cell
# further down this notebook.

# Get latest data point from database
latest_year, latest_quarter = get_latest_db_data('fish_exports')
print(f"Latest database data point: {latest_year} {latest_quarter}")

# Check API for latest available data
url = "https://bank.stat.gl:443/api/v1/en/Greenland/BE/BE80/BEXSTA22.px"
api_year, api_quarter = get_latest_quarter_from_api(url, unit="Mill. kr.")
print(f"Latest API data point: {api_year} {api_quarter}")

# Determine if update is needed. The target period defaults to the newest one
# already stored, so the display step at the bottom always has a defined
# period to query, even when nothing is updated.
update_needed = False
next_year, next_quarter = latest_year, latest_quarter
if api_year and api_quarter:
    db_idx = latest_year * 4 + quarter_order.index(latest_quarter)
    api_idx = int(api_year) * 4 + quarter_order.index(api_quarter)
    if api_idx > db_idx:
        update_needed = True
        # int(): sqlite3 cannot bind numpy integers as query parameters
        next_year, next_quarter = int(api_year), api_quarter
    else:
        print("No new data available for fish_exports.")
else:
    print("Could not fetch latest quarter from API, skipping fish_exports update.")

if update_needed and not check_existing_data('fish_exports', next_year, next_quarter):
    # Fetch new data
    query = {
        "query": [
            {"code": "unit", "selection": {"filter": "item", "values": ["Mill. kr."]}},
            {"code": "time", "selection": {"filter": "item", "values": [str(next_year)]}},
            {"code": "quarter", "selection": {"filter": "item", "values": [str(quarter_order.index(next_quarter) + 1)]}}
        ],
        "response": {"format": "json-stat2"}
    }
    try:
        response = fetch_with_retries(url, max_retries=3, timeout=60, method='post', json=query)
        dataset = pyjstat.Dataset.read(response.text)
        df = dataset.write('dataframe')
        print("Data successfully retrieved and converted to DataFrame!")

        # Clean DataFrame
        df_new = df.rename(columns={
            "time": "Year",
            "quarter": "Quarter",
            "value": "Fish_Export_Value_Million_Kr"
        })
        df_new["Quarter"] = df_new["Quarter"].str.replace("Quarter ", "Q")
        df_new["Quarter"] = pd.Categorical(df_new["Quarter"], categories=quarter_order, ordered=True)
        # .copy() so the casts below act on an independent frame, not a slice
        df_new = df_new[["Year", "Quarter", "Fish_Export_Value_Million_Kr"]].copy()
        df_new["Year"] = df_new["Year"].astype(int)

        # Validate
        expected_columns = ["Year", "Quarter", "Fish_Export_Value_Million_Kr"]
        dtypes = {"Year": int, "Quarter": str, "Fish_Export_Value_Million_Kr": int}
        validate_dataframe(df_new, expected_columns, dtypes)

        # Plain Python values: sqlite3 cannot bind numpy scalars
        rows = [
            (int(row["Year"]), str(row["Quarter"]), int(row["Fish_Export_Value_Million_Kr"]))
            for _, row in df_new.iterrows()
        ]

        # Insert only the new rows, with bound parameters. Executing the whole
        # dml_populate.sql again would replay every INSERT already in it.
        cursor.executemany(
            "INSERT INTO fish_exports (Year, Quarter, Fish_Export_Value_Million_Kr) VALUES (?, ?, ?)",
            rows,
        )
        conn.commit()

        # Record the same inserts in the DML script, one terminated statement
        # per row so the script stays valid when several rows are written.
        dml_statements = [
            f"INSERT INTO fish_exports (Year, Quarter, Fish_Export_Value_Million_Kr) VALUES "
            f"({year}, '{quarter}', {export_value});"
            for year, quarter, export_value in rows
        ]
        with open('/content/drive/MyDrive/dml_populate.sql', 'a') as f:
            f.write("\n-- Update for fish_exports\n")
            f.write("\n".join(dml_statements) + "\n")
        print("Updated fish_exports table with new data")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching new Fish Exports data: {e}")
    except (ValueError, sqlite3.Error) as e:
        # Validation failure or rejected insert: leave the table untouched
        conn.rollback()
        print(f"Could not update fish_exports: {e}")

# Display updated data (only when the table exists and holds the period)
print("Updated Fish Exports DataFrame:")
if check_existing_data('fish_exports', next_year, next_quarter):
    df_updated = pd.read_sql_query("SELECT * FROM fish_exports WHERE Year = ? AND Quarter = ?", conn, params=(next_year, next_quarter))
    display(df_updated)
else:
    print(f"No fish_exports rows stored for {next_year} {next_quarter}.")

In [None]:
#  UPDATE FOREIGN CATCH
# NOTE: relies on quarter_order from the total catch cell and on
# get_latest_quarter_from_api() / fetch_with_retries() from the helper cell
# further down this notebook.

# Get latest data point from database
latest_year, latest_quarter = get_latest_db_data('foreign_catch')
print(f"Latest database data point: {latest_year} {latest_quarter}")

# Check API for latest available data.
# The URL is set here on purpose: the fish exports cell leaves `url` pointing
# at the exports table, while this query filters on `nation`, which is a
# dimension of the catch table used by the total catch cell.
url = "https://bank.stat.gl:443/api/v1/en/Greenland/FI/FI10/FIX008.px"
api_year, api_quarter = get_latest_quarter_from_api(url, nation="UDL")
print(f"Latest API data point: {api_year} {api_quarter}")

# Determine if update is needed. The target period defaults to the newest one
# already stored, so the display step at the bottom always has a defined
# period to query, even when nothing is updated.
update_needed = False
next_year, next_quarter = latest_year, latest_quarter
if api_year and api_quarter:
    db_idx = latest_year * 4 + quarter_order.index(latest_quarter)
    api_idx = int(api_year) * 4 + quarter_order.index(api_quarter)
    if api_idx > db_idx:
        update_needed = True
        # int(): sqlite3 cannot bind numpy integers as query parameters
        next_year, next_quarter = int(api_year), api_quarter
    else:
        print("No new data available for foreign_catch.")
else:
    print("Could not fetch latest quarter from API, skipping foreign_catch update.")

if update_needed and not check_existing_data('foreign_catch', next_year, next_quarter):
    # Fetch new data
    query = {
        "query": [
            {"code": "nation", "selection": {"filter": "item", "values": ["UDL"]}},
            {"code": "unit", "selection": {"filter": "item", "values": ["Ton"]}},
            {"code": "time", "selection": {"filter": "item", "values": [str(next_year)]}},
            {"code": "quarter", "selection": {"filter": "item", "values": [str(quarter_order.index(next_quarter) + 1)]}}
        ],
        "response": {"format": "json-stat2"}
    }
    try:
        response = fetch_with_retries(url, max_retries=3, timeout=60, method='post', json=query)
        dataset = pyjstat.Dataset.read(response.text)
        df = dataset.write('dataframe')
        print("Data successfully retrieved and converted to DataFrame!")

        # Clean DataFrame
        df_new = df.drop(columns=['nation']).rename(columns={
            "time": "Year",
            "quarter": "Quarter",
            "unit": "Unit",
            "value": "Foreign_Catch"
        })
        df_new["Quarter"] = df_new["Quarter"].str.replace("Quarter ", "Q")
        df_new["Quarter"] = pd.Categorical(df_new["Quarter"], categories=quarter_order, ordered=True)
        # .copy() so the casts below act on an independent frame, not a slice
        df_new = df_new[["Year", "Quarter", "Unit", "Foreign_Catch"]].copy()
        df_new["Year"] = df_new["Year"].astype(int)

        # Validate
        expected_columns = ["Year", "Quarter", "Unit", "Foreign_Catch"]
        dtypes = {"Year": int, "Quarter": str, "Unit": str, "Foreign_Catch": int}
        validate_dataframe(df_new, expected_columns, dtypes)

        # Plain Python values: sqlite3 cannot bind numpy scalars
        rows = [
            (int(row["Year"]), str(row["Quarter"]), str(row["Unit"]), int(row["Foreign_Catch"]))
            for _, row in df_new.iterrows()
        ]

        # Insert only the new rows, with bound parameters. Executing the whole
        # dml_populate.sql again would replay every INSERT already in it.
        cursor.executemany(
            "INSERT INTO foreign_catch (Year, Quarter, Unit, Foreign_Catch) VALUES (?, ?, ?, ?)",
            rows,
        )
        conn.commit()

        # Record the same inserts in the DML script, one terminated statement
        # per row so the script stays valid when several rows are written.
        dml_statements = [
            f"INSERT INTO foreign_catch (Year, Quarter, Unit, Foreign_Catch) VALUES "
            f"({year}, '{quarter}', '{unit}', {total});"
            for year, quarter, unit, total in rows
        ]
        with open('/content/drive/MyDrive/dml_populate.sql', 'a') as f:
            f.write("\n-- Update for foreign_catch\n")
            f.write("\n".join(dml_statements) + "\n")
        print("Updated foreign_catch table with new data")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching new Foreign Catch data: {e}")
    except (ValueError, sqlite3.Error) as e:
        # Validation failure or rejected insert: leave the table untouched
        conn.rollback()
        print(f"Could not update foreign_catch: {e}")

# Display updated data (only when the table exists and holds the period)
print("Updated Foreign Catch DataFrame:")
if check_existing_data('foreign_catch', next_year, next_quarter):
    df_updated = pd.read_sql_query("SELECT * FROM foreign_catch WHERE Year = ? AND Quarter = ?", conn, params=(next_year, next_quarter))
    display(df_updated)
else:
    print(f"No foreign_catch rows stored for {next_year} {next_quarter}.")

In [None]:
#ERDDAP SETUP

# api retry
def fetch_with_retries(url, max_retries=3, timeout=60, method='get', json=None):
    """Send an HTTP request, retrying failed attempts with exponential backoff.

    Args:
        url: Address to request.
        max_retries: Total number of attempts; must be at least 1.
        timeout: Per-attempt timeout in seconds, passed to ``requests``.
        method: ``'get'`` (any letter case) sends a GET; any other value
            sends a POST.
        json: JSON body for POST requests.

    Returns:
        The ``requests`` response of the first attempt that succeeds with a
        non-error HTTP status.

    Raises:
        ValueError: If ``max_retries`` is below 1 (previously the function
            fell through and returned None without making a request).
        requests.exceptions.RequestException: Re-raised from the final
            failed attempt.
    """
    # Imported locally under a private name: the SST cells of this notebook
    # rebind the global name `time` to a query-string fragment, which would
    # make time.sleep() fail here on the first retry.
    import time as _time

    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    use_get = str(method).lower() == 'get'
    for attempt in range(max_retries):
        try:
            if use_get:
                response = requests.get(url, timeout=timeout)
            else:
                response = requests.post(url, json=json, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt+1} failed: {e}")
            if attempt + 1 == max_retries:
                raise
            # Back off 1s, 2s, 4s, ... between attempts
            _time.sleep(2 ** attempt)

# Helper function to get latest available quarter from bank.stat.gl
def get_latest_quarter_from_api(url, nation=None, unit="Ton"):
    """Return the newest ``(year, quarter)`` a stat bank table has data for.

    Args:
        url: Table endpoint to POST the query to.
        nation: Optional value for the ``nation`` filter; omitted when falsy.
        unit: Value for the ``unit`` filter.

    Returns:
        ``(year, quarter)`` with ``year`` as a built-in ``int`` and
        ``quarter`` as ``"Q1"``..``"Q4"``, or ``(None, None)`` if the request
        or the parsing fails (the error is printed).
    """
    query = {
        "query": [
            {"code": "time", "selection": {"filter": "all", "values": ["*"]}},
            {"code": "quarter", "selection": {"filter": "all", "values": ["*"]}}
        ],
        "response": {"format": "json-stat2"}
    }
    if nation:
        query["query"].insert(0, {"code": "nation", "selection": {"filter": "item", "values": [nation]}})
    query["query"].append({"code": "unit", "selection": {"filter": "item", "values": [unit]}})
    try:
        response = fetch_with_retries(url, max_retries=3, timeout=60, method='post', json=query)
        dataset = pyjstat.Dataset.read(response.text)
        df = dataset.write('dataframe')
        # Skip periods that are listed but carry no value, so a not-yet-published
        # quarter is not reported as the latest one.
        # NOTE(review): assumes the measure column is named "value", as the
        # update cells do when renaming it - confirm against the API output.
        if "value" in df.columns:
            df = df.dropna(subset=["value"])
        years = df["time"].astype(int)
        # int(): callers pass the year on as a sqlite3 query parameter, and
        # sqlite3 cannot bind numpy integers. An empty frame yields NaN here,
        # which raises and is reported by the handler below.
        latest_year = int(years.max())
        # "Quarter 1".."Quarter 4" order correctly as text
        latest_quarter = df.loc[years == latest_year, "quarter"].max().replace("Quarter ", "Q")
        return latest_year, latest_quarter
    except Exception as e:
        print(f"Error fetching latest quarter: {e}")
        return None, None




In [None]:
# CELL 6: UPDATE WEST GREENLAND SST
# Define quarter to months mapping (first and last day of each quarter, MM-DD)
quarter_to_months = {
    "Q1": ("01-01", "03-31"),
    "Q2": ("04-01", "06-30"),
    "Q3": ("07-01", "09-30"),
    "Q4": ("10-01", "12-31")
}


# Degree to ERDDAP grid index conversion.
# The formulas assume a 0.25-degree grid counted from -90 / -180.
def deg_to_index_lat(lat):
    """Convert a latitude in degrees to a 0.25-degree grid index."""
    return int(round((lat + 90) / 0.25))


def deg_to_index_lon(lon):
    """Convert a longitude in degrees to a 0.25-degree grid index."""
    return int(round((lon + 180) / 0.25))


# Define bounding box for West Greenland
bbox_deg_west = {
    'lat_min': 65.0,
    'lat_max': 70.0,
    'lon_min': -55.0,
    'lon_max': -50.0
}

# Convert to grid indices. Kept for reference only: the request below is
# built from the degree values (see the note at the URL construction).
bbox_idx_west = {
    'lat_min': deg_to_index_lat(bbox_deg_west['lat_min']),
    'lat_max': deg_to_index_lat(bbox_deg_west['lat_max']),
    'lon_min': deg_to_index_lon(bbox_deg_west['lon_min']),
    'lon_max': deg_to_index_lon(bbox_deg_west['lon_max'])
}

# Get latest data point from database
latest_year, latest_quarter = get_latest_db_data('sst_west')
print(f"Latest database data point: {latest_year} {latest_quarter}")

# Determine next quarter
quarter_idx = (quarter_order.index(latest_quarter) + 1) % 4
next_year = latest_year if quarter_idx != 0 else latest_year + 1
next_quarter = quarter_order[quarter_idx]

# Check if enough time has passed
current_date = datetime.now()
quarter_end_dates = {
    "Q1": datetime(next_year, 3, 31),
    "Q2": datetime(next_year, 6, 30),
    "Q3": datetime(next_year, 9, 30),
    "Q4": datetime(next_year, 12, 31)
}
if current_date <= quarter_end_dates[next_quarter]:
    print(f"Current date {current_date.date()} is before {next_quarter} end ({quarter_end_dates[next_quarter].date()}), skipping sst_west update.")
else:
    print(f"Fetching data for: {next_year} {next_quarter}")
    if not check_existing_data('sst_west', next_year, next_quarter):
        # Map quarter to months
        start_month, end_month = quarter_to_months[next_quarter]

        # Fetch new data
        try:
            base = "https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?"
            var = "sst"
            # Named time_sel rather than `time`: rebinding the global `time`
            # would replace the time module that fetch_with_retries sleeps with.
            time_sel = f"[({next_year}-{start_month}T00:00:00Z):1:({next_year}-{end_month}T00:00:00Z)]"
            zlev = "[0:1:0]"
            # griddap reads values in parentheses as coordinate values, not
            # grid indices, so the bounding box is passed in degrees.
            lat = f"[({bbox_deg_west['lat_min']}):1:({bbox_deg_west['lat_max']})]"
            lon = f"[({bbox_deg_west['lon_min']}):1:({bbox_deg_west['lon_max']})]"
            query = f"{var}{time_sel}{zlev}{lat}{lon}"
            full_url = base + quote(query, safe=":/[](),-T")
            print("Constructed URL:", full_url)

            response = fetch_with_retries(full_url, max_retries=3, timeout=60, method='get')
            # skiprows=[1]: the second CSV line is skipped before parsing
            df = pd.read_csv(StringIO(response.text), skiprows=[1])
            df = df.rename(columns={col: col.strip() for col in df.columns})
            df = df.dropna(subset=["sst"])

            df["time"] = pd.to_datetime(df["time"])
            df["Year"] = df["time"].dt.year.astype(int)
            df["Quarter"] = "Q" + df["time"].dt.quarter.astype(str)

            df_new = df.groupby(['Year', 'Quarter'])["sst"].mean().reset_index()
            df_new = df_new.rename(columns={"sst": "Sea_Surface_Temp_C_West"})

            df_new["Melt_Active_West"] = (df_new["Sea_Surface_Temp_C_West"] > 0.5).astype(int)
            df_new["Melt_Index_West"] = df_new["Sea_Surface_Temp_C_West"].clip(lower=0, upper=4) / 4

            # Validate
            expected_columns = ["Year", "Quarter", "Sea_Surface_Temp_C_West", "Melt_Active_West", "Melt_Index_West"]
            dtypes = {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_West": float, "Melt_Active_West": int, "Melt_Index_West": float}
            validate_dataframe(df_new, expected_columns, dtypes)

            # Plain Python values: sqlite3 cannot bind numpy scalars
            rows = [
                (
                    int(row["Year"]),
                    str(row["Quarter"]),
                    float(row["Sea_Surface_Temp_C_West"]),
                    int(row["Melt_Active_West"]),
                    float(row["Melt_Index_West"]),
                )
                for _, row in df_new.iterrows()
            ]

            # Insert only the new rows, with bound parameters. Executing the
            # whole dml_populate.sql again would replay every earlier INSERT.
            cursor.executemany(
                "INSERT INTO sst_west (Year, Quarter, Sea_Surface_Temp_C_West, Melt_Active_West, Melt_Index_West) VALUES (?, ?, ?, ?, ?)",
                rows,
            )
            conn.commit()

            # Record the same inserts in the DML script, one terminated
            # statement per row.
            dml_statements = [
                f"INSERT INTO sst_west (Year, Quarter, Sea_Surface_Temp_C_West, Melt_Active_West, Melt_Index_West) VALUES "
                f"({year}, '{quarter}', {sst}, {melt_active}, {melt_index});"
                for year, quarter, sst, melt_active, melt_index in rows
            ]
            with open('/content/drive/MyDrive/dml_populate.sql', 'a') as f:
                f.write("\n-- Update for sst_west\n")
                f.write("\n".join(dml_statements) + "\n")
            print("Updated sst_west table with new data")
        except requests.exceptions.RequestException as e:
            print(f"Error fetching new SST West data: {e}")
        except (ValueError, sqlite3.Error) as e:
            # Validation failure or rejected insert: leave the table untouched
            conn.rollback()
            print(f"Could not update sst_west: {e}")

# Display updated data (only when the table exists and holds the period)
print("Updated SST West DataFrame:")
if check_existing_data('sst_west', next_year, next_quarter):
    df_updated = pd.read_sql_query("SELECT * FROM sst_west WHERE Year = ? AND Quarter = ?", conn, params=(next_year, next_quarter))
    display(df_updated)
else:
    print(f"No sst_west rows stored for {next_year} {next_quarter}.")

In [None]:
# CELL 7: UPDATE EAST GREENLAND SST
# NOTE: relies on quarter_order, quarter_to_months and the deg_to_index_*
# helpers defined by earlier cells of this notebook.

# Define bounding box for East Greenland
bbox_deg_east = {
    'lat_min': 65.0,
    'lat_max': 70.0,
    'lon_min': -40.0,
    'lon_max': -35.0
}

# Convert to grid indices. Kept for reference only: the request below is
# built from the degree values (see the note at the URL construction).
bbox_idx_east = {
    'lat_min': deg_to_index_lat(bbox_deg_east['lat_min']),
    'lat_max': deg_to_index_lat(bbox_deg_east['lat_max']),
    'lon_min': deg_to_index_lon(bbox_deg_east['lon_min']),
    'lon_max': deg_to_index_lon(bbox_deg_east['lon_max'])
}

# Get latest data point from database
latest_year, latest_quarter = get_latest_db_data('sst_east')
print(f"Latest database data point: {latest_year} {latest_quarter}")

# Determine next quarter
quarter_idx = (quarter_order.index(latest_quarter) + 1) % 4
next_year = latest_year if quarter_idx != 0 else latest_year + 1
next_quarter = quarter_order[quarter_idx]

# Check if enough time has passed. The end dates are rebuilt here for this
# table's own next_year; the ones left behind by the West cell belong to the
# year sst_west is waiting for, which may differ.
current_date = datetime.now()
quarter_end_dates = {
    "Q1": datetime(next_year, 3, 31),
    "Q2": datetime(next_year, 6, 30),
    "Q3": datetime(next_year, 9, 30),
    "Q4": datetime(next_year, 12, 31)
}
if current_date <= quarter_end_dates[next_quarter]:
    print(f"Current date {current_date.date()} is before {next_quarter} end ({quarter_end_dates[next_quarter].date()}), skipping sst_east update.")
else:
    print(f"Fetching data for: {next_year} {next_quarter}")
    if not check_existing_data('sst_east', next_year, next_quarter):
        # Map quarter to months
        start_month, end_month = quarter_to_months[next_quarter]

        # Fetch new data
        try:
            base = "https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?"
            var = "sst"
            # Named time_sel rather than `time`: rebinding the global `time`
            # would replace the time module that fetch_with_retries sleeps with.
            time_sel = f"[({next_year}-{start_month}T00:00:00Z):1:({next_year}-{end_month}T00:00:00Z)]"
            zlev = "[0:1:0]"
            # griddap reads values in parentheses as coordinate values, not
            # grid indices, so the bounding box is passed in degrees.
            lat = f"[({bbox_deg_east['lat_min']}):1:({bbox_deg_east['lat_max']})]"
            lon = f"[({bbox_deg_east['lon_min']}):1:({bbox_deg_east['lon_max']})]"
            query = f"{var}{time_sel}{zlev}{lat}{lon}"
            full_url = base + quote(query, safe=":/[](),-T")
            print("Constructed URL:", full_url)

            response = fetch_with_retries(full_url, max_retries=3, timeout=60, method='get')
            # skiprows=[1]: the second CSV line is skipped before parsing
            df = pd.read_csv(StringIO(response.text), skiprows=[1])
            df = df.rename(columns={col: col.strip() for col in df.columns})
            df = df.dropna(subset=["sst"])

            df["time"] = pd.to_datetime(df["time"])
            df["Year"] = df["time"].dt.year.astype(int)
            df["Quarter"] = "Q" + df["time"].dt.quarter.astype(str)

            df_new = df.groupby(['Year', 'Quarter'])["sst"].mean().reset_index()
            df_new = df_new.rename(columns={"sst": "Sea_Surface_Temp_C_East"})

            df_new["Melt_Active_East"] = (df_new["Sea_Surface_Temp_C_East"] > 0.5).astype(int)
            df_new["Melt_Index_East"] = df_new["Sea_Surface_Temp_C_East"].clip(lower=0, upper=4) / 4

            # Validate
            expected_columns = ["Year", "Quarter", "Sea_Surface_Temp_C_East", "Melt_Active_East", "Melt_Index_East"]
            dtypes = {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_East": float, "Melt_Active_East": int, "Melt_Index_East": float}
            validate_dataframe(df_new, expected_columns, dtypes)

            # Plain Python values: sqlite3 cannot bind numpy scalars
            rows = [
                (
                    int(row["Year"]),
                    str(row["Quarter"]),
                    float(row["Sea_Surface_Temp_C_East"]),
                    int(row["Melt_Active_East"]),
                    float(row["Melt_Index_East"]),
                )
                for _, row in df_new.iterrows()
            ]

            # Insert only the new rows, with bound parameters. Executing the
            # whole dml_populate.sql again would replay every earlier INSERT.
            cursor.executemany(
                "INSERT INTO sst_east (Year, Quarter, Sea_Surface_Temp_C_East, Melt_Active_East, Melt_Index_East) VALUES (?, ?, ?, ?, ?)",
                rows,
            )
            conn.commit()

            # Record the same inserts in the DML script, one terminated
            # statement per row.
            dml_statements = [
                f"INSERT INTO sst_east (Year, Quarter, Sea_Surface_Temp_C_East, Melt_Active_East, Melt_Index_East) VALUES "
                f"({year}, '{quarter}', {sst}, {melt_active}, {melt_index});"
                for year, quarter, sst, melt_active, melt_index in rows
            ]
            with open('/content/drive/MyDrive/dml_populate.sql', 'a') as f:
                f.write("\n-- Update for sst_east\n")
                f.write("\n".join(dml_statements) + "\n")
            print("Updated sst_east table with new data")
        except requests.exceptions.RequestException as e:
            print(f"Error fetching new SST East data: {e}")
        except (ValueError, sqlite3.Error) as e:
            # Validation failure or rejected insert: leave the table untouched
            conn.rollback()
            print(f"Could not update sst_east: {e}")

# Display updated data (only when the table exists and holds the period)
print("Updated SST East DataFrame:")
if check_existing_data('sst_east', next_year, next_quarter):
    df_updated = pd.read_sql_query("SELECT * FROM sst_east WHERE Year = ? AND Quarter = ?", conn, params=(next_year, next_quarter))
    display(df_updated)
else:
    print(f"No sst_east rows stored for {next_year} {next_quarter}.")

In [None]:
# CELL 8: UPDATE SOUTH GREENLAND SST
# NOTE: relies on quarter_order, quarter_to_months and the deg_to_index_*
# helpers defined by earlier cells of this notebook.

# Define bounding box for South Greenland
bbox_deg_south = {
    'lat_min': 60.0,
    'lat_max': 65.0,
    'lon_min': -45.0,
    'lon_max': -40.0
}

# Convert to grid indices. Kept for reference only: the request below is
# built from the degree values (see the note at the URL construction).
bbox_idx_south = {
    'lat_min': deg_to_index_lat(bbox_deg_south['lat_min']),
    'lat_max': deg_to_index_lat(bbox_deg_south['lat_max']),
    'lon_min': deg_to_index_lon(bbox_deg_south['lon_min']),
    'lon_max': deg_to_index_lon(bbox_deg_south['lon_max'])
}

# Get latest data point from database
latest_year, latest_quarter = get_latest_db_data('sst_south')
print(f"Latest database data point: {latest_year} {latest_quarter}")

# Determine next quarter
quarter_idx = (quarter_order.index(latest_quarter) + 1) % 4
next_year = latest_year if quarter_idx != 0 else latest_year + 1
next_quarter = quarter_order[quarter_idx]

# Check if enough time has passed. The end dates are rebuilt here for this
# table's own next_year; the ones left behind by an earlier SST cell belong
# to the year that table is waiting for, which may differ.
current_date = datetime.now()
quarter_end_dates = {
    "Q1": datetime(next_year, 3, 31),
    "Q2": datetime(next_year, 6, 30),
    "Q3": datetime(next_year, 9, 30),
    "Q4": datetime(next_year, 12, 31)
}
if current_date <= quarter_end_dates[next_quarter]:
    print(f"Current date {current_date.date()} is before {next_quarter} end ({quarter_end_dates[next_quarter].date()}), skipping sst_south update.")
else:
    print(f"Fetching data for: {next_year} {next_quarter}")
    if not check_existing_data('sst_south', next_year, next_quarter):
        # Map quarter to months
        start_month, end_month = quarter_to_months[next_quarter]

        # Fetch new data
        try:
            base = "https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?"
            var = "sst"
            # Named time_sel rather than `time`: rebinding the global `time`
            # would replace the time module that fetch_with_retries sleeps with.
            time_sel = f"[({next_year}-{start_month}T00:00:00Z):1:({next_year}-{end_month}T00:00:00Z)]"
            zlev = "[0:1:0]"
            # griddap reads values in parentheses as coordinate values, not
            # grid indices, so the bounding box is passed in degrees.
            lat = f"[({bbox_deg_south['lat_min']}):1:({bbox_deg_south['lat_max']})]"
            lon = f"[({bbox_deg_south['lon_min']}):1:({bbox_deg_south['lon_max']})]"
            query = f"{var}{time_sel}{zlev}{lat}{lon}"
            full_url = base + quote(query, safe=":/[](),-T")
            print("Constructed URL:", full_url)

            response = fetch_with_retries(full_url, max_retries=3, timeout=60, method='get')
            # skiprows=[1]: the second CSV line is skipped before parsing
            df = pd.read_csv(StringIO(response.text), skiprows=[1])
            df = df.rename(columns={col: col.strip() for col in df.columns})
            df = df.dropna(subset=["sst"])

            df["time"] = pd.to_datetime(df["time"])
            df["Year"] = df["time"].dt.year.astype(int)
            df["Quarter"] = "Q" + df["time"].dt.quarter.astype(str)

            df_new = df.groupby(['Year', 'Quarter'])["sst"].mean().reset_index()
            df_new = df_new.rename(columns={"sst": "Sea_Surface_Temp_C_South"})

            df_new["Melt_Active_South"] = (df_new["Sea_Surface_Temp_C_South"] > 0.5).astype(int)
            df_new["Melt_Index_South"] = df_new["Sea_Surface_Temp_C_South"].clip(lower=0, upper=4) / 4

            # Validate
            expected_columns = ["Year", "Quarter", "Sea_Surface_Temp_C_South", "Melt_Active_South", "Melt_Index_South"]
            dtypes = {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_South": float, "Melt_Active_South": int, "Melt_Index_South": float}
            validate_dataframe(df_new, expected_columns, dtypes)

            # Plain Python values: sqlite3 cannot bind numpy scalars
            rows = [
                (
                    int(row["Year"]),
                    str(row["Quarter"]),
                    float(row["Sea_Surface_Temp_C_South"]),
                    int(row["Melt_Active_South"]),
                    float(row["Melt_Index_South"]),
                )
                for _, row in df_new.iterrows()
            ]

            # Insert only the new rows, with bound parameters. Executing the
            # whole dml_populate.sql again would replay every earlier INSERT.
            cursor.executemany(
                "INSERT INTO sst_south (Year, Quarter, Sea_Surface_Temp_C_South, Melt_Active_South, Melt_Index_South) VALUES (?, ?, ?, ?, ?)",
                rows,
            )
            conn.commit()

            # Record the same inserts in the DML script, one terminated
            # statement per row.
            dml_statements = [
                f"INSERT INTO sst_south (Year, Quarter, Sea_Surface_Temp_C_South, Melt_Active_South, Melt_Index_South) VALUES "
                f"({year}, '{quarter}', {sst}, {melt_active}, {melt_index});"
                for year, quarter, sst, melt_active, melt_index in rows
            ]
            with open('/content/drive/MyDrive/dml_populate.sql', 'a') as f:
                f.write("\n-- Update for sst_south\n")
                f.write("\n".join(dml_statements) + "\n")
            print("Updated sst_south table with new data")
        except requests.exceptions.RequestException as e:
            print(f"Error fetching new SST South data: {e}")
        except (ValueError, sqlite3.Error) as e:
            # Validation failure or rejected insert: leave the table untouched
            conn.rollback()
            print(f"Could not update sst_south: {e}")

# Display updated data (only when the table exists and holds the period)
print("Updated SST South DataFrame:")
if check_existing_data('sst_south', next_year, next_quarter):
    df_updated = pd.read_sql_query("SELECT * FROM sst_south WHERE Year = ? AND Quarter = ?", conn, params=(next_year, next_quarter))
    display(df_updated)
else:
    print(f"No sst_south rows stored for {next_year} {next_quarter}.")

In [None]:
# CELL 9: UPDATE ICE MELT SST
# NOTE: relies on quarter_order from the total catch cell and reads the rows
# that the East and West SST cells store for the same period.

# Get latest data point from database
latest_year, latest_quarter = get_latest_db_data('ice_melt_sst')
print(f"Latest database data point: {latest_year} {latest_quarter}")

# Determine next quarter
quarter_idx = (quarter_order.index(latest_quarter) + 1) % 4
next_year = latest_year if quarter_idx != 0 else latest_year + 1
next_quarter = quarter_order[quarter_idx]

# Check if enough time has passed. The end dates are rebuilt here for this
# table's own next_year; the ones left behind by an SST cell belong to the
# year that table is waiting for, which may differ.
current_date = datetime.now()
quarter_end_dates = {
    "Q1": datetime(next_year, 3, 31),
    "Q2": datetime(next_year, 6, 30),
    "Q3": datetime(next_year, 9, 30),
    "Q4": datetime(next_year, 12, 31)
}
if current_date <= quarter_end_dates[next_quarter]:
    print(f"Current date {current_date.date()} is before {next_quarter} end ({quarter_end_dates[next_quarter].date()}), skipping ice_melt_sst update.")
else:
    print(f"Computing data for: {next_year} {next_quarter}")
    if not check_existing_data('ice_melt_sst', next_year, next_quarter):
        # Fetch latest SST data
        df_sst_east = pd.read_sql_query("SELECT * FROM sst_east WHERE Year = ? AND Quarter = ?", conn, params=(next_year, next_quarter))
        df_sst_west = pd.read_sql_query("SELECT * FROM sst_west WHERE Year = ? AND Quarter = ?", conn, params=(next_year, next_quarter))

        if not df_sst_east.empty and not df_sst_west.empty:
            # float(): plain Python numbers, so sqlite3 can bind them later
            sst_east = float(df_sst_east["Sea_Surface_Temp_C_East"].iloc[0])
            sst_west = float(df_sst_west["Sea_Surface_Temp_C_West"].iloc[0])
            melt_index_east = float(df_sst_east["Melt_Index_East"].iloc[0])
            melt_index_west = float(df_sst_west["Melt_Index_West"].iloc[0])

            # Normalize SST: (sst + 2) / 22 maps -2 to 0 and 20 to 1
            sst_east_norm = (sst_east + 2) / 22
            sst_west_norm = (sst_west + 2) / 22

            # Compute Ice Melt Rate: 70% melt index, 30% normalized SST
            ice_melt_rate_east = (0.7 * melt_index_east + 0.3 * sst_east_norm)
            ice_melt_rate_west = (0.7 * melt_index_west + 0.3 * sst_west_norm)

            # Create DataFrame
            df_new = pd.DataFrame({
                "Year": [next_year],
                "Quarter": [next_quarter],
                "Ice_Melt_Rate_East": [ice_melt_rate_east],
                "Ice_Melt_Rate_West": [ice_melt_rate_west],
                "SST_East": [sst_east],
                "SST_West": [sst_west]
            })

            try:
                # Validate
                expected_columns = ["Year", "Quarter", "Ice_Melt_Rate_East", "Ice_Melt_Rate_West", "SST_East", "SST_West"]
                dtypes = {"Year": int, "Quarter": str, "Ice_Melt_Rate_East": float, "Ice_Melt_Rate_West": float, "SST_East": float, "SST_West": float}
                validate_dataframe(df_new, expected_columns, dtypes)

                # Plain Python values: sqlite3 cannot bind numpy scalars
                rows = [
                    (
                        int(row["Year"]),
                        str(row["Quarter"]),
                        float(row["Ice_Melt_Rate_East"]),
                        float(row["Ice_Melt_Rate_West"]),
                        float(row["SST_East"]),
                        float(row["SST_West"]),
                    )
                    for _, row in df_new.iterrows()
                ]

                # Insert only the new row, with bound parameters. Executing the
                # whole dml_populate.sql again would replay every earlier INSERT.
                cursor.executemany(
                    "INSERT INTO ice_melt_sst (Year, Quarter, Ice_Melt_Rate_East, Ice_Melt_Rate_West, SST_East, SST_West) VALUES (?, ?, ?, ?, ?, ?)",
                    rows,
                )
                conn.commit()

                # Record the same insert in the DML script as a terminated statement
                dml_statements = [
                    f"INSERT INTO ice_melt_sst (Year, Quarter, Ice_Melt_Rate_East, Ice_Melt_Rate_West, SST_East, SST_West) VALUES "
                    f"({year}, '{quarter}', {rate_east}, {rate_west}, {east}, {west});"
                    for year, quarter, rate_east, rate_west, east, west in rows
                ]
                with open('/content/drive/MyDrive/dml_populate.sql', 'a') as f:
                    f.write("\n-- Update for ice_melt_sst\n")
                    f.write("\n".join(dml_statements) + "\n")
                print("Updated ice_melt_sst table with new data")
            except (ValueError, sqlite3.Error) as e:
                # Validation failure or rejected insert: leave the table untouched
                conn.rollback()
                print(f"Could not update ice_melt_sst: {e}")
        else:
            print("No new data available for ice_melt_sst (requires updated sst_east and sst_west data)")

# Display updated data (only when the table exists and holds the period)
print("Updated Ice Melt SST DataFrame:")
if check_existing_data('ice_melt_sst', next_year, next_quarter):
    df_updated = pd.read_sql_query("SELECT * FROM ice_melt_sst WHERE Year = ? AND Quarter = ?", conn, params=(next_year, next_quarter))
    display(df_updated)
else:
    print(f"No ice_melt_sst rows stored for {next_year} {next_quarter}.")

In [None]:
# Close the database connection.
# After this runs, the module-level `conn` (and the `cursor` created from it)
# can no longer be used by the helper functions or the update cells.
conn.close()
print("Database connection closed.")