<a href="https://colab.research.google.com/github/ccspen21/greenland-fishery-nowcast-2025/blob/main/setup_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fetch the project repository; later cells %cd into it and run its ddl.sql / dml_populate.sql.
!git clone https://github.com/ccspen21/greenland-fishery-nowcast-2025.git

Cloning into 'greenland-fishery-nowcast-2025'...
remote: Enumerating objects: 37, done.[K
remote: Counting objects: 100% (37/37), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 37 (delta 13), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (37/37), 39.67 KiB | 625.00 KiB/s, done.
Resolving deltas: 100% (13/13), done.


In [2]:
!pip install requests xarray pandas pyjstat datetime pydap netCDF4
import os
import sqlite3
import pandas as pd
import requests
from pyjstat import pyjstat
from urllib.parse import quote
from io import StringIO
from IPython.display import display
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Install the `ping` utility (iputils-ping) in the Colab VM.
# NOTE(review): nothing in the visible cells invokes ping — confirm this is still needed.
!apt-get update && apt-get install -y iputils-ping

# Database path, overridable through the DB_PATH environment variable.
# NOTE(review): the visible cells never read DB_PATH; the save cell writes to a
# hard-coded /content/greenland_fishery.db instead — confirm which one is intended.
DB_PATH = os.getenv("DB_PATH", "greenland_fishery.db")  # Use environment variable or default to local file

Collecting pyjstat
  Downloading pyjstat-2.4.0.tar.gz (798 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.0/798.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datetime
  Downloading DateTime-5.5-py3-none-any.whl.metadata (33 kB)
Collecting pydap
  Downloading pydap-3.5.5-py3-none-any.whl.metadata (9.1 kB)
Collecting netCDF4
  Downloading netCDF4-1.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting zope.interface (from datetime)
  Downloading zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m360.5 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests-cache (from pydap)
  Downloading requests_cache-1.2.1-py3-none-any.whl.metadata (9.9 kB)
Collecting Webob (from pydap)
  Downloading WebO

In [3]:
# Work from inside the cloned repository so the relative .sql paths below resolve.
%cd /content/greenland-fishery-nowcast-2025

import sqlite3

# Create a connection to an in-memory SQLite database
# (its contents exist only for the lifetime of this connection).
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()  # Shared cursor reused by the table-existence checks in later cells
print("Connected to SQLite in-memory database")

# Run a whole .sql file (DDL or DML) against an open SQLite connection
def execute_sql_script(conn, file_path):
    """Execute every statement in the SQL file at ``file_path`` on ``conn``.

    The file is read in full, passed to ``conn.executescript`` and the
    connection is committed. On success a confirmation line is printed.
    Any exception (unreadable file, SQL error, ...) is reported with a
    printed message and then re-raised unchanged to the caller.
    """
    try:
        with open(file_path, 'r') as script_file:
            sql_text = script_file.read()
        conn.executescript(sql_text)
        conn.commit()
        print(f"Successfully executed SQL script: {file_path}")
    except Exception as err:
        print(f"Error executing SQL script {file_path}: {err}")
        raise

# Execute the DDL and DML scripts, in that order; both paths are relative to the
# repository directory entered with %cd above.
# (presumably ddl.sql creates the tables and dml_populate.sql inserts rows — the
# files themselves are not shown here.)
execute_sql_script(conn, 'ddl.sql')
execute_sql_script(conn, 'dml_populate.sql')

/content/greenland-fishery-nowcast-2025
Connected to SQLite in-memory database
Successfully executed SQL script: ddl.sql
Successfully executed SQL script: dml_populate.sql


In [4]:
# VARIABLE 1: TOTAL CATCH

df_clean = None

# Helper function to validate DataFrame against schema
def validate_dataframe(df, expected_columns, dtypes):
    """Validate ``df`` against an expected layout, coercing column dtypes in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to check. Columns named in ``dtypes`` are overwritten in place
        with their converted values; callers rely on this mutation.
    expected_columns : list of str
        Column names that must all be present in ``df``.
    dtypes : dict
        Mapping of column name -> target type. Columns absent from ``df`` are
        skipped. ``int`` targets are parsed with ``pd.to_numeric``; values that
        cannot be parsed (or are missing) become 0 before the cast. Every other
        target is applied with ``Series.astype``.

    Raises
    ------
    ValueError
        If an expected column is missing (the message names the missing
        columns), or if any null value remains after conversion (the message
        names the affected columns and shows the first rows).
    """
    missing = [col for col in expected_columns if col not in df.columns]
    if missing:
        # Report only what is actually absent, not the whole expected list.
        raise ValueError(f"DataFrame missing expected columns: {missing}")

    for col, dtype in dtypes.items():
        if col not in df.columns:
            continue
        if dtype == int:  # Handle integer conversion with NaN
            # NOTE: unparseable/missing values are silently turned into 0 here.
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
        else:
            df[col] = df[col].astype(dtype)

    has_null = df.isnull().any()
    null_columns = has_null[has_null].index.tolist()
    if null_columns:
        raise ValueError(
            f"DataFrame contains NaN values in columns {null_columns}: {df.head()}"
        )

# Check if data exists in the database.
# Strategy: prefer rows already stored in SQLite; query the bank.stat.gl API only
# when the table is missing, empty, or fails validation.
expected_columns = ["Year", "Quarter", "Unit", "Total_Catch"]
dtypes = {"Year": int, "Quarter": str, "Unit": str, "Total_Catch": int}

cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='total_catch'")
if cursor.fetchone():
    print("Loading Total Catch from SQLite database")
    df_clean = pd.read_sql_query("SELECT * FROM total_catch", conn)
    print("Loaded DataFrame from SQLite:")
    if df_clean.empty:
        # A table with no rows would otherwise pass validation and be used as
        # the final dataset; treat it as a cache miss instead.
        print("total_catch table is empty.")
        df_clean = None
    else:
        try:
            validate_dataframe(df_clean, expected_columns, dtypes)
            print("Loaded data is valid.")
        except ValueError as e:
            print(f"Validation error: {e}. Dropping and recreating table...")
            cursor.execute("DROP TABLE total_catch")
            conn.commit()
            # Same call as the other variable cells: the helper takes the
            # connection first, and the schema file in this repo is ddl.sql.
            execute_sql_script(conn, 'ddl.sql')  # Recreate the schema
            df_clean = None

if df_clean is None:
    print("total_catch table not found or invalid, querying API...")
    url = "https://bank.stat.gl:443/api/v1/en/Greenland/FI/FI10/FIX008.px"
    query = {
        "query": [
            {"code": "nation", "selection": {"filter": "item", "values": ["GRL"]}},
            {"code": "unit", "selection": {"filter": "item", "values": ["Ton"]}},
            {"code": "time", "selection": {"filter": "item", "values": [str(y) for y in range(2011, 2025)]}},
            {"code": "quarter", "selection": {"filter": "item", "values": ["1", "2", "3", "4"]}}
        ],
        "response": {"format": "json-stat2"}
    }
    try:
        response = requests.post(url, json=query, timeout=30)
        response.raise_for_status()
        dataset = pyjstat.Dataset.read(response.text)
        df = dataset.write('dataframe')
        print("Data successfully retrieved and converted to DataFrame!")

        # Clean DataFrame (build a new frame; the raw `df` is left untouched)
        df_clean = df.drop(columns=['nation']).rename(columns={
            "time": "Year",
            "quarter": "Quarter",
            "unit": "Unit",
            "value": "Total_Catch"
        })
        df_clean["Quarter"] = df_clean["Quarter"].str.replace("Quarter ", "Q")
        quarter_order = ["Q1", "Q2", "Q3", "Q4"]
        df_clean["Quarter"] = pd.Categorical(df_clean["Quarter"], categories=quarter_order, ordered=True)
        # .copy() so the assignment below targets an independent frame rather
        # than a column selection (avoids SettingWithCopyWarning).
        df_clean = df_clean[["Year", "Quarter", "Unit", "Total_Catch"]].copy()
        df_clean["Year"] = df_clean["Year"].astype(int)

        # Validate before saving to SQLite
        validate_dataframe(df_clean, expected_columns, dtypes)

        # Insert data into table (schema already created)
        df_clean.to_sql('total_catch', conn, if_exists='append', index=False)
        conn.commit()
        print("Saved Total Catch to SQLite table 'total_catch'")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching Total Catch data: {e}")
        df_clean = pd.DataFrame(columns=["Year", "Quarter", "Unit", "Total_Catch"])  # Empty DataFrame as fallback

# Final display
print("Final Total Catch DataFrame:")
display(df_clean.head())

Loading Total Catch from SQLite database
Loaded DataFrame from SQLite:
Loaded data is valid.
Final Total Catch DataFrame:


Unnamed: 0,Year,Quarter,Unit,Total_Catch
0,2011,Q1,Ton,50000
1,2011,Q2,Ton,55000
2,2011,Q3,Ton,60000
3,2011,Q4,Ton,52000


In [5]:
### Variable 2: Exports of Fish

df_fish_clean = None

# Check if data exists in the database.
# Strategy: prefer rows already stored in SQLite; query the bank.stat.gl API only
# when the table is missing, empty, or fails validation.
# NOTE(review): the int dtype truncates fractional million-kr values if the API
# ever returns any — confirm that whole numbers are intended.
expected_columns = ["Year", "Quarter", "Fish_Export_Value_Million_Kr"]
dtypes = {"Year": int, "Quarter": str, "Fish_Export_Value_Million_Kr": int}

cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='fish_exports'")
if cursor.fetchone():
    print("Loading Fish Exports from SQLite database")
    df_fish_clean = pd.read_sql_query("SELECT * FROM fish_exports", conn)
    print("Loaded DataFrame from SQLite:")
    if df_fish_clean.empty:
        # A table with no rows would otherwise pass validation and be used as
        # the final dataset; treat it as a cache miss instead.
        print("fish_exports table is empty.")
        df_fish_clean = None
    else:
        # Validate the loaded data
        try:
            validate_dataframe(df_fish_clean, expected_columns, dtypes)
            print("Loaded data is valid.")
        except ValueError as e:
            print(f"Validation error: {e}. Dropping and recreating table...")
            cursor.execute("DROP TABLE fish_exports")
            conn.commit()
            # Recreate the schema like the other variable cells do, so the
            # append below goes into the DDL-defined table rather than one
            # whose column types pandas infers on the fly.
            # NOTE(review): assumes ddl.sql can be re-run while the other
            # tables still exist — confirm.
            execute_sql_script(conn, 'ddl.sql')  # Recreate the schema
            df_fish_clean = None

if df_fish_clean is None:
    print("fish_exports table not found or invalid, querying API...")
    url = "https://bank.stat.gl:443/api/v1/en/Greenland/BE/BE80/BEXSTA22.px"
    query = {
        "query": [
            {"code": "unit", "selection": {"filter": "item", "values": ["Mill. kr."]}},
            {"code": "time", "selection": {"filter": "item", "values": [str(y) for y in range(2011, 2025)]}},
            {"code": "quarter", "selection": {"filter": "item", "values": ["1", "2", "3", "4"]}}
        ],
        "response": {"format": "json-stat2"}
    }
    try:
        response = requests.post(url, json=query, timeout=30)
        response.raise_for_status()
        dataset = pyjstat.Dataset.read(response.text)
        df = dataset.write('dataframe')
        print("Data successfully retrieved and converted to DataFrame!")

        # Clean DataFrame (build a new frame; the raw `df` is left untouched)
        df_fish_clean = df.rename(columns={
            "time": "Year",
            "quarter": "Quarter",
            "value": "Fish_Export_Value_Million_Kr"
        })
        df_fish_clean["Quarter"] = df_fish_clean["Quarter"].str.replace("Quarter ", "Q")
        quarter_order = ["Q1", "Q2", "Q3", "Q4"]
        df_fish_clean["Quarter"] = pd.Categorical(df_fish_clean["Quarter"], categories=quarter_order, ordered=True)
        # .copy() so the assignment below targets an independent frame rather
        # than a column selection (avoids SettingWithCopyWarning).
        df_fish_clean = df_fish_clean[["Year", "Quarter", "Fish_Export_Value_Million_Kr"]].copy()
        df_fish_clean["Year"] = df_fish_clean["Year"].astype(int)

        # Validate before saving to SQLite
        validate_dataframe(df_fish_clean, expected_columns, dtypes)

        # Insert data into table (schema already created)
        df_fish_clean.to_sql('fish_exports', conn, if_exists='append', index=False)
        conn.commit()
        print("Saved Fish Exports to SQLite table 'fish_exports'")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching Fish Exports data: {e}")
        df_fish_clean = pd.DataFrame(columns=["Year", "Quarter", "Fish_Export_Value_Million_Kr"])  # Empty DataFrame as fallback

# Final display
print("Final Fish Exports DataFrame:")
display(df_fish_clean.head())

Loading Fish Exports from SQLite database
Loaded DataFrame from SQLite:
Loaded data is valid.
Final Fish Exports DataFrame:


Unnamed: 0,Year,Quarter,Fish_Export_Value_Million_Kr
0,2011,Q1,300
1,2011,Q2,320
2,2011,Q3,350
3,2011,Q4,310


In [6]:
# VARIABLE 3: WEST GREENLAND SST

df_sst_west_clean = None

# Degree -> grid index helpers (0.25-degree spacing)
# NOTE(review): assumes index 0 sits exactly at -90 / -180 degrees — confirm against the dataset's grid.
def deg_to_index_lat(lat):
    """Return the rounded 0.25-degree step count of ``lat`` measured from -90."""
    shifted = lat + 90
    return int(round(shifted / 0.25))


def deg_to_index_lon(lon):
    """Return the rounded 0.25-degree step count of ``lon`` measured from -180."""
    shifted = lon + 180
    return int(round(shifted / 0.25))

# West Greenland bounding box, in degrees
bbox_deg = dict(lat_min=65.0, lat_max=70.0, lon_min=-55.0, lon_max=-50.0)

# Same box expressed as grid indices: 'lat_*' edges go through the latitude
# helper, 'lon_*' edges through the longitude helper.
bbox_idx = {
    edge: (deg_to_index_lat if edge.startswith('lat') else deg_to_index_lon)(deg)
    for edge, deg in bbox_deg.items()
}
print("Bounding box indices:", bbox_idx)

# Check if data exists in the database.
# Strategy: prefer rows already stored in SQLite; query ERDDAP only when the
# table is missing, empty, or fails validation.
expected_columns = ["Year", "Quarter", "Sea_Surface_Temp_C_West", "Melt_Active_West", "Melt_Index_West"]
dtypes = {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_West": float, "Melt_Active_West": int, "Melt_Index_West": float}

cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='sst_west'")
if cursor.fetchone():
    print("Loading SST West from SQLite database")
    df_sst_west_clean = pd.read_sql_query("SELECT * FROM sst_west", conn)
    print("Loaded DataFrame from SQLite:")
    if df_sst_west_clean.empty:
        # A table with no rows would otherwise pass validation and be used as
        # the final dataset; treat it as a cache miss instead.
        print("sst_west table is empty.")
        df_sst_west_clean = None
    else:
        try:
            validate_dataframe(df_sst_west_clean, expected_columns, dtypes)
            print("Loaded data is valid.")
        except ValueError as e:
            print(f"Validation error: {e}. Dropping and recreating table...")
            cursor.execute("DROP TABLE sst_west")
            conn.commit()
            execute_sql_script(conn, 'ddl.sql')  # Recreate the schema
            df_sst_west_clean = None

if df_sst_west_clean is None:
    print("sst_west table not found or invalid, querying API...")
    years = list(range(2011, 2025))
    west_quarters = []
    for year in years:
        print(f"Processing year: {year}")
        try:
            base = "https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?"
            var = "sst"
            time = f"[({year}-01-01T00:00:00Z):1:({year}-12-31T00:00:00Z)]".replace(" ", "")
            zlev = "[0:1:0]"
            # In a griddap constraint a value in parentheses is a coordinate
            # value (degrees), while a bare number is an array index. The
            # request therefore sends the bounding box in degrees; wrapping the
            # grid indices from bbox_idx in parentheses asked for latitude
            # "620", which is outside the grid.
            lat = f"[({bbox_deg['lat_min']}):1:({bbox_deg['lat_max']})]"
            lon = f"[({bbox_deg['lon_min']}):1:({bbox_deg['lon_max']})]"
            query = f"{var}{time}{zlev}{lat}{lon}"
            full_url = base + quote(query, safe=":/[](),-T")
            print("Constructed URL:", full_url)

            response = requests.get(full_url, timeout=30)
            response.raise_for_status()

            # Row 1 of the ERDDAP CSV carries the units, not data.
            df = pd.read_csv(StringIO(response.text), skiprows=[1])
            df = df.rename(columns={col: col.strip() for col in df.columns})
            df = df.dropna(subset=["sst"])

            df["time"] = pd.to_datetime(df["time"])
            # ERDDAP resolves parenthesised values to the nearest grid point, so
            # a range end may land on a timestamp in the neighbouring year. Keep
            # only the requested year so no quarter is counted twice.
            df = df[df["time"].dt.year == year].copy()
            df["Year"] = df["time"].dt.year.astype(int)
            df["Quarter"] = "Q" + df["time"].dt.quarter.astype(str)

            df_q = df.groupby(['Year', 'Quarter'])["sst"].mean().reset_index()
            df_q = df_q.rename(columns={"sst": "Sea_Surface_Temp_C_West"})

            # Melt flag: quarterly mean SST above 0.5; melt index: SST clipped to [0, 4] and scaled to [0, 1].
            df_q["Melt_Active_West"] = (df_q["Sea_Surface_Temp_C_West"] > 0.5).astype(int)
            df_q["Melt_Index_West"] = df_q["Sea_Surface_Temp_C_West"].clip(lower=0, upper=4) / 4

            west_quarters.append(df_q)
            print(f"{year} processed.")
        except requests.exceptions.RequestException as e:
            print(f"Failed for {year}: {e}")
            continue  # Continue to the next year instead of failing completely

    if west_quarters:
        df_sst_west_clean = pd.concat(west_quarters, ignore_index=True)
        validate_dataframe(df_sst_west_clean, expected_columns, dtypes)

        df_sst_west_clean.to_sql('sst_west', conn, if_exists='append', index=False)
        conn.commit()
        print("Saved SST West to SQLite table 'sst_west'")
    else:
        print("No data retrieved for SST West.")
        df_sst_west_clean = pd.DataFrame(columns=["Year", "Quarter", "Sea_Surface_Temp_C_West", "Melt_Active_West", "Melt_Index_West"])

# Final display
print("Final SST West DataFrame:")
if df_sst_west_clean is not None:
    print("Final SST dataset shape:", df_sst_west_clean.shape)
    display(df_sst_west_clean.head())
else:
    print("Error: df_sst_west_clean not created due to API failure.")

Bounding box indices: {'lat_min': 620, 'lat_max': 640, 'lon_min': 500, 'lon_max': 520}
Loading SST West from SQLite database
Loaded DataFrame from SQLite:
Loaded data is valid.
Final SST West DataFrame:
Final SST dataset shape: (4, 5)


Unnamed: 0,Year,Quarter,Sea_Surface_Temp_C_West,Melt_Active_West,Melt_Index_West
0,2011,Q1,0.2,0,0.05
1,2011,Q2,1.0,1,0.25
2,2011,Q3,2.5,1,0.625
3,2011,Q4,0.8,1,0.2


In [7]:
# VARIABLE 4: EAST GREENLAND SST AND MELT INDICATORS

df_sst_east_clean = None

# Degree -> grid index helpers (0.25-degree spacing)
# NOTE(review): assumes index 0 sits exactly at -90 / -180 degrees — confirm against the dataset's grid.
def deg_to_index_lat(lat):
    """Return the rounded 0.25-degree step count of ``lat`` measured from -90."""
    shifted = lat + 90
    return int(round(shifted / 0.25))


def deg_to_index_lon(lon):
    """Return the rounded 0.25-degree step count of ``lon`` measured from -180."""
    shifted = lon + 180
    return int(round(shifted / 0.25))

# East Greenland bounding box, in degrees
bbox_deg_east = dict(lat_min=65.0, lat_max=70.0, lon_min=-40.0, lon_max=-35.0)

# Same box expressed as grid indices: 'lat_*' edges go through the latitude
# helper, 'lon_*' edges through the longitude helper.
bbox_idx_east = {
    edge: (deg_to_index_lat if edge.startswith('lat') else deg_to_index_lon)(deg)
    for edge, deg in bbox_deg_east.items()
}
print("East Greenland bounding box indices:", bbox_idx_east)

# Check if data exists in the database.
# Strategy: prefer rows already stored in SQLite; query ERDDAP only when the
# table is missing, empty, or fails validation.
expected_columns = ["Year", "Quarter", "Sea_Surface_Temp_C_East", "Melt_Active_East", "Melt_Index_East"]
dtypes = {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_East": float, "Melt_Active_East": int, "Melt_Index_East": float}

cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='sst_east'")
if cursor.fetchone():
    print("Loading SST East from SQLite database")
    df_sst_east_clean = pd.read_sql_query("SELECT * FROM sst_east", conn)
    print("Loaded DataFrame from SQLite:")
    if df_sst_east_clean.empty:
        # A table with no rows would otherwise pass validation and be used as
        # the final dataset; treat it as a cache miss instead.
        print("sst_east table is empty.")
        df_sst_east_clean = None
    else:
        try:
            validate_dataframe(df_sst_east_clean, expected_columns, dtypes)
            print("Loaded data is valid.")
        except ValueError as e:
            print(f"Validation error: {e}. Dropping and recreating table...")
            cursor.execute("DROP TABLE sst_east")
            conn.commit()
            execute_sql_script(conn, 'ddl.sql')  # Recreate the schema
            df_sst_east_clean = None

if df_sst_east_clean is None:
    print("sst_east table not found or invalid, querying API...")
    years = list(range(2011, 2025))
    east_quarters = []
    for year in years:
        print(f"Processing year: {year}")
        try:
            base = "https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?"
            var = "sst"
            time = f"[({year}-01-01T00:00:00Z):1:({year}-12-31T00:00:00Z)]".replace(" ", "")
            zlev = "[0:1:0]"
            # In a griddap constraint a value in parentheses is a coordinate
            # value (degrees), while a bare number is an array index. The
            # request therefore sends the bounding box in degrees; wrapping the
            # grid indices from bbox_idx_east in parentheses asked for latitude
            # "620", which is outside the grid.
            lat = f"[({bbox_deg_east['lat_min']}):1:({bbox_deg_east['lat_max']})]"
            lon = f"[({bbox_deg_east['lon_min']}):1:({bbox_deg_east['lon_max']})]"
            query = f"{var}{time}{zlev}{lat}{lon}"
            full_url = base + quote(query, safe=":/[](),-T")
            print("Constructed URL:", full_url)

            response = requests.get(full_url, timeout=30)
            response.raise_for_status()

            # Row 1 of the ERDDAP CSV carries the units, not data.
            df = pd.read_csv(StringIO(response.text), skiprows=[1])
            df = df.rename(columns={col: col.strip() for col in df.columns})
            df = df.dropna(subset=["sst"])

            df["time"] = pd.to_datetime(df["time"])
            # ERDDAP resolves parenthesised values to the nearest grid point, so
            # a range end may land on a timestamp in the neighbouring year. Keep
            # only the requested year so no quarter is counted twice.
            df = df[df["time"].dt.year == year].copy()
            df["Year"] = df["time"].dt.year.astype(int)
            df["Quarter"] = "Q" + df["time"].dt.quarter.astype(str)

            df_q = df.groupby(['Year', 'Quarter'])["sst"].mean().reset_index()
            df_q = df_q.rename(columns={"sst": "Sea_Surface_Temp_C_East"})

            # Melt flag: quarterly mean SST above 0.5; melt index: SST clipped to [0, 4] and scaled to [0, 1].
            df_q["Melt_Active_East"] = (df_q["Sea_Surface_Temp_C_East"] > 0.5).astype(int)
            df_q["Melt_Index_East"] = df_q["Sea_Surface_Temp_C_East"].clip(lower=0, upper=4) / 4

            east_quarters.append(df_q)
            print(f"{year} processed.")
        except requests.exceptions.RequestException as e:
            print(f"Failed for {year}: {e}")
            continue  # Skip this year; keep whatever the other years return

    if east_quarters:
        df_sst_east_clean = pd.concat(east_quarters, ignore_index=True)
        validate_dataframe(df_sst_east_clean, expected_columns, dtypes)

        df_sst_east_clean.to_sql('sst_east', conn, if_exists='append', index=False)
        conn.commit()
        print("Saved SST East to SQLite table 'sst_east'")
    else:
        print("No data retrieved for SST East.")
        df_sst_east_clean = pd.DataFrame(columns=["Year", "Quarter", "Sea_Surface_Temp_C_East", "Melt_Active_East", "Melt_Index_East"])

# Final display
print("Final SST East DataFrame:")
if df_sst_east_clean is not None:
    print("Final SST East dataset shape:", df_sst_east_clean.shape)
    display(df_sst_east_clean.head())
else:
    print("Error: df_sst_east_clean not created due to API failure.")

East Greenland bounding box indices: {'lat_min': 620, 'lat_max': 640, 'lon_min': 560, 'lon_max': 580}
Loading SST East from SQLite database
Loaded DataFrame from SQLite:
Loaded data is valid.
Final SST East DataFrame:
Final SST East dataset shape: (4, 5)


Unnamed: 0,Year,Quarter,Sea_Surface_Temp_C_East,Melt_Active_East,Melt_Index_East
0,2011,Q1,0.3,0,0.075
1,2011,Q2,1.2,1,0.3
2,2011,Q3,2.8,1,0.7
3,2011,Q4,0.9,1,0.225


In [8]:
# VARIABLE 5: SOUTH GREENLAND SST AND MELT INDICATORS

df_sst_south_clean = None

# Degree -> grid index helpers (0.25-degree spacing)
# NOTE(review): assumes index 0 sits exactly at -90 / -180 degrees — confirm against the dataset's grid.
def deg_to_index_lat(lat):
    """Return the rounded 0.25-degree step count of ``lat`` measured from -90."""
    shifted = lat + 90
    return int(round(shifted / 0.25))


def deg_to_index_lon(lon):
    """Return the rounded 0.25-degree step count of ``lon`` measured from -180."""
    shifted = lon + 180
    return int(round(shifted / 0.25))

# South Greenland bounding box, in degrees
bbox_deg_south = dict(lat_min=60.0, lat_max=65.0, lon_min=-45.0, lon_max=-40.0)

# Same box expressed as grid indices: 'lat_*' edges go through the latitude
# helper, 'lon_*' edges through the longitude helper.
bbox_idx_south = {
    edge: (deg_to_index_lat if edge.startswith('lat') else deg_to_index_lon)(deg)
    for edge, deg in bbox_deg_south.items()
}
print("South Greenland bounding box indices:", bbox_idx_south)

# Check if data exists in the database.
# Strategy: prefer rows already stored in SQLite; query ERDDAP only when the
# table is missing, empty, or fails validation.
expected_columns = ["Year", "Quarter", "Sea_Surface_Temp_C_South", "Melt_Active_South", "Melt_Index_South"]
dtypes = {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_South": float, "Melt_Active_South": int, "Melt_Index_South": float}

cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='sst_south'")
if cursor.fetchone():
    print("Loading SST South from SQLite database")
    df_sst_south_clean = pd.read_sql_query("SELECT * FROM sst_south", conn)
    print("Loaded DataFrame from SQLite:")
    if df_sst_south_clean.empty:
        # A table with no rows would otherwise pass validation and be used as
        # the final dataset; treat it as a cache miss instead.
        print("sst_south table is empty.")
        df_sst_south_clean = None
    else:
        try:
            validate_dataframe(df_sst_south_clean, expected_columns, dtypes)
            print("Loaded data is valid.")
        except ValueError as e:
            print(f"Validation error: {e}. Dropping and recreating table...")
            cursor.execute("DROP TABLE sst_south")
            conn.commit()
            execute_sql_script(conn, 'ddl.sql')  # Recreate the schema
            df_sst_south_clean = None

if df_sst_south_clean is None:
    print("sst_south table not found or invalid, querying API...")
    years = list(range(2011, 2025))
    south_quarters = []
    for year in years:
        print(f"Processing year: {year}")
        try:
            base = "https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?"
            var = "sst"
            time = f"[({year}-01-01T00:00:00Z):1:({year}-12-31T00:00:00Z)]".replace(" ", "")
            zlev = "[0:1:0]"
            # In a griddap constraint a value in parentheses is a coordinate
            # value (degrees), while a bare number is an array index. The
            # request therefore sends the bounding box in degrees; wrapping the
            # grid indices from bbox_idx_south in parentheses asked for latitude
            # "600", which is outside the grid.
            lat = f"[({bbox_deg_south['lat_min']}):1:({bbox_deg_south['lat_max']})]"
            lon = f"[({bbox_deg_south['lon_min']}):1:({bbox_deg_south['lon_max']})]"
            query = f"{var}{time}{zlev}{lat}{lon}"
            full_url = base + quote(query, safe=":/[](),-T")
            print("Constructed URL:", full_url)

            response = requests.get(full_url, timeout=30)
            response.raise_for_status()

            # Row 1 of the ERDDAP CSV carries the units, not data.
            df = pd.read_csv(StringIO(response.text), skiprows=[1])
            df = df.rename(columns={col: col.strip() for col in df.columns})
            df = df.dropna(subset=["sst"])

            df["time"] = pd.to_datetime(df["time"])
            # ERDDAP resolves parenthesised values to the nearest grid point, so
            # a range end may land on a timestamp in the neighbouring year. Keep
            # only the requested year so no quarter is counted twice.
            df = df[df["time"].dt.year == year].copy()
            df["Year"] = df["time"].dt.year.astype(int)
            df["Quarter"] = "Q" + df["time"].dt.quarter.astype(str)

            df_q = df.groupby(['Year', 'Quarter'])["sst"].mean().reset_index()
            df_q = df_q.rename(columns={"sst": "Sea_Surface_Temp_C_South"})

            # Melt flag: quarterly mean SST above 0.5; melt index: SST clipped to [0, 4] and scaled to [0, 1].
            df_q["Melt_Active_South"] = (df_q["Sea_Surface_Temp_C_South"] > 0.5).astype(int)
            df_q["Melt_Index_South"] = df_q["Sea_Surface_Temp_C_South"].clip(lower=0, upper=4) / 4

            south_quarters.append(df_q)
            print(f"{year} processed.")
        except requests.exceptions.RequestException as e:
            print(f"Failed for {year}: {e}")
            continue  # Skip this year; keep whatever the other years return

    if south_quarters:
        df_sst_south_clean = pd.concat(south_quarters, ignore_index=True)
        validate_dataframe(df_sst_south_clean, expected_columns, dtypes)

        df_sst_south_clean.to_sql('sst_south', conn, if_exists='append', index=False)
        conn.commit()
        print("Saved SST South to SQLite table 'sst_south'")
    else:
        print("No data retrieved for SST South.")
        df_sst_south_clean = pd.DataFrame(columns=["Year", "Quarter", "Sea_Surface_Temp_C_South", "Melt_Active_South", "Melt_Index_South"])

# Final display
print("Final SST South DataFrame:")
if df_sst_south_clean is not None:
    print("Final SST South dataset shape:", df_sst_south_clean.shape)
    display(df_sst_south_clean.head())
else:
    print("Error: df_sst_south_clean not created due to API failure.")

South Greenland bounding box indices: {'lat_min': 600, 'lat_max': 620, 'lon_min': 540, 'lon_max': 560}
Loading SST South from SQLite database
Loaded DataFrame from SQLite:
Loaded data is valid.
Final SST South DataFrame:
Final SST South dataset shape: (4, 5)


Unnamed: 0,Year,Quarter,Sea_Surface_Temp_C_South,Melt_Active_South,Melt_Index_South
0,2011,Q1,0.5,0,0.125
1,2011,Q2,1.5,1,0.375
2,2011,Q3,3.0,1,0.75
3,2011,Q4,1.0,1,0.25


In [9]:
# VARIABLE 6: TOTAL CATCH BY FOREIGN VESSELS

df_foreign_clean = None

# Check if data exists in the database.
# Strategy: prefer rows already stored in SQLite; query the bank.stat.gl API only
# when the table is missing, empty, or fails validation.
expected_columns = ["Year", "Quarter", "Unit", "Foreign_Catch"]
dtypes = {"Year": int, "Quarter": str, "Unit": str, "Foreign_Catch": int}

cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='foreign_catch'")
if cursor.fetchone():
    print("Loading Foreign Catch from SQLite database")
    df_foreign_clean = pd.read_sql_query("SELECT * FROM foreign_catch", conn)
    print("Loaded DataFrame from SQLite:")
    if df_foreign_clean.empty:
        # A table with no rows would otherwise pass validation and be used as
        # the final dataset; treat it as a cache miss instead.
        print("foreign_catch table is empty.")
        df_foreign_clean = None
    else:
        try:
            validate_dataframe(df_foreign_clean, expected_columns, dtypes)
            print("Loaded data is valid.")
        except ValueError as e:
            print(f"Validation error: {e}. Dropping and recreating table...")
            cursor.execute("DROP TABLE foreign_catch")
            conn.commit()
            execute_sql_script(conn, 'ddl.sql')  # Recreate the schema
            df_foreign_clean = None

if df_foreign_clean is None:
    print("foreign_catch table not found or invalid, querying API...")
    url = "https://bank.stat.gl:443/api/v1/en/Greenland/FI/FI10/FIX008.px"
    query = {
        "query": [
            {"code": "nation", "selection": {"filter": "item", "values": ["Foreign"]}},
            {"code": "unit", "selection": {"filter": "item", "values": ["Ton"]}},
            {"code": "time", "selection": {"filter": "item", "values": [str(y) for y in range(2011, 2025)]}},
            {"code": "quarter", "selection": {"filter": "item", "values": ["1", "2", "3", "4"]}}
        ],
        "response": {"format": "json-stat2"}
    }
    try:
        response = requests.post(url, json=query, timeout=30)
        response.raise_for_status()
        dataset = pyjstat.Dataset.read(response.text)
        df = dataset.write('dataframe')
        print("Data successfully retrieved and converted to DataFrame!")

        # Clean DataFrame (build a new frame; the raw `df` is left untouched)
        df_foreign_clean = df.drop(columns=['nation']).rename(columns={
            "time": "Year",
            "quarter": "Quarter",
            "unit": "Unit",
            "value": "Foreign_Catch"
        })
        df_foreign_clean["Quarter"] = df_foreign_clean["Quarter"].str.replace("Quarter ", "Q")
        quarter_order = ["Q1", "Q2", "Q3", "Q4"]
        df_foreign_clean["Quarter"] = pd.Categorical(df_foreign_clean["Quarter"], categories=quarter_order, ordered=True)
        # .copy() so the assignment below targets an independent frame rather
        # than a column selection (avoids SettingWithCopyWarning).
        df_foreign_clean = df_foreign_clean[["Year", "Quarter", "Unit", "Foreign_Catch"]].copy()
        df_foreign_clean["Year"] = df_foreign_clean["Year"].astype(int)

        # Validate before saving to SQLite
        validate_dataframe(df_foreign_clean, expected_columns, dtypes)

        # Insert data into table (schema already created)
        df_foreign_clean.to_sql('foreign_catch', conn, if_exists='append', index=False)
        conn.commit()
        print("Saved Foreign Catch to SQLite table 'foreign_catch'")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching Foreign Catch data: {e}")
        df_foreign_clean = pd.DataFrame(columns=["Year", "Quarter", "Unit", "Foreign_Catch"])  # Empty DataFrame as fallback

# Final display
print("Final Foreign Catch DataFrame:")
display(df_foreign_clean.head())

Loading Foreign Catch from SQLite database
Loaded DataFrame from SQLite:
Loaded data is valid.
Final Foreign Catch DataFrame:


Unnamed: 0,Year,Quarter,Unit,Foreign_Catch
0,2011,Q1,Ton,10000
1,2011,Q2,Ton,12000
2,2011,Q3,Ton,15000
3,2011,Q4,Ton,11000


In [10]:
# Save the in-memory database to a file

import sqlite3

# Create a new on-disk database
disk_conn = sqlite3.connect('/content/greenland_fishery.db')

try:
    # Backup the in-memory database to the on-disk database.
    # `with disk_conn` only scopes a transaction (commit/rollback); it does not
    # close the connection, hence the explicit finally below.
    with disk_conn:
        conn.backup(disk_conn)
finally:
    # Close the on-disk connection even if the backup raised
    disk_conn.close()

print("In-memory database saved to /content/greenland_fishery.db")

In-memory database saved to /content/greenland_fishery.db


In [12]:
import shutil

from google.colab import drive
drive.mount('/content/drive')

# Copy the database file to Google Drive.
# shutil.copy raises if the source is missing or the destination is not
# writable, so the confirmation below is printed only after a successful copy
# (a failing `!cp` would have been ignored and the message printed anyway).
shutil.copy('/content/greenland_fishery.db', '/content/drive/MyDrive/greenland_fishery.db')
print("Database saved to Google Drive at /content/drive/MyDrive/greenland_fishery.db")

Mounted at /content/drive
Database saved to Google Drive at /content/drive/MyDrive/greenland_fishery.db
