<a href="https://colab.research.google.com/github/ccspen21/greenland-fishery-nowcast-2025/blob/main/setup_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
!git clone https://github.com/ccspen21/greenland-fishery-nowcast-2025.git

fatal: destination path 'greenland-fishery-nowcast-2025' already exists and is not an empty directory.


In [49]:
!pip install requests xarray pandas pyjstat datetime pydap netCDF4
import os
import sqlite3
import pandas as pd
import requests
import time as time_module  # Rename to avoid shadowing
from pyjstat import pyjstat
from urllib.parse import quote
from io import StringIO
from IPython.display import display
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Ensure compatibility with Colab and GitHub
!apt-get update && apt-get install -y iputils-ping

# Define a configurable database path
DB_PATH = os.getenv("DB_PATH", "greenland_fishery.db")

# Helper function to validate DataFrame against schema
def validate_dataframe(df, expected_columns, dtypes):
    if df.empty:
        raise ValueError("DataFrame is empty, no rows found.")
    if not all(col in df.columns for col in expected_columns):
        raise ValueError(f"DataFrame missing expected columns: {expected_columns}")
    for col, dtype in dtypes.items():
        if col in df.columns:
            if dtype == int:
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
            else:
                df[col] = df[col].astype(dtype)
    if df.isnull().any().any():
        raise ValueError(f"DataFrame contains NaN values: {df.head()}")

# Helper function for API calls with retries and exponential backoff
def fetch_with_retries(url, max_retries=3, timeout=60, method='get', json=None):
    for attempt in range(max_retries):
        try:
            if method == 'get':
                response = requests.get(url, timeout=timeout)
            else:
                response = requests.post(url, json=json, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt+1} failed: {e}")
            if attempt + 1 == max_retries:
                raise
            time_module.sleep(2 ** attempt)  # Use time_module to avoid shadowing

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Fetched 257 kB in 1s (203 kB/s)
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package list

In [50]:
%cd /content/greenland-fishery-nowcast-2025

import sqlite3

# Create a connection to an in-memory SQLite database
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()
print("Connected to SQLite in-memory database")

# Load and execute the DDL script to create the database schema
def execute_sql_script(conn, file_path):
    try:
        with open(file_path, 'r') as f:
            conn.executescript(f.read())
        conn.commit()
        print(f"Successfully executed SQL script: {file_path}")
    except Exception as e:
        print(f"Error executing SQL script {file_path}: {e}")
        raise

# Execute only the DDL script (remove DML population)
execute_sql_script(conn, 'ddl.sql')
# Remove this line:
# execute_sql_script(conn, 'dml_populate.sql')

/content/greenland-fishery-nowcast-2025
Connected to SQLite in-memory database
Successfully executed SQL script: ddl.sql


In [51]:
# LOAD VAR 1 TOTAL CATCH

df_clean = None

# Check if data exists in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='total_catch'")
if cursor.fetchone():
    print("Loading Total Catch from SQLite database")
    df_clean = pd.read_sql_query("SELECT * FROM total_catch", conn)
    print("Loaded DataFrame from SQLite:")
    expected_columns = ["Year", "Quarter", "Unit", "Total_Catch"]
    dtypes = {"Year": int, "Quarter": str, "Unit": str, "Total_Catch": int}
    try:
        validate_dataframe(df_clean, expected_columns, dtypes)
        print("Loaded data is valid.")
    except ValueError as e:
        print(f"Validation error: {e}. Dropping and recreating table...")
        cursor.execute("DROP TABLE total_catch")
        conn.commit()
        # Recreate the table
        cursor.execute("""
            CREATE TABLE total_catch (
                Year INTEGER NOT NULL,
                Quarter TEXT NOT NULL,
                Unit TEXT,
                Total_Catch INTEGER,
                PRIMARY KEY (Year, Quarter)
            )
        """)
        conn.commit()
        df_clean = None

if df_clean is None:
    print("total_catch table not found or invalid, querying API...")
    url = "https://bank.stat.gl:443/api/v1/en/Greenland/FI/FI10/FIX008.px"
    query = {
        "query": [
            {"code": "nation", "selection": {"filter": "item", "values": ["GRL"]}},
            {"code": "unit", "selection": {"filter": "item", "values": ["Ton"]}},
            {"code": "time", "selection": {"filter": "item", "values": [str(y) for y in range(2011, 2025)]}},
            {"code": "quarter", "selection": {"filter": "item", "values": ["1", "2", "3", "4"]}}
        ],
        "response": {"format": "json-stat2"}
    }
    try:
        response = fetch_with_retries(url, max_retries=3, timeout=60, method='post', json=query)
        dataset = pyjstat.Dataset.read(response.text)
        df = dataset.write('dataframe')
        print("Data successfully retrieved and converted to DataFrame!")

        # Clean DataFrame
        df_clean = df.copy()
        df_clean.drop(columns=['nation'], inplace=True)
        df_clean.rename(columns={
            "time": "Year",
            "quarter": "Quarter",
            "unit": "Unit",
            "value": "Total_Catch"
        }, inplace=True)
        df_clean["Quarter"] = df_clean["Quarter"].str.replace("Quarter ", "Q")
        quarter_order = ["Q1", "Q2", "Q3", "Q4"]
        df_clean["Quarter"] = pd.Categorical(df_clean["Quarter"], categories=quarter_order, ordered=True)
        df_clean = df_clean[["Year", "Quarter", "Unit", "Total_Catch"]]
        df_clean["Year"] = df_clean["Year"].astype(int)

        # Validate before saving to SQLite
        expected_columns = ["Year", "Quarter", "Unit", "Total_Catch"]
        dtypes = {"Year": int, "Quarter": str, "Unit": str, "Total_Catch": int}
        validate_dataframe(df_clean, expected_columns, dtypes)

        # Insert data into table
        df_clean.to_sql('total_catch', conn, if_exists='append', index=False)
        conn.commit()
        print("Saved Total Catch to SQLite table 'total_catch'")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching Total Catch data: {e}")
        df_clean = pd.DataFrame(columns=["Year", "Quarter", "Unit", "Total_Catch"])  # Empty DataFrame as fallback

# Final display
print("Final Total Catch DataFrame:")
display(df_clean)

Loading Total Catch from SQLite database
Loaded DataFrame from SQLite:
Validation error: DataFrame is empty, no rows found.. Dropping and recreating table...
total_catch table not found or invalid, querying API...
Data successfully retrieved and converted to DataFrame!
Saved Total Catch to SQLite table 'total_catch'
Final Total Catch DataFrame:


Unnamed: 0,Year,Quarter,Unit,Total_Catch
0,2011,Q1,Tonnes,25637
1,2011,Q2,Tonnes,16922
2,2011,Q3,Tonnes,34917
3,2011,Q4,Tonnes,24504
4,2012,Q1,Tonnes,46621
5,2012,Q2,Tonnes,16689
6,2012,Q3,Tonnes,25552
7,2012,Q4,Tonnes,19599
8,2013,Q1,Tonnes,48012
9,2013,Q2,Tonnes,17395


In [52]:
### Variable 2: Exports of Fish

df_fish_clean = None

# Check if data exists in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='fish_exports'")
if cursor.fetchone():
    print("Loading Fish Exports from SQLite database")
    df_fish_clean = pd.read_sql_query("SELECT * FROM fish_exports", conn)
    print("Loaded DataFrame from SQLite:")
    expected_columns = ["Year", "Quarter", "Fish_Export_Value_Million_Kr"]
    dtypes = {"Year": int, "Quarter": str, "Fish_Export_Value_Million_Kr": int}
    try:
        validate_dataframe(df_fish_clean, expected_columns, dtypes)
        print("Loaded data is valid.")
    except ValueError as e:
        print(f"Validation error: {e}. Dropping and recreating table...")
        cursor.execute("DROP TABLE fish_exports")
        conn.commit()
        df_fish_clean = None

if df_fish_clean is None:
    print("fish_exports table not found or invalid, querying API...")
    url = "https://bank.stat.gl:443/api/v1/en/Greenland/IE/IE10/IEX2PROD.px"
    query = {
        "query": [
            {"code": "branch", "selection": {"filter": "item", "values": ["46"]}},
            {"code": "quarter", "selection": {"filter": "item", "values": ["1", "2", "3", "4"]}},
            {"code": "time", "selection": {"filter": "item", "values": [str(y) for y in range(2011, 2025)]}}
        ],
        "response": {"format": "json-stat2"}
    }
    try:
        response = fetch_with_retries(url, max_retries=3, timeout=60, method='post', json=query)
        dataset = pyjstat.Dataset.read(response.text)
        df_fish_exports = dataset.write('dataframe')
        print("Fish export data successfully retrieved!")

        # Clean DataFrame
        df_fish_clean = df_fish_exports.copy()
        column_mapping = {
            "time": "Year",
            "quarter": "Quarter",
            "value": "Fish_Export_Value_Million_Kr"
        }
        if "quarter" not in df_fish_exports.columns and "Quarter" in df_fish_exports.columns:
            column_mapping["Quarter"] = "Quarter"
            del column_mapping["quarter"]
        df_fish_clean.rename(columns=column_mapping, inplace=True)

        if "Quarter" not in df_fish_clean.columns:
            raise ValueError("Quarter column missing after renaming.")
        if df_fish_clean["Quarter"].isnull().any():
            raise ValueError("Quarter column contains NaN values after renaming: " + str(df_fish_clean["Quarter"].head()))

        # Fix quarters (case-insensitive replacement)
        df_fish_clean["Quarter"] = df_fish_clean["Quarter"].str.replace(r"[Qq]uarter ", "Q", regex=True)
        quarter_order = ["Q1", "Q2", "Q3", "Q4"]
        df_fish_clean["Quarter"] = pd.Categorical(df_fish_clean["Quarter"], categories=quarter_order, ordered=True)
        df_fish_clean = df_fish_clean.sort_values(by=["Year", "Quarter"]).reset_index(drop=True)

        if df_fish_clean["Quarter"].isnull().any():
            raise ValueError("Quarter column contains NaN values after transformation: " + str(df_fish_clean["Quarter"].head()))

        # Convert export value to million Kr and round
        df_fish_clean["Fish_Export_Value_Million_Kr"] = df_fish_clean["Fish_Export_Value_Million_Kr"] / 1e6
        df_fish_clean["Fish_Export_Value_Million_Kr"] = df_fish_clean["Fish_Export_Value_Million_Kr"].round(0).astype(int)
        df_fish_clean = df_fish_clean[["Year", "Quarter", "Fish_Export_Value_Million_Kr"]]
        df_fish_clean["Year"] = df_fish_clean["Year"].astype(int)

        # Validate before saving to SQLite
        expected_columns = ["Year", "Quarter", "Fish_Export_Value_Million_Kr"]
        dtypes = {"Year": int, "Quarter": str, "Fish_Export_Value_Million_Kr": int}
        validate_dataframe(df_fish_clean, expected_columns, dtypes)

        # Create table with primary key constraint
        cursor.execute("""
            CREATE TABLE fish_exports (
                Year INTEGER NOT NULL,
                Quarter TEXT NOT NULL,
                Fish_Export_Value_Million_Kr INTEGER,
                PRIMARY KEY (Year, Quarter)
            )
        """)

        # Insert data into table
        df_fish_clean.to_sql('fish_exports', conn, if_exists='append', index=False)
        conn.commit()
        print("Saved Fish Exports to SQLite table 'fish_exports'")
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve data from API: {e}")
        df_fish_clean = pd.DataFrame(columns=["Year", "Quarter", "Fish_Export_Value_Million_Kr"])  # Empty DataFrame as fallback

# Final display
print("Final Fish Exports DataFrame:")
display(df_fish_clean.head())

Loading Fish Exports from SQLite database
Loaded DataFrame from SQLite:
Validation error: DataFrame is empty, no rows found.. Dropping and recreating table...
fish_exports table not found or invalid, querying API...
Fish export data successfully retrieved!
Saved Fish Exports to SQLite table 'fish_exports'
Final Fish Exports DataFrame:


Unnamed: 0,Year,Quarter,Fish_Export_Value_Million_Kr
0,2011,Q1,149
1,2011,Q2,174
2,2011,Q3,181
3,2011,Q4,192
4,2012,Q1,133


In [53]:
# VARIABLE 3: WEST GREENLAND SST

df_sst_west_clean = None

# Degree to ERDDAP grid index conversion
def deg_to_index_lat(lat): return lat  # Direct degrees for new dataset
def deg_to_index_lon(lon): return lon  # Direct degrees for new dataset

# Define bounding box in degrees
bbox_deg = {
    'lat_min': 65.0,
    'lat_max': 70.0,
    'lon_min': -55.0,
    'lon_max': -50.0
}

# Use degrees directly
bbox_idx = {
    'lat_min': bbox_deg['lat_min'],
    'lat_max': bbox_deg['lat_max'],
    'lon_min': bbox_deg['lon_min'],
    'lon_max': bbox_deg['lon_max']
}
print("Bounding box indices:", bbox_idx)

# Check if data exists in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='sst_west'")
if cursor.fetchone():
    print("Loading SST West from SQLite database")
    df_sst_west_clean = pd.read_sql_query("SELECT * FROM sst_west", conn)
    print("Loaded DataFrame from SQLite:")
    expected_columns = ["Year", "Quarter", "Sea_Surface_Temp_C_West", "Melt_Active_West", "Melt_Index_West"]
    dtypes = {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_West": float, "Melt_Active_West": int, "Melt_Index_West": float}
    try:
        validate_dataframe(df_sst_west_clean, expected_columns, dtypes)
        print("Loaded data is valid.")
    except ValueError as e:
        print(f"Validation error: {e}. Dropping and recreating table...")
        cursor.execute("DROP TABLE sst_west")
        conn.commit()
        cursor.execute("""
            CREATE TABLE sst_west (
                Year INTEGER NOT NULL,
                Quarter TEXT NOT NULL,
                Sea_Surface_Temp_C_West REAL,
                Melt_Active_West INTEGER,
                Melt_Index_West REAL,
                PRIMARY KEY (Year, Quarter)
            )
        """)
        conn.commit()
        df_sst_west_clean = None

if df_sst_west_clean is None:
    print("sst_west table not found or invalid, querying API...")
    years = list(range(2011, 2025))
    west_quarters = []
    for year in years:
        print(f"Processing year: {year}")
        try:
            base = "https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?"
            var = "sst"
            time = f"[({year}-01-01T00:00:00Z):1:({year}-12-31T00:00:00Z)]".replace(" ", "")
            zlev = "[0:1:0]"
            lat = f"[({bbox_idx['lat_min']}):1:({bbox_idx['lat_max']})]"
            lon = f"[({bbox_idx['lon_min']}):1:({bbox_idx['lon_max']})]"
            query = f"{var}{time}{zlev}{lat}{lon}"
            full_url = base + quote(query, safe=":/[](),-T")
            print("Constructed URL:", full_url)

            response = requests.get(full_url)
            if response.status_code != 200:
                raise ValueError(f"HTTP {response.status_code}: {response.text}")

            df = pd.read_csv(StringIO(response.text), skiprows=[1])
            df = df.rename(columns={col: col.strip() for col in df.columns})
            df = df.dropna(subset=["sst"])

            df["time"] = pd.to_datetime(df["time"])
            df["Year"] = df["time"].dt.year.astype(int)
            df["Quarter"] = "Q" + df["time"].dt.quarter.astype(str)

            df_q = df.groupby(['Year', 'Quarter'])["sst"].mean().reset_index()
            df_q = df_q.rename(columns={"sst": "Sea_Surface_Temp_C_West"})

            df_q["Melt_Active_West"] = (df_q["Sea_Surface_Temp_C_West"] > 0.5).astype(int)
            df_q["Melt_Index_West"] = df_q["Sea_Surface_Temp_C_West"].clip(lower=0, upper=4) / 4

            west_quarters.append(df_q)
            print(f"{year} processed.")
        except Exception as e:
            print(f"Failed for {year}: {e}")

    if west_quarters:
        df_sst_west_clean = pd.concat(west_quarters).reset_index(drop=True)
        expected_columns = ["Year", "Quarter", "Sea_Surface_Temp_C_West", "Melt_Active_West", "Melt_Index_West"]
        dtypes = {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_West": float, "Melt_Active_West": int, "Melt_Index_West": float}
        validate_dataframe(df_sst_west_clean, expected_columns, dtypes)

        df_sst_west_clean.to_sql('sst_west', conn, if_exists='append', index=False)
        conn.commit()
        print("Saved SST West to SQLite table 'sst_west'")
    else:
        print("No data retrieved for SST West.")
        df_sst_west_clean = pd.DataFrame(columns=["Year", "Quarter", "Sea_Surface_Temp_C_West", "Melt_Active_West", "Melt_Index_West"])

# Final display
print("Final SST West DataFrame:")
if df_sst_west_clean is not None:
    print("Final SST West dataset shape:", df_sst_west_clean.shape)
    display(df_sst_west_clean.head())
else:
    print("Error: df_sst_west_clean not created due to API failure.")

Bounding box indices: {'lat_min': 65.0, 'lat_max': 70.0, 'lon_min': -55.0, 'lon_max': -50.0}
Loading SST West from SQLite database
Loaded DataFrame from SQLite:
Validation error: DataFrame is empty, no rows found.. Dropping and recreating table...
sst_west table not found or invalid, querying API...
Processing year: 2011
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2011-01-01T00:00:00Z):1:(2011-12-31T00:00:00Z)][0:1:0][(65.0):1:(70.0)][(-55.0):1:(-50.0)]
2011 processed.
Processing year: 2012
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2012-01-01T00:00:00Z):1:(2012-12-31T00:00:00Z)][0:1:0][(65.0):1:(70.0)][(-55.0):1:(-50.0)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["time"] = pd.to_datetime(df["time"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"] = df["time"].dt.year.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Quarter"] = "Q" + df["time"].dt.quarter.astype(str)


2012 processed.
Processing year: 2013
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2013-01-01T00:00:00Z):1:(2013-12-31T00:00:00Z)][0:1:0][(65.0):1:(70.0)][(-55.0):1:(-50.0)]
2013 processed.
Processing year: 2014
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2014-01-01T00:00:00Z):1:(2014-12-31T00:00:00Z)][0:1:0][(65.0):1:(70.0)][(-55.0):1:(-50.0)]
2014 processed.
Processing year: 2015
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2015-01-01T00:00:00Z):1:(2015-12-31T00:00:00Z)][0:1:0][(65.0):1:(70.0)][(-55.0):1:(-50.0)]
2015 processed.
Processing year: 2016
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2016-01-01T00:00:00Z):1:(2016-12-31T00:00:00Z)][0:1:0][(65.0):1:(70.0)][(-55.0):1:(-50.0)]
2016 processed.
Processing year: 2017
Constructed URL: https://coastwatch.pfeg.noaa.gov/erdd

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["time"] = pd.to_datetime(df["time"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"] = df["time"].dt.year.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Quarter"] = "Q" + df["time"].dt.quarter.astype(str)


2023 processed.
Processing year: 2024
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2024-01-01T00:00:00Z):1:(2024-12-31T00:00:00Z)][0:1:0][(65.0):1:(70.0)][(-55.0):1:(-50.0)]
2024 processed.
Saved SST West to SQLite table 'sst_west'
Final SST West DataFrame:
Final SST West dataset shape: (56, 5)


Unnamed: 0,Year,Quarter,Sea_Surface_Temp_C_West,Melt_Active_West,Melt_Index_West
0,2011,Q1,0.342774,0,0.085694
1,2011,Q2,1.515407,1,0.378852
2,2011,Q3,5.736561,1,1.0
3,2011,Q4,1.325243,1,0.331311
4,2012,Q1,-0.368894,0,0.0


In [54]:
# Variable 4: East Greenland SST, Melt

df_sst_east_clean = None

# Degree to ERDDAP grid index conversion
def deg_to_index_lat(lat): return lat  # Direct degrees for new dataset
def deg_to_index_lon(lon): return lon  # Direct degrees for new dataset

# Define bounding box in degrees for East Greenland
bbox_deg_east = {
    'lat_min': 65.0,
    'lat_max': 70.0,
    'lon_min': -40.0,
    'lon_max': -35.0
}

# Use degrees directly
bbox_idx_east = {
    'lat_min': bbox_deg_east['lat_min'],
    'lat_max': bbox_deg_east['lat_max'],
    'lon_min': bbox_deg_east['lon_min'],
    'lon_max': bbox_deg_east['lon_max']
}
print("East Greenland bounding box indices:", bbox_idx_east)

# Check if data exists in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='sst_east'")
if cursor.fetchone():
    print("Loading SST East from SQLite database")
    df_sst_east_clean = pd.read_sql_query("SELECT * FROM sst_east", conn)
    print("Loaded DataFrame from SQLite:")
    expected_columns = ["Year", "Quarter", "Sea_Surface_Temp_C_East", "Melt_Active_East", "Melt_Index_East"]
    dtypes = {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_East": float, "Melt_Active_East": int, "Melt_Index_East": float}
    try:
        validate_dataframe(df_sst_east_clean, expected_columns, dtypes)
        print("Loaded data is valid.")
    except ValueError as e:
        print(f"Validation error: {e}. Dropping and recreating table...")
        cursor.execute("DROP TABLE sst_east")
        conn.commit()
        cursor.execute("""
            CREATE TABLE sst_east (
                Year INTEGER NOT NULL,
                Quarter TEXT NOT NULL,
                Sea_Surface_Temp_C_East REAL,
                Melt_Active_East INTEGER,
                Melt_Index_East REAL,
                PRIMARY KEY (Year, Quarter)
            )
        """)
        conn.commit()
        df_sst_east_clean = None

if df_sst_east_clean is None:
    print("sst_east table not found or invalid, querying API...")
    years = list(range(2011, 2025))
    east_quarters = []
    for year in years:
        print(f"Processing year: {year}")
        try:
            base = "https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?"
            var = "sst"
            time = f"[({year}-01-01T00:00:00Z):1:({year}-12-31T00:00:00Z)]".replace(" ", "")
            zlev = "[0:1:0]"
            lat = f"[({bbox_idx_east['lat_min']}):1:({bbox_idx_east['lat_max']})]"
            lon = f"[({bbox_idx_east['lon_min']}):1:({bbox_idx_east['lon_max']})]"
            query = f"{var}{time}{zlev}{lat}{lon}"
            full_url = base + quote(query, safe=":/[](),-T")
            print("Constructed URL:", full_url)

            response = fetch_with_retries(full_url, max_retries=3, timeout=60)
            if response.status_code != 200:
                raise ValueError(f"HTTP {response.status_code}: {response.text}")

            df = pd.read_csv(StringIO(response.text), skiprows=[1])
            df = df.rename(columns={col: col.strip() for col in df.columns})
            df = df.dropna(subset=["sst"])

            df["time"] = pd.to_datetime(df["time"])
            df["Year"] = df["time"].dt.year.astype(int)
            df["Quarter"] = "Q" + df["time"].dt.quarter.astype(str)

            df_q = df.groupby(['Year', 'Quarter'])["sst"].mean().reset_index()
            df_q = df_q.rename(columns={"sst": "Sea_Surface_Temp_C_East"})

            df_q["Melt_Active_East"] = (df_q["Sea_Surface_Temp_C_East"] > 0.5).astype(int)
            df_q["Melt_Index_East"] = df_q["Sea_Surface_Temp_C_East"].clip(lower=0, upper=4) / 4

            east_quarters.append(df_q)
            print(f"{year} processed.")
        except Exception as e:
            print(f"Failed for {year}: {e}")
            continue

    if east_quarters:
        df_sst_east_clean = pd.concat(east_quarters).reset_index(drop=True)
        expected_columns = ["Year", "Quarter", "Sea_Surface_Temp_C_East", "Melt_Active_East", "Melt_Index_East"]
        dtypes = {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_East": float, "Melt_Active_East": int, "Melt_Index_East": float}
        validate_dataframe(df_sst_east_clean, expected_columns, dtypes)

        df_sst_east_clean.to_sql('sst_east', conn, if_exists='append', index=False)
        conn.commit()
        print("Saved SST East to SQLite table 'sst_east'")
    else:
        print("No data retrieved for SST East.")
        df_sst_east_clean = pd.DataFrame(columns=["Year", "Quarter", "Sea_Surface_Temp_C_East", "Melt_Active_East", "Melt_Index_East"])

# Final display
print("Final SST East DataFrame:")
if df_sst_east_clean is not None:
    print("Final SST East dataset shape:", df_sst_east_clean.shape)
    display(df_sst_east_clean.head())
else:
    print("Error: df_sst_east_clean not created due to API failure.")

East Greenland bounding box indices: {'lat_min': 65.0, 'lat_max': 70.0, 'lon_min': -40.0, 'lon_max': -35.0}
Loading SST East from SQLite database
Loaded DataFrame from SQLite:
Validation error: DataFrame is empty, no rows found.. Dropping and recreating table...
sst_east table not found or invalid, querying API...
Processing year: 2011
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2011-01-01T00:00:00Z):1:(2011-12-31T00:00:00Z)][0:1:0][(65.0):1:(70.0)][(-40.0):1:(-35.0)]
2011 processed.
Processing year: 2012
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2012-01-01T00:00:00Z):1:(2012-12-31T00:00:00Z)][0:1:0][(65.0):1:(70.0)][(-40.0):1:(-35.0)]
2012 processed.
Processing year: 2013
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2013-01-01T00:00:00Z):1:(2013-12-31T00:00:00Z)][0:1:0][(65.0):1:(70.0)][(-40.0):1:(-35.0)]
2013 processed.
Pro

Unnamed: 0,Year,Quarter,Sea_Surface_Temp_C_East,Melt_Active_East,Melt_Index_East
0,2011,Q1,-0.744827,0,0.0
1,2011,Q2,-0.905921,0,0.0
2,2011,Q3,2.450244,1,0.612561
3,2011,Q4,0.889492,1,0.222373
4,2012,Q1,-1.145143,0,0.0


In [74]:
# Variable 5: South Greenland SST, Melt

df_sst_south_clean = None

# Degree to ERDDAP grid index conversion for 0.25-degree grid
def deg_to_index(deg, offset, divisor=0.25):
    return int((deg + offset) / divisor)

# Define bounding box in degrees for South Greenland
bbox_deg_south = {
    'lat_min': 60.0,
    'lat_max': 65.0,
    'lon_min': -45.0,
    'lon_max': -40.0
}

# Convert to grid indices
bbox_idx_south = {
    'lat_min': deg_to_index(bbox_deg_south['lat_min'], 90),
    'lat_max': deg_to_index(bbox_deg_south['lat_max'], 90),
    'lon_min': deg_to_index(bbox_deg_south['lon_min'], 180),
    'lon_max': deg_to_index(bbox_deg_south['lon_max'], 180)
}
print("South Greenland bounding box indices:", bbox_idx_south)

# Check if data exists in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='sst_south'")
if cursor.fetchone():
    print("Loading SST South from SQLite database")
    df_sst_south_clean = pd.read_sql_query("SELECT * FROM sst_south", conn)
    print("Loaded DataFrame from SQLite:")
    expected_columns = ["Year", "Quarter", "Sea_Surface_Temp_C_South", "Melt_Active_South", "Melt_Index_South"]
    dtypes = {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_South": float, "Melt_Active_South": int, "Melt_Index_South": float}
    try:
        validate_dataframe(df_sst_south_clean, expected_columns, dtypes)
        print("Loaded data is valid.")
    except ValueError as e:
        print(f"Validation error: {e}. Dropping and recreating table...")
        cursor.execute("DROP TABLE sst_south")
        conn.commit()
        cursor.execute("""
            CREATE TABLE sst_south (
                Year INTEGER NOT NULL,
                Quarter TEXT NOT NULL,
                Sea_Surface_Temp_C_South REAL,
                Melt_Active_South INTEGER,
                Melt_Index_South REAL,
                PRIMARY KEY (Year, Quarter)
            )
        """)
        conn.commit()
        df_sst_south_clean = None

if df_sst_south_clean is None:
    print("sst_south table not found or invalid, querying API...")
    years = list(range(2011, 2025))
    south_quarters = []
    base = "https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?"
    var = "sst"
    zlev = "[0:1:0]"

    for year in years:
        print(f"Processing year: {year}")
        try:
            time = f"[({year}-01-01T00:00:00Z):1:({year}-12-31T00:00:00Z)]".replace(" ", "")
            lat = f"[{bbox_idx_south['lat_min']}:1:{bbox_idx_south['lat_max']}]"
            lon = f"[{bbox_idx_south['lon_min']}:1:{bbox_idx_south['lon_max']}]"
            query = f"{var}{time}{zlev}{lat}{lon}"
            full_url = base + quote(query, safe=":/[](),-T")
            print("Constructed URL:", full_url)

            response = fetch_with_retries(full_url, max_retries=3, timeout=120)
            if response.status_code != 200:
                raise ValueError(f"HTTP {response.status_code}: {response.text}")

            df = pd.read_csv(StringIO(response.text), skiprows=[1])
            df = df.rename(columns={col: col.strip() for col in df.columns})
            df = df.dropna(subset=["sst"])

            df["time"] = pd.to_datetime(df["time"], errors='coerce')
            if df["time"].isna().any():
                raise ValueError("Invalid datetime values in response")

            df["Year"] = df["time"].dt.year.astype(int)
            df["Quarter"] = "Q" + df["time"].dt.quarter.astype(str)

            df_q = df.groupby(['Year', 'Quarter'])["sst"].mean().reset_index()
            df_q = df_q.rename(columns={"sst": "Sea_Surface_Temp_C_South"})

            df_q["Melt_Active_South"] = (df_q["Sea_Surface_Temp_C_South"] > 0.5).astype(int)
            df_q["Melt_Index_South"] = df_q["Sea_Surface_Temp_C_South"].clip(lower=0, upper=4) / 4

            south_quarters.append(df_q)
            print(f"{year} processed.")
        except Exception as e:
            print(f"Failed for {year}: {e}")
            continue

    if south_quarters:
        df_sst_south_clean = pd.concat(south_quarters).reset_index(drop=True)
        expected_columns = ["Year", "Quarter", "Sea_Surface_Temp_C_South", "Melt_Active_South", "Melt_Index_South"]
        dtypes = {"Year": int, "Quarter": str, "Sea_Surface_Temp_C_South": float, "Melt_Active_South": int, "Melt_Index_South": float}
        validate_dataframe(df_sst_south_clean, expected_columns, dtypes)

        df_sst_south_clean.to_sql('sst_south', conn, if_exists='append', index=False)
        conn.commit()
        print("Saved SST South to SQLite table 'sst_south'")
    else:
        print("No data retrieved for SST South.")
        df_sst_south_clean = pd.DataFrame(columns=["Year", "Quarter", "Sea_Surface_Temp_C_South", "Melt_Active_South", "Melt_Index_South"])

# Final display
print("Final SST South DataFrame:")
if df_sst_south_clean is not None:
    print("Final SST South dataset shape:", df_sst_south_clean.shape)
    display(df_sst_south_clean.head())
else:
    print("Error: df_sst_south_clean not created due to API failure.")

South Greenland bounding box indices: {'lat_min': 600, 'lat_max': 620, 'lon_min': 540, 'lon_max': 560}
Loading SST South from SQLite database
Loaded DataFrame from SQLite:
Validation error: DataFrame is empty, no rows found.. Dropping and recreating table...
sst_south table not found or invalid, querying API...
Processing year: 2011
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2011-01-01T00:00:00Z):1:(2011-12-31T00:00:00Z)][0:1:0][600:1:620][540:1:560]
2011 processed.
Processing year: 2012
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2012-01-01T00:00:00Z):1:(2012-12-31T00:00:00Z)][0:1:0][600:1:620][540:1:560]
2012 processed.
Processing year: 2013
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2013-01-01T00:00:00Z):1:(2013-12-31T00:00:00Z)][0:1:0][600:1:620][540:1:560]
2013 processed.
Processing year: 2014
Constructed URL: https://c

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["time"] = pd.to_datetime(df["time"], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Year"] = df["time"].dt.year.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Quarter"] = "Q" + df["time"].dt.quarter.astype(str)


2019 processed.
Processing year: 2020
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2020-01-01T00:00:00Z):1:(2020-12-31T00:00:00Z)][0:1:0][600:1:620][540:1:560]
2020 processed.
Processing year: 2021
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2021-01-01T00:00:00Z):1:(2021-12-31T00:00:00Z)][0:1:0][600:1:620][540:1:560]
2021 processed.
Processing year: 2022
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2022-01-01T00:00:00Z):1:(2022-12-31T00:00:00Z)][0:1:0][600:1:620][540:1:560]
2022 processed.
Processing year: 2023
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2023-01-01T00:00:00Z):1:(2023-12-31T00:00:00Z)][0:1:0][600:1:620][540:1:560]
2023 processed.
Processing year: 2024
Constructed URL: https://coastwatch.pfeg.noaa.gov/erddap/griddap/ncdcOisst21Agg_LonPM180.csv?sst[(2024-01-01T0

Unnamed: 0,Year,Quarter,Sea_Surface_Temp_C_South,Melt_Active_South,Melt_Index_South
0,2011,Q1,3.231249,1,0.807812
1,2011,Q2,2.247865,1,0.561966
2,2011,Q3,5.861202,1,1.0
3,2011,Q4,4.028882,1,1.0
4,2012,Q1,1.64393,1,0.410982


In [75]:
# Variable 6: Total Catch by Foreign Vessels

df_foreign_clean = None

# Check if data exists in the database
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='foreign_catch'")
if cursor.fetchone():
    print("Loading Foreign Catch from SQLite database")
    df_foreign_clean = pd.read_sql_query("SELECT * FROM foreign_catch", conn)
    print("Loaded DataFrame from SQLite:")
    expected_columns = ["Year", "Quarter", "Unit", "Foreign_Catch"]
    dtypes = {"Year": int, "Quarter": str, "Unit": str, "Foreign_Catch": int}
    try:
        validate_dataframe(df_foreign_clean, expected_columns, dtypes)
        print("Loaded data is valid.")
    except ValueError as e:
        print(f"Validation error: {e}. Dropping and recreating table...")
        cursor.execute("DROP TABLE foreign_catch")
        conn.commit()
        # Recreate the table
        cursor.execute("""
            CREATE TABLE foreign_catch (
                Year INTEGER NOT NULL,
                Quarter TEXT NOT NULL,
                Unit TEXT,
                Foreign_Catch INTEGER,
                PRIMARY KEY (Year, Quarter)
            )
        """)
        conn.commit()
        df_foreign_clean = None

if df_foreign_clean is None:
    print("foreign_catch table not found or invalid, querying API...")
    url = "https://bank.stat.gl:443/api/v1/en/Greenland/FI/FI10/FIX008.px"
    query = {
        "query": [
            {"code": "nation", "selection": {"filter": "item", "values": ["UDL"]}},
            {"code": "unit", "selection": {"filter": "item", "values": ["Ton"]}},
            {"code": "time", "selection": {"filter": "item", "values": [str(y) for y in range(2011, 2025)]}},
            {"code": "quarter", "selection": {"filter": "item", "values": ["1", "2", "3", "4"]}}
        ],
        "response": {"format": "json-stat2"}
    }
    try:
        response = fetch_with_retries(url, max_retries=3, timeout=60, method='post', json=query)
        dataset = pyjstat.Dataset.read(response.text)
        df = dataset.write('dataframe')
        print("Data successfully retrieved and converted to DataFrame!")

        # Clean DataFrame
        df_foreign_clean = df.copy()
        df_foreign_clean.drop(columns=['nation'], inplace=True)
        df_foreign_clean.rename(columns={
            "time": "Year",
            "quarter": "Quarter",
            "unit": "Unit",
            "value": "Foreign_Catch"
        }, inplace=True)
        df_foreign_clean["Quarter"] = df_foreign_clean["Quarter"].str.replace("Quarter ", "Q")
        quarter_order = ["Q1", "Q2", "Q3", "Q4"]
        df_foreign_clean["Quarter"] = pd.Categorical(df_foreign_clean["Quarter"], categories=quarter_order, ordered=True)
        df_foreign_clean = df_foreign_clean[["Year", "Quarter", "Unit", "Foreign_Catch"]]
        df_foreign_clean["Year"] = df_foreign_clean["Year"].astype(int)

        # Validate before saving to SQLite
        expected_columns = ["Year", "Quarter", "Unit", "Foreign_Catch"]
        dtypes = {"Year": int, "Quarter": str, "Unit": str, "Foreign_Catch": int}
        validate_dataframe(df_foreign_clean, expected_columns, dtypes)

        # Insert data into table
        df_foreign_clean.to_sql('foreign_catch', conn, if_exists='append', index=False)
        conn.commit()
        print("Saved Foreign Catch to SQLite table 'foreign_catch'")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching Foreign Catch data: {e}")
        df_foreign_clean = pd.DataFrame(columns=["Year", "Quarter", "Unit", "Foreign_Catch"])  # Empty DataFrame as fallback

# Final display
print("Final Foreign Catch DataFrame:")
display(df_foreign_clean.head())

Loading Foreign Catch from SQLite database
Loaded DataFrame from SQLite:
Loaded data is valid.
Final Foreign Catch DataFrame:


Unnamed: 0,Year,Quarter,Unit,Foreign_Catch
0,2011,Q1,Tonnes,3354
1,2011,Q2,Tonnes,5375
2,2011,Q3,Tonnes,78396
3,2011,Q4,Tonnes,9673
4,2012,Q1,Tonnes,2429


In [None]:
#save in memory database to a file

import sqlite3

# Create a new on-disk database
disk_conn = sqlite3.connect('/content/greenland_fishery.db')

# Backup the in-memory database to the on-disk database
with disk_conn:
    conn.backup(disk_conn)

# Close the on-disk connection
disk_conn.close()

print("In-memory database saved to /content/greenland_fishery.db")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Copy the database file to Google Drive
!cp /content/greenland_fishery.db /content/drive/MyDrive/greenland_fishery.db
print("Database saved to Google Drive at /content/drive/MyDrive/greenland_fishery.db")