### Misc. Notes
CalFish databases are on ERDDAP > tabledap

In [3]:
# Test imports
import pydap
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns

# Verify versions
print(f"Python version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")

Python version: 2.2.3
NumPy version: 2.1.2
Pandas version: 2.2.3


In [4]:
import requests
import pandas as pd
from datetime import datetime
from io import StringIO  # Add this import

# Use the direct CSV endpoint which is typically faster than DAP
base_url = "https://upwell.pfeg.noaa.gov/erddap/tabledap/erdCAMarCatLM.csv"

# First just get a small sample to test (one month of data)
test_query = f"{base_url}?time,year,fish,port,landings&time>=2002-01-01&time<=2002-02-01"

try:
    # Set a reasonable timeout
    print("Attempting to fetch sample data...")
    response = requests.get(test_query, timeout=30)
    
    if response.status_code == 200:
        # Convert to pandas DataFrame
        df = pd.read_csv(StringIO(response.text), skiprows=[1])  # Skip units row
        print("\nSuccessfully retrieved sample data!")
        print(f"\nShape of sample: {df.shape}")
        print("\nFirst few rows:")
        print(df.head())
        
        # Get column info
        print("\nColumns:")
        print(df.columns.tolist())
        
    else:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        print(f"Response text: {response.text[:500]}")  # Print first 500 chars of response
        
except requests.exceptions.Timeout:
    print("Request timed out - try again or reduce the time range")
except Exception as e:
    print(f"Error: {e}")

Attempting to fetch sample data...

Successfully retrieved sample data!

Shape of sample: (2387, 5)

First few rows:
                   time  year            fish         port  landings
0  2002-01-16T00:00:00Z  2002  Abalone, Black          All         0
1  2002-01-16T00:00:00Z  2002  Abalone, Black       Eureka         0
2  2002-01-16T00:00:00Z  2002  Abalone, Black  Los Angeles         0
3  2002-01-16T00:00:00Z  2002  Abalone, Black     Monterey         0
4  2002-01-16T00:00:00Z  2002  Abalone, Black    San Diego         0

Columns:
['time', 'year', 'fish', 'port', 'landings']


In [5]:
import requests
import pandas as pd
from io import StringIO

# Use the direct CSV endpoint
base_url = "https://upwell.pfeg.noaa.gov/erddap/tabledap/erdCAMarCatLM.csv"

# Query for the full dataset
full_query = f"{base_url}?time,year,fish,port,landings&time>=1928-01-16&time<=2002-12-16T00:00:00Z"

try:
    print("Downloading full dataset...")
    response = requests.get(full_query, timeout=60)  # Increased timeout for full dataset
    
    if response.status_code == 200:
        # Convert to pandas DataFrame
        df = pd.read_csv(StringIO(response.text), skiprows=[1])  # Skip units row
        
        print("\nFull dataset retrieved!")
        print(f"Shape of dataset: {df.shape}")
        print("\nSample of data:")
        print(df.head())
        
        # Basic data info
        print("\nDataset Information:")
        print(f"Number of unique years: {df['year'].nunique()}")
        print(f"Number of unique fish species: {df['fish'].nunique()}")
        print(f"Number of unique ports: {df['port'].nunique()}")
        print(f"Total landings (pounds): {df['landings'].sum():,}")
        
        # Save to CSV
        # output_file = "california_fish_landings_1928_2002.csv"
        # df.to_csv(output_file, index=False)
        # print(f"\nData saved to {output_file}")
        
    else:
        print(f"Failed to retrieve data. Status code: {response.status_code}")
        print(f"Response text: {response.text[:500]}")
        
except requests.exceptions.Timeout:
    print("Request timed out. The dataset might be too large for a single download.")
    print("Would you like to try downloading in smaller time chunks instead?")
except Exception as e:
    print(f"Error: {e}")

Downloading full dataset...

Full dataset retrieved!
Shape of dataset: (2148300, 5)

Sample of data:
                   time  year            fish port  landings
0  1928-01-16T00:00:00Z  1928  Abalone, Black  All         0
1  1928-02-16T00:00:00Z  1928  Abalone, Black  All         0
2  1928-03-16T00:00:00Z  1928  Abalone, Black  All         0
3  1928-04-16T00:00:00Z  1928  Abalone, Black  All         0
4  1928-05-16T00:00:00Z  1928  Abalone, Black  All         0

Dataset Information:
Number of unique years: 75
Number of unique fish species: 341
Number of unique ports: 7
Total landings (pounds): 79,628,657,334
