# DSC106 - Project 3

In [2]:
import pandas as pd
import xarray as xr
import gcsfs
import numpy as np
import sys
import warnings

Processes the data from the CMIP6 dataset into a CSV for our D3 visualizations.

**Note:** This cell takes a long time to run

In [4]:
pd.read_json('https://cdn.jsdelivr.net/npm/us-atlas@3/states-albers-10m.json')

ValueError: All arrays must be of the same length

In [None]:
# Suppress warnings for cleaner output
warnings.filterwarnings('ignore', category=UserWarning, message='Sending large graph to Dask')
warnings.filterwarnings('ignore', category=RuntimeWarning, message='Mean of empty slice')

print("Starting data processing for d3.js visualizations...")

# --- Configuration ---
# We'll use 'r4i1p1f1', a variant common to all three experiments for the CESM2 model
MEMBER_ID = 'r4i1p1f1' 
SOURCE_ID = 'CESM2'
TABLE_ID = 'Amon'
VARIABLE_ID = 'tas'
EXPERIMENTS = ['historical', 'ssp245', 'ssp585'] # Baseline, medium-future, high-emissions-future

# Define US regions. Longitude is 0-360 in these models.
# W longitude = 360 - W.
REGIONS = {
    'Northeast': {'lat': slice(39, 48), 'lon': slice(360-81, 360-67)}, # 279-293
    'Southeast': {'lat': slice(25, 39), 'lon': slice(360-95, 360-75)}, # 265-285
    'Midwest':   {'lat': slice(37, 49), 'lon': slice(360-104, 360-81)},# 256-279
    'West':      {'lat': slice(32, 49), 'lon': slice(360-125, 360-104)} # 235-256
}
# ---------------------

try:
    # Load the main dataframe
    # df = pd.read_csv('https://storage.googleapis.com/cmip6/cmip6-zarr-consolidated-stores.csv', index_col=0)
    df = pd.read_csv('../data/data.csv', index_col=0) # data.csv is the CMIP6 data as a CSV
    print("Loaded data.csv index.")

    # Initialize GCS FileSystem (anonymous access)
    gcs = gcsfs.GCSFileSystem(token='anon')

    all_regional_data = []

    # Loop over each experiment
    for exp in EXPERIMENTS:
        print(f"\n--- Processing: {exp} ---")
        
        # Find the zstore URL for this specific experiment and member_id
        query = (
            f"source_id == '{SOURCE_ID}' & "
            f"member_id == '{MEMBER_ID}' & "
            f"experiment_id == '{exp}' & "
            f"table_id == '{TABLE_ID}' & "
            f"variable_id == '{VARIABLE_ID}'"
        )
        
        df_exp = df.query(query)
        
        if df_exp.empty:
            print(f"Warning: No data found for query: {query}")
            continue
            
        zstore = df_exp.iloc[0]['zstore']
        print(f"Found zstore: {zstore}")

        # Open the dataset
        try:
            mapper = gcs.get_mapper(zstore)
            ds = xr.open_zarr(mapper, consolidated=True)
        except Exception as e:
            print(f"Error opening zarr store {zstore}: {e}")
            continue

        # Loop over each region
        for region_name, region_box in REGIONS.items():
            print(f"Processing region: {region_name}")
            
            try:
                # 1. Select the region
                ds_region = ds.sel(lat=region_box['lat'], lon=region_box['lon'])
                
                # 2. Create latitude weights for accurate averaging
                weights = np.cos(np.deg2rad(ds_region.lat))
                weights.name = 'weights'
                
                # 3. Calculate the weighted spatial mean
                ds_weighted_mean = ds_region.weighted(weights).mean(dim=['lat', 'lon'])
                
                # 4. Select only July data (month == 7) as proxy for "extreme" heat
                ds_july = ds_weighted_mean.sel(time=ds_weighted_mean.time.dt.month == 7)
                
                # 5. Convert from Kelvin to Celsius
                temp_c = ds_july['tas'] - 273.15
                
                # 6. Trigger computation
                print("Computing values...")
                temp_c_computed = temp_c.compute()
                print("...computation complete.")

                # 7. Convert to Pandas DataFrame
                df_temp = temp_c_computed.to_dataframe()
                
                if df_temp.empty:
                    print(f"Warning: No data after processing for {region_name}, {exp}")
                    continue

                # 8. Clean up the DataFrame
                df_temp['year'] = df_temp.index.year
                df_temp = df_temp.reset_index(drop=True)[['year', 'tas']]
                df_temp = df_temp.rename(columns={'tas': 'july_temp_c'})
                df_temp['region'] = region_name
                df_temp['scenario'] = exp
                
                # 9. Append to our master list
                all_regional_data.append(df_temp)
                
            except Exception as e:
                print(f"Error processing {region_name} for {exp}: {e}")

        # Close the dataset
        ds.close()

    # Concatenate all dataframes
    if all_regional_data:
        print("\n--- Concatenating all results ---")
        final_df = pd.concat(all_regional_data, ignore_index=True)
        
        # Save to CSV
        output_filename = 'us_regional_july_temps.csv'
        final_df.to_csv(output_filename, index=False)
        
        print(f"\n✅ Success! Data processed and saved to '{output_filename}'.")
        print("Final DataFrame head:")
        print(final_df.head())
    else:
        print("\nNo data was processed. Output file not created.")

except FileNotFoundError:
    print("Error: data.csv not found. Make sure it's in the same directory.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Starting data processing for d3.js visualizations...
Loaded data.csv index.

--- Processing: historical ---
Found zstore: gs://cmip6/CMIP6/CMIP/NCAR/CESM2/historical/r4i1p1f1/Amon/tas/gn/v20190308/
Processing region: Northeast
Computing values...
...computation complete.
Processing region: Southeast
Computing values...
...computation complete.
Processing region: Midwest
Computing values...
...computation complete.
Processing region: West
Computing values...
...computation complete.

--- Processing: ssp245 ---
Found zstore: gs://cmip6/CMIP6/ScenarioMIP/NCAR/CESM2/ssp245/r4i1p1f1/Amon/tas/gn/v20200528/
Processing region: Northeast
Computing values...
...computation complete.
Processing region: Southeast
Computing values...
...computation complete.
Processing region: Midwest
Computing values...
...computation complete.
Processing region: West
Computing values...
...computation complete.

--- Processing: ssp585 ---
Found zstore: gs://cmip6/CMIP6/ScenarioMIP/NCAR/CESM2/ssp585/r4i1p1f1/Amon/

Converts CSV to JSON

In [None]:
print("--- Processing 'us_regional_july_temps.csv' for D3.js Map ---")

# Define file names
input_csv = '../data/us_regional_july_temps.csv'
output_json = 'us_temps_for_map_2025-2100.json'

try:
    # 1. Load the CSV file
    print(f"Loading '{input_csv}'...")
    df = pd.read_csv(input_csv)
    print("CSV loaded successfully.")

    # 2. Filter data for the research question (2025-2100)
    print(f"Filtering data for years 2025-2100...")
    df_filtered = df[(df['year'] >= 2025) & (df['year'] <= 2100)].copy()
    
    if df_filtered.empty:
        print("\nWarning: No data found for years 2025-2100.")
        print("Please check your 'us_regional_july_temps.csv' file.")
    else:
        print(f"Found {len(df_filtered)} records in this date range.")

        # 3. Round temperature for cleaner data
        if 'july_temp_c' in df_filtered.columns:
            df_filtered['july_temp_c'] = df_filtered['july_temp_c'].round(2)
            print("Rounded 'july_temp_c' to 2 decimal places.")
        else:
            print("\nError: 'july_temp_c' column not found.", file=sys.stderr)
            sys.exit(1)
            
        # 4. Save to JSON in 'records' format
        output_cols = ['year', 'region', 'scenario', 'july_temp_c']
        df_final = df_filtered[output_cols]
        df_final.to_json(output_json, orient='records')
        
        print(f"\n✅ Success! Data saved to '{output_json}'.")
        print("This file is ready for your D3.js project.")
        
        print("\n--- Example Data (first 10 records) ---")
        # Use .to_json() for a cleaner print that matches the file output
        print(df_final.head(10).to_json(orient='records', indent=2))

except FileNotFoundError:
    print(f"\nError: The file '{input_csv}' was not found.", file=sys.stderr)
except KeyError as e:
    print(f"\nError: A required column is missing: {e}", file=sys.stderr)
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}", file=sys.stderr)

--- Processing 'us_regional_july_temps.csv' for D3.js Map ---
Loading '../data/us_regional_july_temps.csv'...
CSV loaded successfully.
Filtering data for years 2025-2100...
Found 608 records in this date range.
Rounded 'july_temp_c' to 2 decimal places.

✅ Success! Data saved to 'us_temps_for_map_2025-2100.json'.
This file is ready for your D3.js project.

--- Example Data (first 10 records) ---
[
  {
    "year":2025,
    "region":"Northeast",
    "scenario":"ssp245",
    "july_temp_c":24.28
  },
  {
    "year":2026,
    "region":"Northeast",
    "scenario":"ssp245",
    "july_temp_c":24.45
  },
  {
    "year":2027,
    "region":"Northeast",
    "scenario":"ssp245",
    "july_temp_c":22.34
  },
  {
    "year":2028,
    "region":"Northeast",
    "scenario":"ssp245",
    "july_temp_c":22.18
  },
  {
    "year":2029,
    "region":"Northeast",
    "scenario":"ssp245",
    "july_temp_c":22.91
  },
  {
    "year":2030,
    "region":"Northeast",
    "scenario":"ssp245",
    "july_temp_c":22.2