In [1]:
!pip install xarray netCDF4 h5netcdf s3fs pandas matplotlib




In [2]:
import xarray as xr
import pandas as pd
import numpy as np
import s3fs
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime, timedelta


In [None]:
# Anonymous access: no credentials are passed to S3.
fs = s3fs.S3FileSystem(anon=True)

# Day-level prefix of the ABI-L2-CMIPF product (year 2025, day-of-year 334).
# An earlier version of this cell listed a single hour only:
#   "noaa-goes18/ABI-L2-CMIPF/2025/334/18/"
day_prefix = "noaa-goes18/ABI-L2-CMIPF/2025/334/"

# Hourly sub-directories to scan: 18 through 23 inclusive.
hours = range(18, 24)

# Gather every object key found under each hourly sub-directory.
files = []
for h in hours:
    prefix = f"{day_prefix}{h:02d}/"
    files.extend(fs.ls(prefix))

print("Total files:", len(files))

Total files: 576


In [None]:

# Keep only the keys whose text contains the "M6C13" tag.
c13_files = list(filter(lambda key: "M6C13" in key, files))
len(c13_files)


36

In [13]:
# Lexicographic sort of the keys; the hour directory and the "_s..." start
# stamp embedded in each key make this a chronological ordering.
c13_files = sorted(c13_files)

# Thin the list with a fixed stride (never below 1) and cap it at six entries.
step = max(len(c13_files) // 6, 1)
selected_files = [c13_files[i] for i in range(0, len(c13_files), step)][:6]

selected_files


['noaa-goes18/ABI-L2-CMIPF/2025/334/18/OR_ABI-L2-CMIPF-M6C13_G18_s20253341800214_e20253341809534_c20253341809590.nc',
 'noaa-goes18/ABI-L2-CMIPF/2025/334/19/OR_ABI-L2-CMIPF-M6C13_G18_s20253341900214_e20253341909535_c20253341909594.nc',
 'noaa-goes18/ABI-L2-CMIPF/2025/334/20/OR_ABI-L2-CMIPF-M6C13_G18_s20253342000215_e20253342009535_c20253342009596.nc',
 'noaa-goes18/ABI-L2-CMIPF/2025/334/21/OR_ABI-L2-CMIPF-M6C13_G18_s20253342100212_e20253342109533_c20253342109596.nc',
 'noaa-goes18/ABI-L2-CMIPF/2025/334/22/OR_ABI-L2-CMIPF-M6C13_G18_s20253342200212_e20253342209533_c20253342209597.nc',
 'noaa-goes18/ABI-L2-CMIPF/2025/334/23/OR_ABI-L2-CMIPF-M6C13_G18_s20253342300213_e20253342309533_c20253342309592.nc']

In [14]:
def parse_goes_time_from_key(key):
    """Return the scan-start time encoded in a GOES ABI object key.

    The file name (last ``/``-separated component of ``key``) is expected to
    contain a ``_sYYYYJJJHHMMSS...`` field, where ``JJJ`` is the day of year,
    e.g. ``..._G18_s20253341800214_e20253341809534_c....nc``. Only the first
    13 digits are used; any further digits in the field (the sample keys carry
    one extra digit, presumably tenths of a second) are ignored.

    Parameters
    ----------
    key : str
        Object key or bare file name.

    Returns
    -------
    datetime.datetime
        Naive datetime built from the year, day-of-year, hour, minute and
        second fields.

    Raises
    ------
    ValueError
        If the ``_s`` field is missing, is shorter than 13 digits, or holds
        out-of-range values (e.g. hour 25). Out-of-range values are rejected
        rather than silently rolled over into the next day.
    """
    fname = key.rsplit('/', 1)[-1]
    _, marker, tail = fname.partition('_s')
    # The start field runs up to the next underscore (the "_e..." end field).
    stamp = tail.split('_', 1)[0]
    if not marker or len(stamp) < 13 or not stamp[:13].isdigit():
        raise ValueError(f"no '_sYYYYJJJHHMMSS' scan-start field in {key!r}")

    # %j is the zero-padded day of year; strptime range-checks every field.
    return datetime.strptime(stamp[:13], "%Y%j%H%M%S")


In [15]:
import xarray as xr
import numpy as np
import pandas as pd
from pathlib import Path

# The first selected key is used to establish the grid geometry.
key0 = selected_files[0]
print("Using first file:", key0)

with fs.open(key0, mode="rb") as remote_file:
    try:
        # Preferred backend.
        ds0 = xr.open_dataset(remote_file, engine="h5netcdf")
    except Exception as err:
        # NOTE(review): confirm the netcdf4 engine accepts a file-like object;
        # if it does not, this fallback can never succeed.
        print("h5netcdf failed, trying netcdf4:", err)
        remote_file.seek(0)
        ds0 = xr.open_dataset(remote_file, engine="netcdf4")

    # Pull all data into memory while the remote handle is still open, so the
    # dataset stays usable after the with-block closes the file.
    ds0.load()

print(ds0)

CMI0 = ds0["CMI"]
print("CMI0 shape/dims:", CMI0.shape, CMI0.dims)

# Target size of the coarse grid.
n_rows = n_cols = 50

# Integer block size along each axis; later cells reuse fac_y / fac_x.
ny, nx = CMI0.shape
fac_y, fac_x = ny // n_rows, nx // n_cols

# Block-average, trimming any remainder rows/columns at the edge.
CMI0_small = CMI0.coarsen(y=fac_y, x=fac_x, boundary="trim").mean()
print("Downsampled shape:", CMI0_small.shape)



Using first file: noaa-goes18/ABI-L2-CMIPF/2025/334/18/OR_ABI-L2-CMIPF-M6C13_G18_s20253341800214_e20253341809534_c20253341809590.nc
<xarray.Dataset>
Dimensions:                                           (y: 5424, x: 5424,
                                                       number_of_time_bounds: 2,
                                                       number_of_image_bounds: 2,
                                                       band: 1)
Coordinates:
    t                                                 datetime64[ns] 2025-11-...
  * y                                                 (y) float64 0.1518 ... ...
  * x                                                 (x) float64 -0.1518 ......
    y_image                                           float32 0.0
    x_image                                           float32 0.0
    band_wavelength                                   (band) float32 10.33
    band_id                                           (band) int32 13
Dimensions without

In [16]:
# Grid dimensions are taken from the downsampled first frame.
small_ny, small_nx = CMI0_small.shape

# One record per grid cell in row-major order, so cell_id == row * small_nx + col.
# x_norm / y_norm are cell centres and width_norm / height_norm the cell extent,
# all as fractions of the grid; lat / lon are left as NaN placeholders.
cells = [
    {
        "cell_id": r * small_nx + c,
        "row": r,
        "col": c,
        "x_norm": (c + 0.5) / small_nx,
        "y_norm": (r + 0.5) / small_ny,
        "width_norm": 1.0 / small_nx,
        "height_norm": 1.0 / small_ny,
        "lat": np.nan,
        "lon": np.nan,
    }
    for r in range(small_ny)
    for c in range(small_nx)
]

cells_df = pd.DataFrame(cells)
Path("data").mkdir(exist_ok=True)
cells_df.to_csv("data/cells.csv", index=False)
cells_df.head()


Unnamed: 0,cell_id,row,col,x_norm,y_norm,width_norm,height_norm,lat,lon
0,0,0,0,0.01,0.01,0.02,0.02,,
1,1,0,1,0.03,0.01,0.02,0.02,,
2,2,0,2,0.05,0.01,0.02,0.02,,
3,3,0,3,0.07,0.01,0.02,0.02,,
4,4,0,4,0.09,0.01,0.02,0.02,,


In [18]:
from datetime import datetime, timedelta
import matplotlib.pyplot as plt

# Create both output directories here: this cell saves PNGs under "images/"
# and CSVs under "data/", and should not rely on an earlier cell having
# created "data/" already.
for out_dir in ("images", "data"):
    Path(out_dir).mkdir(exist_ok=True)

def parse_goes_time_from_key(key):
    """Return the scan-start time encoded in a GOES ABI object key.

    The file name (last ``/``-separated component of ``key``) is expected to
    contain a ``_sYYYYJJJHHMMSS...`` field, where ``JJJ`` is the day of year,
    e.g. ``..._G18_s20253341800214_e20253341809534_c....nc``. Only the first
    13 digits are used; any further digits in the field (the sample keys carry
    one extra digit, presumably tenths of a second) are ignored.

    Parameters
    ----------
    key : str
        Object key or bare file name.

    Returns
    -------
    datetime.datetime
        Naive datetime built from the year, day-of-year, hour, minute and
        second fields.

    Raises
    ------
    ValueError
        If the ``_s`` field is missing, is shorter than 13 digits, or holds
        out-of-range values (e.g. hour 25). Out-of-range values are rejected
        rather than silently rolled over into the next day.
    """
    fname = key.rsplit("/", 1)[-1]
    _, marker, tail = fname.partition("_s")
    # The start field runs up to the next underscore (the "_e..." end field).
    stamp = tail.split("_", 1)[0]
    if not marker or len(stamp) < 13 or not stamp[:13].isdigit():
        raise ValueError(f"no '_sYYYYJJJHHMMSS' scan-start field in {key!r}")

    # %j is the zero-padded day of year; strptime range-checks every field.
    return datetime.strptime(stamp[:13], "%Y%j%H%M%S")

times_rows = []
values_frames = []  # one long-format DataFrame per processed frame

# Every frame must land on the same coarse grid that cells.csv describes,
# otherwise the row-major cell_id numbering below would not match it.
expected_shape = (small_ny, small_nx)

for frame_index, key in enumerate(selected_files):
    print("Processing frame", frame_index, key)

    with fs.open(key, mode="rb") as f:
        # Try h5netcdf first
        try:
            ds = xr.open_dataset(f, engine="h5netcdf")
        except Exception as e:
            print("h5netcdf failed, trying netcdf4:", e)
            f.seek(0)
            ds = xr.open_dataset(f, engine="netcdf4")

        # Load into memory before the remote file handle is closed.
        ds.load()

    CMI = ds["CMI"]

    # Use the same fac_y/fac_x as computed from the first frame (ds0).
    CMI_small = CMI.coarsen(y=fac_y, x=fac_x, boundary="trim").mean()
    arr = CMI_small.values  # (small_ny, small_nx)
    if arr.shape != expected_shape:
        raise ValueError(
            f"frame {frame_index} downsampled to {arr.shape}, expected {expected_shape}"
        )

    # 1) Save PNG, with the colour range clipped to the 2nd-98th percentiles
    #    of the non-NaN values.
    vmin = np.nanpercentile(arr, 2)
    vmax = np.nanpercentile(arr, 98)

    fig, ax = plt.subplots(figsize=(5, 5), dpi=180)
    ax.imshow(arr, origin="upper", cmap="inferno", vmin=vmin, vmax=vmax)
    ax.axis("off")

    img_name = f"goes_c13_frame_{frame_index:02d}.png"
    img_path = Path("images") / img_name
    fig.savefig(img_path, bbox_inches="tight", pad_inches=0)
    plt.close(fig)  # release the figure; one is created per frame

    # 2) times.csv row
    dt = parse_goes_time_from_key(key)
    times_rows.append({
        "frame_index": frame_index,
        "time_iso": dt.isoformat() + "Z",
        "image_url": f"images/{img_name}",
    })

    # 3) values.csv rows, vectorized: C-order ravel() visits (row, col) in
    #    row-major order, so position i is exactly cell_id = row * small_nx + col.
    #    Values are widened to float64 (same result as float(...) per element);
    #    NaN marks cells with no data.
    values_frames.append(pd.DataFrame({
        "frame_index": frame_index,
        "cell_id": np.arange(arr.size, dtype=np.int64),
        "brightness_temp": arr.ravel().astype(np.float64),
    }))

times_df = pd.DataFrame(times_rows)
values_df = (
    pd.concat(values_frames, ignore_index=True)
    if values_frames
    else pd.DataFrame(columns=["frame_index", "cell_id", "brightness_temp"])
)

times_df.to_csv("data/times.csv", index=False)
values_df.to_csv("data/values.csv", index=False)

times_df.head(), values_df.head()


Processing frame 0 noaa-goes18/ABI-L2-CMIPF/2025/334/18/OR_ABI-L2-CMIPF-M6C13_G18_s20253341800214_e20253341809534_c20253341809590.nc
Processing frame 1 noaa-goes18/ABI-L2-CMIPF/2025/334/19/OR_ABI-L2-CMIPF-M6C13_G18_s20253341900214_e20253341909535_c20253341909594.nc
Processing frame 2 noaa-goes18/ABI-L2-CMIPF/2025/334/20/OR_ABI-L2-CMIPF-M6C13_G18_s20253342000215_e20253342009535_c20253342009596.nc
Processing frame 3 noaa-goes18/ABI-L2-CMIPF/2025/334/21/OR_ABI-L2-CMIPF-M6C13_G18_s20253342100212_e20253342109533_c20253342109596.nc
Processing frame 4 noaa-goes18/ABI-L2-CMIPF/2025/334/22/OR_ABI-L2-CMIPF-M6C13_G18_s20253342200212_e20253342209533_c20253342209597.nc
Processing frame 5 noaa-goes18/ABI-L2-CMIPF/2025/334/23/OR_ABI-L2-CMIPF-M6C13_G18_s20253342300213_e20253342309533_c20253342309592.nc


(   frame_index              time_iso                     image_url
 0            0  2025-11-30T18:00:21Z  images/goes_c13_frame_00.png
 1            1  2025-11-30T19:00:21Z  images/goes_c13_frame_01.png
 2            2  2025-11-30T20:00:21Z  images/goes_c13_frame_02.png
 3            3  2025-11-30T21:00:21Z  images/goes_c13_frame_03.png
 4            4  2025-11-30T22:00:21Z  images/goes_c13_frame_04.png,
    frame_index  cell_id  brightness_temp
 0            0        0              NaN
 1            0        1              NaN
 2            0        2              NaN
 3            0        3              NaN
 4            0        4              NaN)