In [8]:
import os
from typing import Union
import zarr
import s3fs

import dask.array as da
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import sunpy.visualization.colormaps as cm

In [9]:
# setting configs

# Select matplotlib's non-interactive "Agg" backend: figures are rendered
# off-screen and are only written to files, never shown in a window.
matplotlib.use('Agg')

In [10]:
def s3_connection(path_to_zarr: str) -> s3fs.S3Map:
    """
    Build a key-value mapping over the S3 location `path_to_zarr`.

    Parameters
    ----------
    path_to_zarr : str
        Root of the store, e.g. an ``s3://bucket/prefix`` URL.

    Returns
    -------
    s3fs.S3Map
        Mapping rooted at `path_to_zarr`, backed by an anonymous
        `s3fs.S3FileSystem`.
    """
    # anon=True: anonymous access requires no credentials.
    anonymous_fs = s3fs.S3FileSystem(anon=True)
    # check=False: skip the up-front probe of the location when the map is built.
    return s3fs.S3Map(root=path_to_zarr, s3=anonymous_fs, check=False)

def load_single_aws_zarr(
    path_to_zarr: str,
    cache_max_single_size: Union[int, None] = None,
) -> Union[zarr.Array, zarr.Group]:
    """
    Open a zarr array or group stored on S3, read-only, behind an LRU cache.

    Parameters
    ----------
    path_to_zarr : str
        Root of the zarr store, forwarded to `s3_connection`.
    cache_max_single_size : int or None, optional
        Forwarded as `max_size` to `zarr.LRUStoreCache`. The default `None`
        is passed through unchanged (per the zarr documentation this means
        the cache size is not limited).

    Returns
    -------
    zarr.Array or zarr.Group
        Whatever `zarr.open` finds at the root of the store, opened with
        ``mode="r"``.
    """
    cached_store = zarr.LRUStoreCache(
        store=s3_connection(path_to_zarr),
        max_size=cache_max_single_size,
    )
    return zarr.open(cached_store, mode="r")

def get_single_solar_image(image_idx, path_to_zarr):
    """
    Fetch one image of the "171A" entry as an in-memory numpy array.

    The store at `path_to_zarr` is opened anew on every call via
    `load_single_aws_zarr`; its "171A" member is wrapped in a dask array,
    indexed with `image_idx` on the first axis (all of the remaining two
    axes are kept), and the selection is materialised with `np.array`.
    """
    channel_zarr = load_single_aws_zarr(path_to_zarr)["171A"]
    lazy_stack = da.from_array(channel_zarr)
    return np.array(lazy_stack[image_idx, :, :])


In [16]:
import glob
import re
import pickle

def get_sdo_solar_images_from_aws(
    s3_root_for_sdoml_year_zarr,
    desired_times,
    sav_folder_path,
    tolerance,
):
    """
    Save PNG renderings of the "171A" images closest to `desired_times`.

    For every entry of `desired_times` the image with the nearest observation
    time is selected (duplicates are collapsed). Each selected image that does
    not yet have a PNG in `sav_folder_path` is read from the zarr store,
    downsampled by keeping every second pixel along both axes, and written to
    ``<sav_folder_path>/<observation time>.png``.

    Parameters
    ----------
    s3_root_for_sdoml_year_zarr : str
        Root of the zarr store; its "171A" member is read.
    desired_times : iterable of timestamps
        Times for which the nearest image is wanted. They must be subtractable
        from the observation times (e.g. same timezone awareness).
    sav_folder_path : str
        Folder the PNG files are written to; created if it does not exist.
        Existing ``*.png`` files in it are interpreted as already processed
        observation times (their base name is parsed with `pd.to_datetime`).
    tolerance : pandas.Timedelta or None
        Maximum allowed gap between a desired time and its nearest image.
        Desired times whose nearest image is farther away are dropped.
        `None` disables the check.

    Returns
    -------
    list of str
        Paths of the PNG files written by this call (skipped images are not
        included).
    """
    images_171a_zarray = load_single_aws_zarr(
        path_to_zarr=s3_root_for_sdoml_year_zarr,
    )["171A"]

    # Observation times: prefer the local pickle cache when it is present,
    # otherwise read them from the array's "T_OBS" attribute.
    times_cache_path = "temp_images_zry_times.pkl"
    if os.path.exists(times_cache_path):
        # NOTE(review): pickle.load can execute arbitrary code; only keep this
        # cache file if it was produced locally by a trusted run.
        with open(times_cache_path, "rb") as cache_file:
            images_zry_times = pickle.load(cache_file)
    else:
        images_zry_times = pd.to_datetime(
            np.array(images_171a_zarray.attrs["T_OBS"])
        )

    if len(images_zry_times) == 0:
        # Nothing to match against (np.argmin would raise on empty input).
        return []

    # For each desired time, find the position of the nearest observation time
    # and keep it only when it lies within `tolerance`.
    closest_idxs = set()
    for selected_time in desired_times:
        time_gaps = abs(images_zry_times - selected_time)
        closest_idx = int(np.argmin(time_gaps))
        if tolerance is not None and time_gaps[closest_idx] > tolerance:
            continue
        closest_idxs.add(closest_idx)
    images_zry_closest_idxs = sorted(closest_idxs)

    # Times that already have a PNG on disk; a set gives O(1) membership tests.
    os.makedirs(sav_folder_path, exist_ok=True)
    images_processed_times = {
        pd.to_datetime(os.path.splitext(os.path.basename(path))[0])
        for path in glob.glob(os.path.join(sav_folder_path, "*.png"))
    }

    # "sdoaia171" is not created here; presumably it is registered by the
    # notebook's `sunpy.visualization.colormaps` import — TODO confirm.
    colormap = plt.get_cmap("sdoaia171")

    n_selected = len(images_zry_closest_idxs)
    fetched_images_paths = []
    for image_time_idx, zarr_idx in enumerate(images_zry_closest_idxs):
        current_img_time = images_zry_times[zarr_idx]

        if current_img_time in images_processed_times:
            print(
                f"Skipping image_time_idx {image_time_idx} as it has been processed already."
            )
            continue

        # Read from the already opened (and cached) array instead of opening a
        # new S3 connection per image, then keep every second row and column.
        image_arr = np.asarray(images_171a_zarray[zarr_idx, :, :])[::2, ::2]

        # Save the image
        image_path = os.path.join(sav_folder_path, f"{current_img_time}.png")
        fig, ax = plt.subplots(figsize=(5, 5))
        try:
            ax.imshow(image_arr, origin="lower", vmin=10, vmax=1000, cmap=colormap)
            fig.savefig(image_path)
        finally:
            # Close only this figure so resources are released even on failure.
            plt.close(fig)

        print(f"fetched image_time_idx: {image_time_idx} of {n_selected}")

        fetched_images_paths.append(image_path)

    return fetched_images_paths

In [17]:
s3_root_for_sdoml_year_zarr = (
    "s3://gov-nasa-hdrl-data1/contrib/fdl-sdoml/fdl-sdoml-v2/sdomlv2.zarr/2015/"
)
# One UTC timestamp per hour across 2015. "60min" is the non-deprecated
# spelling of the former "60T" frequency alias.
desired_times = pd.date_range(
    start="2015-01-01 00:00:00", end="2015-12-31 23:59:59", freq="60min", tz="UTC"
)
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
sav_folder_path = "/Users/aishsk6/gd_to_be_archived_big_files/sdo_image_data"

get_sdo_solar_images_from_aws(
    s3_root_for_sdoml_year_zarr=s3_root_for_sdoml_year_zarr,
    desired_times=desired_times,
    sav_folder_path=sav_folder_path,
    tolerance=pd.Timedelta(days=1000),
)

KeyboardInterrupt: 

In [15]:
import pickle

# Round-trip a small DatetimeIndex through pickle to check that it survives
# serialisation. Context managers make sure both file handles are closed.
times = pd.to_datetime(["2000", "2005"])
with open('images_zry_times.pkl', 'wb') as pkl_file:
    pickle.dump(times, pkl_file)

with open('images_zry_times.pkl', 'rb') as pkl_file:
    times = pickle.load(pkl_file)
times

DatetimeIndex(['2000-01-01', '2005-01-01'], dtype='datetime64[ns]', freq=None)

In [6]:
# get the goes flare events

import pandas as pd

goes_events_data = pd.read_csv('goes_events_clean_2015.csv')

# Filter the original dataframe for rows where 'Particulars_a' starts with 'M' or 'X'.
# na=False treats missing values as "no match"; without it the mask would
# contain NaN and boolean indexing would raise.
is_MX_event = goes_events_data['Particulars_a'].str.startswith(('M', 'X'), na=False)
goes_MX_events = goes_events_data[is_MX_event]

# Use 'max_datetime' where present, falling back to 'begin_datetime', and
# parse the result as timezone-aware UTC timestamps.
goes_MX_event_times = pd.to_datetime(
    goes_MX_events['max_datetime'].fillna(goes_MX_events['begin_datetime']),
    utc=True,
).tolist()


In [7]:
# Fetch the images closest to each selected GOES event time, accepting a
# 10-minute tolerance, and store them in a dedicated folder.
s3_root_for_sdoml_year_zarr = "s3://gov-nasa-hdrl-data1/contrib/fdl-sdoml/fdl-sdoml-v2/sdomlv2.zarr/2015/"
desired_times = goes_MX_event_times
sav_folder_path = "/Users/aishsk6/gd_to_be_archived_big_files/sdo_image_data_goes_events/"

get_sdo_solar_images_from_aws(
    s3_root_for_sdoml_year_zarr=s3_root_for_sdoml_year_zarr,
    desired_times=desired_times,
    sav_folder_path=sav_folder_path,
    tolerance=pd.Timedelta(minutes=10),
)

Skipping image_time_idx 0 as it has been processed already.
Skipping image_time_idx 1 as it has been processed already.
Skipping image_time_idx 2 as it has been processed already.
Skipping image_time_idx 3 as it has been processed already.
Skipping image_time_idx 4 as it has been processed already.
Skipping image_time_idx 5 as it has been processed already.
Skipping image_time_idx 6 as it has been processed already.
Skipping image_time_idx 7 as it has been processed already.
Skipping image_time_idx 8 as it has been processed already.
Skipping image_time_idx 9 as it has been processed already.
Skipping image_time_idx 10 as it has been processed already.
Skipping image_time_idx 11 as it has been processed already.
Skipping image_time_idx 12 as it has been processed already.
Skipping image_time_idx 13 as it has been processed already.
Skipping image_time_idx 14 as it has been processed already.
Skipping image_time_idx 15 as it has been processed already.
Skipping image_time_idx 16 as it h

[]