In [12]:
!python -m pip install -qq pandas xarray matplotlib netcdf4 pyproj pyrsig pycno

In [13]:
import getpass
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pycno
import pyproj
import pyrsig
import xarray as xr

In [14]:
# Days to process, as an inclusive list of "YYYY-MM-DD" strings.
dates = [
    day.strftime("%Y-%m-%d")
    for day in pd.date_range(start="2025-01-02", end="2025-01-08", freq="D")
]

In [15]:
# Dataset keys requested for every day, in the order their columns are merged.
keys_of_data = [
    "tempo.l3.o3tot.column_amount_o3",     # ozone column amount
    "tempo.l2.no2.vertical_column_total",  # NO2 total vertical column
    "tempo.l2.hcho.vertical_column",       # HCHO vertical column
]

In [16]:
# Suppose your dataframe is called df

def unique_or_mean(series):
    """Reduce a Series to a single value.

    If, after discarding missing entries, exactly one distinct value remains,
    that value is returned as-is. In every other case (several distinct
    values, or nothing left at all) the NaN-skipping mean of the series is
    returned.
    """
    distinct = series.dropna().unique()
    if len(distinct) != 1:
        # Zero or several distinct values: fall back to the mean.
        return series.mean(skipna=True)
    return distinct[0]

In [17]:
def haversine(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : scalar or array-like
        Latitude and longitude of the second point(s), in degrees.
        Inputs are combined with NumPy arithmetic, so scalars, arrays and
        pandas Series can be mixed.
    radius_km : float, optional
        Sphere radius in kilometres (default 6371, a mean Earth radius).

    Returns
    -------
    Distance in kilometres, with the same shape/type NumPy arithmetic on the
    inputs produces.
    """
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)

    a = np.sin(dphi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2.0) ** 2
    # Rounding can push `a` marginally outside [0, 1] (e.g. for near-antipodal
    # points), which would make sqrt/arcsin return NaN. Clamp before inverting.
    a = np.minimum(1.0, np.maximum(0.0, a))
    return 2 * radius_km * np.arcsin(np.sqrt(a))

In [18]:
# Build one row of daily means per day in `dates`.
#
# For each day, every dataset in `keys_of_data` is downloaded through pyrsig,
# reduced to the observations located at the point closest to
# (latitude, longitude), merged on timestamp, and averaged into a single row.
# Days that fail for any reason are reported and skipped (best effort).

# Point of interest and the +/- 1 degree bounding box requested around it.
latitude = 40
longitude = -75
bbox = (longitude - 1, latitude - 1, longitude + 1, latitude + 1)

# Columns that describe an observation rather than carry the measured value.
metadata_cols = ["Timestamp(UTC)", "LONGITUDE(deg)", "LATITUDE(deg)", "STATION(-)", "SITE_NAME"]

# SECURITY: credentials must never be hardcoded in a notebook. The key is read
# from the TEMPO_API_KEY environment variable, falling back to an interactive
# prompt. Any token that was previously committed in this cell is exposed and
# should be revoked.
api_key = os.environ.get("TEMPO_API_KEY") or getpass.getpass("TEMPO API key: ")

daily_rows = []  # one single-row DataFrame per successfully processed day
for date in dates:
    try:
        # workdir=date keeps each day's downloads in its own directory.
        api = pyrsig.RsigApi(bdate=date, bbox=bbox, workdir=date, gridfit=True)
        api.tempo_kw["api_key"] = api_key
        # NOTE(review): the result is not used below; the call is kept in case
        # it is needed elsewhere in the notebook. Confirm and remove if not.
        descdf = api.descriptions()

        dfs = []  # per-dataset time series at the nearest point
        for data_key in keys_of_data:
            df = api.to_dataframe(data_key, backend="xdr")
            if df.empty:
                continue
            df = df.dropna().reset_index(drop=True)
            if df.empty:
                # Only missing values for this dataset: skip it, keep the day.
                continue

            # The measured quantity is the first non-metadata column. It is
            # picked before any helper column is created so a helper can
            # never be mistaken for it.
            value_cols = [c for c in df.columns if c not in metadata_cols]
            if not value_cols:
                continue
            pollutant_col = value_cols[0]

            # Find the observation closest to the point of interest.
            distance = haversine(latitude, longitude, df["LATITUDE(deg)"], df["LONGITUDE(deg)"])
            nearest = distance.idxmin()

            # Keep every row that shares the nearest observation's exact
            # coordinates. NOTE(review): this is an exact float comparison, so
            # only observations reported at the identical lon/lat are retained.
            same_point = (
                (df["LONGITUDE(deg)"] == df.at[nearest, "LONGITUDE(deg)"])
                & (df["LATITUDE(deg)"] == df.at[nearest, "LATITUDE(deg)"])
            )
            df_sorted = (
                df.loc[same_point, ["Timestamp(UTC)", pollutant_col]]
                .sort_values("Timestamp(UTC)")
                .reset_index(drop=True)
            )
            dfs.append(df_sorted)

        if not dfs:
            continue

        # Align the datasets on timestamp; the outer join keeps timestamps
        # present in only some of them.
        df_merged = dfs[0]
        for d in dfs[1:]:
            df_merged = pd.merge(df_merged, d, on="Timestamp(UTC)", how="outer")

        df_merged = (
            df_merged
            .groupby("Timestamp(UTC)", as_index=False)
            .mean(numeric_only=True)         # collapse duplicate timestamps
            .drop("Timestamp(UTC)", axis=1)
            .apply(unique_or_mean)           # one value per column for the day
            .to_frame()
            .T
        )
        df_merged["Day"] = date
        daily_rows.append(df_merged)
    except Exception as exc:
        # Best effort: report the failure instead of hiding it, then move on.
        # `Exception` (not a bare except) lets Ctrl-C / kernel interrupt through.
        print(f"Skipping {date}: {exc!r}")
        continue

# Concatenate once at the end (avoids quadratic re-copying and concatenating
# with an empty frame); ignore_index gives each day a unique row label.
all_data = pd.concat(daily_rows, ignore_index=True) if daily_rows else pd.DataFrame()


In [19]:
# Debug peek: `dfs` is whatever list the download loop last assigned, i.e. the
# per-dataset frames of the most recent day for which the loop reset it.
dfs

[             Timestamp(UTC)  o3_column_amount_o3(DU)
 0  2025-01-08T12:51:00+0000               383.985596
 1  2025-01-08T13:31:00+0000               382.872375
 2  2025-01-08T14:11:00+0000               359.779999
 3  2025-01-08T14:51:00+0000               365.455780
 4  2025-01-08T15:51:00+0000               405.702301
 5  2025-01-08T16:51:00+0000               406.225769
 6  2025-01-08T17:51:00+0000               405.633392
 7  2025-01-08T18:51:00+0000               401.174591
 8  2025-01-08T19:51:00+0000               413.395172
 9  2025-01-08T20:51:00+0000               393.222992,
              Timestamp(UTC)  no2_vertical_column_total(molecules/cm2)
 0  2025-01-08T15:04:00+0000                              4.654803e+15,
              Timestamp(UTC)  vertical_column(molecules/cm2)
 0  2025-01-08T15:04:00+0000                    1.161534e+15]

In [20]:
# Display the accumulated table of daily rows.
all_data

Unnamed: 0,o3_column_amount_o3(DU),no2_vertical_column_total(molecules/cm2),vertical_column(molecules/cm2),Day
0,328.207642,8817269000000000.0,7646712000000000.0,2025-01-02
0,373.137692,1.239351e+16,5629199000000000.0,2025-01-03
0,369.672943,7094138000000000.0,5728731000000000.0,2025-01-04
0,326.697672,8022598000000000.0,7053791000000000.0,2025-01-05
0,312.663818,2.475245e+16,2465803000000000.0,2025-01-06
0,355.04559,5127077000000000.0,3945096000000000.0,2025-01-07
0,391.744797,4654803000000000.0,1161534000000000.0,2025-01-08


In [21]:
# Write the daily summary table to a CSV file in the working directory.
output_csv = "tempo_till_jan25.csv"
all_data.to_csv(output_csv)

In [22]:
# Preview the first rows of the table.
all_data.head()

Unnamed: 0,o3_column_amount_o3(DU),no2_vertical_column_total(molecules/cm2),vertical_column(molecules/cm2),Day
0,328.207642,8817269000000000.0,7646712000000000.0,2025-01-02
0,373.137692,1.239351e+16,5629199000000000.0,2025-01-03
0,369.672943,7094138000000000.0,5728731000000000.0,2025-01-04
0,326.697672,8022598000000000.0,7053791000000000.0,2025-01-05
0,312.663818,2.475245e+16,2465803000000000.0,2025-01-06
