# Load Filenames and Times for TEMPO Data

This notebook:
1) Pulls the filenames for all available L3 files from s3
2) Pulls the filenames for all L2 files for swaths that cross over the bounding box
3) Map L2 to L3 files
4) Calculate L3 times by taking the midpoint of the swath times (halfway between beginning of first swath and end of last swath)


In [1]:
import boto3
import xesmf as xe
import xarray as xr
import netCDF4
import pandas as pd
import argparse
import base64
import boto3
import json
import requests
import os
import platform
from subprocess import Popen
import shutil
from datetime import datetime
import re
import pandas as pd
import earthaccess
from load_credentials import *
from bbox import *

# Define event with credentials and S3 endpoint details
event = {
    's3_endpoint': 'https://data.asdc.earthdata.nasa.gov/s3credentials',  # replace with actual endpoint
    'edl_username': 'amanda.murray19',  # replace with your EDL username
    'edl_password': 'Sat_modeling_berk2024',  # replace with your EDL password
    'bucket_name': 'asdc-prod-protected/TEMPO/TEMPO_NO2_L3_V03'  # replace with your bucket name
}

## Get all L3 file names from s3

In [2]:
# Remove .netrc file if it exists
netrc_path = os.path.expanduser('~/.netrc')  # Expands to the user's home directory
if os.path.exists(netrc_path):
    print(".netrc file exists.")
    os.remove(netrc_path)
    print(".netrc file has been removed.")
else:
    print(".netrc file does not exist.")# Retrieve credentials

creds = retrieve_credentials(event)

# Use the credentials to access the S3 bucket
client = boto3.client('s3',
    aws_access_key_id=creds["accessKeyId"],
    aws_secret_access_key=creds["secretAccessKey"],
    aws_session_token=creds["sessionToken"])

bucket_name = 'asdc-prod-protected'
prefix = 'TEMPO/TEMPO_NO2_L3_V03/'

client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

# List objects in the bucket
response = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
tempo_l3_files = []

# Loop to handle pagination if there are more than 1000 objects
while response.get('Contents'):
    # Append the object keys to the list
    tempo_l3_files.extend([r["Key"] for r in response['Contents']])

    # Check if there is a next page of results
    if response.get('NextContinuationToken'):
        response = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix, ContinuationToken=response['NextContinuationToken'])
    else:
        break

# Pull only L3 files
tempo_l3_files_nc = []

for file_name in tempo_l3_files:
    if file_name.endswith('.nc'):
        tempo_l3_files_nc.append(file_name)

.netrc file does not exist.
<Response [307]>


## Get all L3 file names from s3

In [3]:
# Write files to a text file
with open("/home/jupyter/.netrc", "w") as f:
   f.write("""machine urs.earthdata.nasa.gov
login amanda.murray19
password Sat_modeling_berk2024""")

# Establishing access to EarthData,
auth = earthaccess.login(strategy="netrc", persist=True)

# Get current date and time
current_datetime = datetime.now()
formatted_current_datetime = current_datetime.strftime('%Y-%m-%d %H:%M:%S')

#Set up collection information along with timeframe and region of interest
short_name = 'TEMPO_NO2_L2' # collection name to search for in the EarthData
date_start = '2022-05-01 00:00:00' # start date
date_end = formatted_current_datetime # end date
bbox = (lon_min, lat_min, lon_max, lat_max) # 1 degree bounding box around POI

# search for granules
results = earthaccess.search_data(short_name = short_name\
, temporal = (date_start, date_end)
, bounding_box = bbox)

file_pattern = re.compile(r'/(\d{4}\.\d{2}\.\d{2}/TEMPO_NO2_L2_V03_\d{8}T\d{6}Z_S\d+G\d+\.nc)')
datetime_pattern = re.compile(r'BeginningDateTime\': \'([^\']+)\', \'EndingDateTime\': \'([^\']+)')

# Initialize a list to store the data
data = []

# Iterate through your results
for result in results:  # Adjust `results[:4]` as needed
    files = file_pattern.findall(str(result))
    datetimes = datetime_pattern.findall(str(result))

    # Ensure both files and datetimes have matches before proceeding
    if files and datetimes:
        for file, (begin, end) in zip(files, datetimes):
            # Append each record as a tuple (or dictionary)
            data.append({
                "L2_File": file,
                "BeginningDateTime": begin,
                "EndingDateTime": end
            })

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data)

# Convert BeginningDateTime and EndingDateTime columns to timestamps
df['BeginningDateTime'] = pd.to_datetime(df['BeginningDateTime'])
df['EndingDateTime'] = pd.to_datetime(df['EndingDateTime'])

# Extract the date (20230809) and file number (S011) using regular expressions
df['Date'] = df['L2_File'].str.extract(r'_(\d{8})T')
df['FileNumber'] = df['L2_File'].str.extract(r'_(S\d+)')

# Set beginning_time as the time of first swath and ending_time as end of last swath
df_grouped = df.groupby(['Date', 'FileNumber']).agg({'L2_File':'count', 'BeginningDateTime':'min','EndingDateTime':'max'}).reset_index()
# Make Time as the midpoint
df_grouped['Time'] = df_grouped['BeginningDateTime']+(df_grouped['EndingDateTime'] - df_grouped['BeginningDateTime'])/2
df_grouped['Time'] = df_grouped['Time'].round('S')

# Create a DataFrame from the list of lines
df_L3 = pd.DataFrame(tempo_l3_files_nc, columns=['FilePath'])
df_L3['Date'] = df_L3['FilePath'].str.extract(r'_(\d{8})T')
df_L3['FileNumber'] = df_L3['FilePath'].str.extract(r'_(S\d+)')

# Full dataset
full_dataset = pd.merge(df_L3, df_grouped, on=['Date', 'FileNumber'], how='inner')

In [4]:
# Convert to central
full_dataset['timestamp_ct'] = full_dataset['Time'].dt.tz_convert('America/Chicago')
full_dataset['date_central'] = full_dataset['timestamp_ct'].dt.date

# Remove the timezone part from the string (keeping only the date and time)
full_dataset['timestamp_ct'] = full_dataset['timestamp_ct'].dt.strftime('%Y-%m-%d %H:%M:%S')
full_dataset['timestamp_ct'] = full_dataset['timestamp_ct'].str[:19]

# Convert the 'timestamp_ct' column to datetime
full_dataset['timestamp_ct'] = pd.to_datetime(full_dataset['timestamp_ct'])
full_dataset['Hour'] = full_dataset['timestamp_ct'].dt.hour
full_dataset['Minute'] = full_dataset['timestamp_ct'].dt.minute
full_dataset =full_dataset.sort_values('timestamp_ct')

In [5]:
full_dataset.to_csv("../../data/tempo_data/no2_tempo_files_df.csv")

In [7]:
full_dataset

Unnamed: 0,FilePath,Date,FileNumber,L2_File,BeginningDateTime,EndingDateTime,Time,timestamp_ct,date_central,Hour,Minute
0,TEMPO/TEMPO_NO2_L3_V03/2023.08.02/TEMPO_NO2_L3...,20230802,S001,3,2023-08-02 15:37:41+00:00,2023-08-02 15:56:20+00:00,2023-08-02 15:47:00.500000+00:00,2023-08-02 10:47:00,2023-08-02,10,47
1,TEMPO/TEMPO_NO2_L3_V03/2023.08.02/TEMPO_NO2_L3...,20230802,S002,3,2023-08-02 16:40:12+00:00,2023-08-02 16:58:51+00:00,2023-08-02 16:49:31.500000+00:00,2023-08-02 11:49:31,2023-08-02,11,49
2,TEMPO/TEMPO_NO2_L3_V03/2023.08.02/TEMPO_NO2_L3...,20230802,S003,3,2023-08-02 17:42:43+00:00,2023-08-02 18:01:22+00:00,2023-08-02 17:52:02.500000+00:00,2023-08-02 12:52:02,2023-08-02,12,52
3,TEMPO/TEMPO_NO2_L3_V03/2023.08.02/TEMPO_NO2_L3...,20230802,S004,3,2023-08-02 18:45:14+00:00,2023-08-02 19:03:53+00:00,2023-08-02 18:54:33.500000+00:00,2023-08-02 13:54:33,2023-08-02,13,54
4,TEMPO/TEMPO_NO2_L3_V03/2023.08.02/TEMPO_NO2_L3...,20230802,S005,3,2023-08-02 19:47:45+00:00,2023-08-02 20:06:24+00:00,2023-08-02 19:57:04.500000+00:00,2023-08-02 14:57:04,2023-08-02,14,57
...,...,...,...,...,...,...,...,...,...,...,...
9498,TEMPO/TEMPO_NO2_L3_V03/2025.06.27/TEMPO_NO2_L3...,20250627,S004,3,2025-06-27 12:48:02+00:00,2025-06-27 13:07:59+00:00,2025-06-27 12:58:00.500000+00:00,2025-06-27 07:58:00,2025-06-27,7,58
9499,TEMPO/TEMPO_NO2_L3_V03/2025.06.27/TEMPO_NO2_L3...,20250627,S005,3,2025-06-27 13:28:16+00:00,2025-06-27 13:48:07+00:00,2025-06-27 13:38:11.500000+00:00,2025-06-27 08:38:11,2025-06-27,8,38
9500,TEMPO/TEMPO_NO2_L3_V03/2025.06.27/TEMPO_NO2_L3...,20250627,S006,3,2025-06-27 14:28:16+00:00,2025-06-27 14:48:07+00:00,2025-06-27 14:38:11.500000+00:00,2025-06-27 09:38:11,2025-06-27,9,38
9501,TEMPO/TEMPO_NO2_L3_V03/2025.06.27/TEMPO_NO2_L3...,20250627,S007,3,2025-06-27 15:28:16+00:00,2025-06-27 15:48:07+00:00,2025-06-27 15:38:11.500000+00:00,2025-06-27 10:38:11,2025-06-27,10,38
