# Download Missing AMSR-E Data for Existing EMIT-Aqua Coincident Pairs

**Purpose:**  
This notebook scans existing coincident data directories, reads AIRS and MODIS filenames to determine acquisition times, and downloads the corresponding AMSR-E data that was missing from the original download.

**Requirements:**
+ A NASA [Earthdata Login](https://urs.earthdata.nasa.gov/) account
+ A configured `.netrc` file containing your NASA Earthdata credentials
+ The existing coincident data directory structure from the original download
+ The `requests` Python package and the `wget` command-line tool available on your `PATH`

---

## Import Required Packages

In [None]:
import os
import re
import subprocess
import requests
import datetime as dt
from pathlib import Path
from collections import defaultdict

## Configuration

Set the base directory where your coincident data was downloaded. The notebook will scan subdirectories to find existing AIRS/MODIS files.

In [None]:
# CMR API base URL; search endpoints are appended to this prefix
cmrurl = 'https://cmr.earthdata.nasa.gov/search/'

# Root folder that holds one subdirectory per coincident pair
base_data_dir = '/Users/anbu8374/Downloads/coincident_data/'

# AMSR-E products to look up, keyed by short name. Each entry carries the
# product DOI, a human-readable description, and a 'concept_id' slot that
# starts as None and is filled in once it has been fetched from CMR.
amsr_products = {
    key: {'doi': doi, 'concept_id': None, 'description': description}
    for key, doi, description in (
        (
            'AE_Rain',
            '10.5067/AMSR-E/AE_RAIN_DAY.003',
            'AMSR-E/Aqua L2B Global Swath Surface Precipitation',
        ),
        (
            'AE_Ocean',
            '10.5067/AMSR-E/AE_OCEAN.003',
            'AMSR-E/Aqua L2B Global Swath Ocean Products',
        ),
    )
}

# Stop immediately when the data root is missing; otherwise report it
if os.path.exists(base_data_dir):
    print(f"Base data directory: {base_data_dir}")
    print(f"Directory exists: {os.path.exists(base_data_dir)}")
else:
    raise ValueError(f"Base data directory not found: {base_data_dir}")

## Get AMSR-E Concept IDs

In [None]:
# Resolve each AMSR-E product DOI to its CMR collection concept ID.
# A failed lookup is reported and leaves 'concept_id' as None for that
# product; the remaining products are still processed.
for product_key, product_info in amsr_products.items():
    doi = product_info['doi']
    doisearch = cmrurl + 'collections.json?doi=' + doi

    try:
        # Without a timeout an unresponsive server would hang this cell forever
        response = requests.get(doisearch, timeout=60)
        response.raise_for_status()
        entries = response.json()['feed']['entry']
        if not entries:
            # Give a meaningful message instead of "list index out of range"
            raise LookupError('CMR returned no collection for this DOI')
        concept_id = entries[0]['id']
        amsr_products[product_key]['concept_id'] = concept_id
        print(f"{product_key}: {concept_id}")
        print(f"  Description: {product_info['description']}")
    except Exception as e:
        # Top-level reporting boundary: show the problem and move on
        print(f"Error fetching concept ID for {product_key}: {e}")
        print(f"  DOI: {doi}")

## Parse Existing Data Directories

Scan the coincident data directories and extract timing information from AIRS and MODIS filenames.

In [None]:
def parse_aqua_filename(filename):
    """
    Extract the acquisition date and time encoded in an Aqua filename.

    Recognised patterns:
    - AIRS:  AIRS.2024.05.16.193.L2.RetStd.v7.0.7.0.G24137155634.hdf
    - MODIS: MYD021KM.A2024137.1930.061.2024138154624.hdf
             (any MYD* product short name, e.g. MYD03, MYD35_L2)

    Any other name (AMSR-E files included) is not parsed.

    Args:
        filename: file name (or path) to inspect.

    Returns:
        dict: {'year': int, 'month': int, 'day': int, 'hour': int,
               'minute': int, 'source': 'AIRS' or 'MODIS'},
              or None if the name does not match or encodes an invalid
              date/time.
    """
    # AIRS format: AIRS.YYYY.MM.DD.GGG (GGG is the granule number, ~6 min each)
    airs_match = re.search(r'AIRS\.(\d{4})\.(\d{2})\.(\d{2})\.(\d{3})', filename)
    if airs_match:
        year, month, day, granule = (int(g) for g in airs_match.groups())

        # AIRS has 240 six-minute granules per day; anything else is not a
        # valid granule number.
        if not 1 <= granule <= 240:
            return None

        # Approximate the granule time as granule * 6 minutes after midnight.
        # Going through datetime arithmetic lets granule 240 (1440 minutes)
        # roll over to 00:00 of the next day instead of yielding hour 24,
        # which datetime() would reject later on.
        # NOTE(review): this is an approximation of the true start time --
        # confirm against the AIRS granule timing documentation if sub-granule
        # accuracy is needed.
        try:
            obs_time = dt.datetime(year, month, day) + dt.timedelta(minutes=granule * 6)
        except (ValueError, OverflowError):
            return None  # impossible calendar date in the filename

        return {
            'year': obs_time.year,
            'month': obs_time.month,
            'day': obs_time.day,
            'hour': obs_time.hour,
            'minute': obs_time.minute,
            'source': 'AIRS'
        }

    # MODIS format: <MYD product>.AYYYYDDD.HHMM
    # The product short name mixes digits, letters and underscores
    # (MYD021KM, MYD03, MYD35_L2), so it cannot be matched as digits only.
    modis_match = re.search(r'MYD\w+\.A(\d{4})(\d{3})\.(\d{2})(\d{2})', filename)
    if modis_match:
        year, doy, hour, minute = (int(g) for g in modis_match.groups())

        # Convert day of year to a calendar date; datetime() validates the
        # hour and minute fields for us.
        try:
            obs_time = dt.datetime(year, 1, 1, hour, minute) + dt.timedelta(days=doy - 1)
        except (ValueError, OverflowError):
            return None

        # Reject day-of-year values that fall outside the stated year
        if doy < 1 or obs_time.year != year:
            return None

        return {
            'year': obs_time.year,
            'month': obs_time.month,
            'day': obs_time.day,
            'hour': obs_time.hour,
            'minute': obs_time.minute,
            'source': 'MODIS'
        }

    return None


def create_temporal_search_string(time_info, window_minutes=30):
    """
    Build the CMR `temporal` parameter for a window centred on an observation.

    Args:
        time_info: dict providing 'year', 'month', 'day', 'hour' and 'minute'
        window_minutes: half-width of the window in minutes (default ±30)

    Returns:
        str: '<start>,<end>' with both timestamps formatted as
             YYYY-MM-DDTHH:MM:SSZ
    """
    fields = ('year', 'month', 'day', 'hour', 'minute')
    centre = dt.datetime(*(time_info[name] for name in fields))

    # The same offset is applied on both sides of the observation time
    half_window = dt.timedelta(minutes=window_minutes)
    bounds = (centre - half_window, centre + half_window)

    return ','.join(bound.strftime('%Y-%m-%dT%H:%M:%SZ') for bound in bounds)


# Walk every pair directory and record the ones that still lack AMSR data
print("Scanning data directories...\n")
print("=" * 70)

pair_info = {}  # pair name -> timing and search details for that pair

# Immediate subdirectories of base_data_dir
subdirs = [d for d in Path(base_data_dir).iterdir() if d.is_dir()]

for subdir in sorted(subdirs):
    pair_name = subdir.name

    # Sort the directory contents by instrument, based on the file name
    files = list(subdir.glob('*'))
    granule_suffixes = ('.hdf', '.nc')
    airs_files = [f for f in files if f.suffix in granule_suffixes and 'AIRS' in f.name]
    modis_files = [f for f in files if f.suffix in granule_suffixes and 'MYD' in f.name]
    amsr_files = [f for f in files if any(tag in f.name for tag in ('AMSR', 'AE_'))]

    # Nothing to derive a time from: ignore the directory silently
    if not (airs_files or modis_files):
        continue

    # AMSR data is already present: nothing to do for this pair
    if amsr_files:
        print(f"✓ {pair_name}: AMSR data already exists ({len(amsr_files)} files) - SKIPPING")
        continue

    # Take the first filename that yields a time, trying every MODIS file
    # (more precise timing) before falling back to the AIRS files.
    time_info = None
    source_file = None
    for candidate in modis_files + airs_files:
        time_info = parse_aqua_filename(candidate.name)
        if time_info:
            source_file = candidate.name
            break

    if not time_info:
        print(f"⚠ {pair_name}: Could not parse timing information - SKIPPING")
        continue

    pair_info[pair_name] = {
        'directory': subdir,
        'time_info': time_info,
        'source_file': source_file,
        'temporal_str': create_temporal_search_string(time_info),
        'airs_count': len(airs_files),
        'modis_count': len(modis_files)
    }

    print(f"✗ {pair_name}: Missing AMSR data")
    print(f"    Time: {time_info['year']:04d}-{time_info['month']:02d}-{time_info['day']:02d} "
          f"{time_info['hour']:02d}:{time_info['minute']:02d} UTC (from {time_info['source']})")
    print(f"    Files: {len(airs_files)} AIRS, {len(modis_files)} MODIS")

print("=" * 70)
print(f"\nFound {len(pair_info)} pair(s) missing AMSR data\n")

## Search and Download AMSR-E Data

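For each pair missing AMSR data, search CMR for AMSR-E granules acquired within ±30 minutes of the pair's AIRS/MODIS time and collect their download URLs. The files themselves are downloaded in the next section.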

In [None]:
def search_cmr_amsr(concept_id, temporal_str, page_size=2000, timeout=60):
    """
    Search CMR for AMSR-E granules matching temporal criteria.

    A single request is issued, so at most `page_size` granules are examined.

    Args:
        concept_id: CMR collection concept ID to search within
        temporal_str: CMR temporal range, '<start>,<end>'
        page_size: value sent as the CMR `page_size` parameter
        timeout: seconds to wait for the HTTP request before giving up

    Returns:
        list: unique data-file URLs of the matching granules, in the order
              they were first seen; an empty list if the search fails
    """
    granule_search_url = cmrurl + 'granules'

    search_params = {
        'concept_id': concept_id,
        'temporal': temporal_str,
        'page_size': page_size,
    }

    headers = {'Accept': 'application/json'}

    # Substrings that mark a data file, and endings that mark metadata or
    # auxiliary files ('.0' ... '.9' plus the named suffixes).
    data_markers = ('.hdf', '.nc', '.h5', '.he5', '.hdf5')
    excluded_endings = tuple(f'.{digit}' for digit in '0123456789') + (
        '.xml', '.qa', '.ph', '.html')

    try:
        # Without a timeout an unresponsive server would block indefinitely
        response = requests.get(
            granule_search_url,
            params=search_params,
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()
        granules = response.json()['feed']['entry']

        urls = []
        for g in granules:
            for link in g.get('links', []):
                # A link without 'href' is skipped rather than discarding
                # every result of the search.
                href = link.get('href') or ''
                if (
                    'https' in href
                    and any(marker in href for marker in data_markers)
                    and '.dmrpp' not in href
                    and not href.endswith(excluded_endings)
                ):
                    urls.append(href)

        # Drop repeated URLs while keeping first-seen order, so the same file
        # is not queued for download twice.
        return list(dict.fromkeys(urls))

    except (requests.RequestException, ValueError, KeyError,
            TypeError, AttributeError) as e:
        # Network/HTTP failure, undecodable JSON, or an unexpected payload
        print(f"    Error searching CMR: {e}")
        return []


# Search CMR for AMSR-E granules for each pair and collect the download URLs
print("=" * 70)
print("SEARCHING FOR AMSR-E DATA")
print("=" * 70)
print()

# pair name -> {'urls': [...], 'directory': Path, 'count': int}.
# A plain dict is used on purpose: every value is a dict, so a list default
# factory would only hide a mistyped key behind an empty list.
download_summary = {}

for pair_name, info in pair_info.items():
    print(f"Pair: {pair_name}")
    print(f"  Time: {info['temporal_str']}")

    pair_urls = []

    # Search each AMSR product
    for product_key, product_data in amsr_products.items():
        # A product whose concept ID lookup failed cannot be searched
        if not product_data['concept_id']:
            print(f"  - {product_key}: No concept ID available - SKIPPING")
            continue

        urls = search_cmr_amsr(product_data['concept_id'], info['temporal_str'])

        if urls:
            print(f"  - {product_key}: Found {len(urls)} file(s)")
            pair_urls.extend(urls)
        else:
            print(f"  - {product_key}: No files found")

    # Only pairs with at least one URL are passed on for download
    if pair_urls:
        download_summary[pair_name] = {
            'urls': pair_urls,
            'directory': info['directory'],
            'count': len(pair_urls)
        }
        print(f"  Total files to download: {len(pair_urls)}\n")
    else:
        print("  ⚠ No AMSR data found for this time period\n")

print("=" * 70)
print(f"Total pairs with AMSR data found: {len(download_summary)}")
print(f"Total files to download: {sum(v['count'] for v in download_summary.values())}")
print("=" * 70)
print()

## Download AMSR Files

Download the identified AMSR-E files to their respective pair directories.

In [None]:
# Download the collected AMSR-E URLs into each pair directory with wget
print("=" * 70)
print("DOWNLOADING AMSR-E FILES")
print("=" * 70)
print()

total_downloaded = 0
total_failed = 0

for pair_name, download_info in download_summary.items():
    pair_dir = download_info['directory']
    urls = download_info['urls']

    print(f"Downloading to: {pair_name}/")
    print(f"  Files: {len(urls)}")

    # Create URL file for wget (one URL per line)
    url_file = pair_dir / 'amsr_urls_to_download.txt'
    with open(url_file, 'w') as f:
        f.writelines(url + '\n' for url in urls)

    # Download using wget
    try:
        result = subprocess.run(
            ['wget', '-P', str(pair_dir), '-i', str(url_file)],
            capture_output=True,
            text=True
        )

        # Count successful downloads from wget's per-file "... saved [size]"
        # report lines on stderr. Matching ' saved [' rather than the bare
        # word keeps a path that merely contains "saved" from being counted.
        saved_count = result.stderr.count(' saved [') if result.stderr else 0
        total_downloaded += saved_count
        if result.stderr:
            print(f"  ✓ Downloaded {saved_count} file(s)")

        # Clean up URL file if successful
        if result.returncode == 0:
            url_file.unlink()
        else:
            # Only the files that were not saved count as failed; the saved
            # ones have already been added to total_downloaded.
            total_failed += max(len(urls) - saved_count, 0)
            print(f"  ⚠ Download completed with warnings (return code: {result.returncode})")
            print(f"    URL file saved: {url_file.name}")

    except Exception as e:
        # Per-pair reporting boundary (e.g. wget is not installed): record
        # the failure and continue with the next pair.
        total_failed += len(urls)
        print(f"  ✗ Error downloading: {e}")
        print(f"    URLs saved to: {url_file.name}")

    print()

print("=" * 70)
print("DOWNLOAD COMPLETE")
print("=" * 70)
print(f"Successfully downloaded: {total_downloaded} files")
if total_failed > 0:
    print(f"Failed/warnings: {total_failed} files")
print(f"Data location: {base_data_dir}")
print("=" * 70)

## Summary

The notebook has:
1. Scanned your existing coincident data directories
2. Identified pairs missing AMSR-E data
3. Extracted timing information from AIRS/MODIS filenames
4. Searched NASA CMR for matching AMSR-E granules
5. Downloaded AMSR-E files to the appropriate directories

**Note:** If you encounter download issues, check:
- Your `.netrc` file has correct NASA Earthdata credentials
- File permissions: `chmod 600 ~/.netrc`
- Any `*_urls_to_download.txt` files left in directories indicate partial downloads that can be retried manually