# AI Data Enrichment (UNOOSA ➜ INTLDES→NORAD ➜ N2YO TLE)

This notebook incrementally builds an enriched dataset by:
- Normalizing UNOOSA international designators
- Building an INTLDES→NORAD crosswalk from SATCAT
- Merging UNOOSA with NORAD IDs
- Fetching N2YO TLEs (sample) with caching/backoff
- Parsing TLEs into orbit features
- Merging features back for analysis

Each cell performs one focused action and prints a small sample of results.

## 0) Setup

- Ensure `config/.env` defines `N2YO_API_KEY` (create the file by copying `config/.env.example`, then fill in your key).
- This notebook loads UNOOSA data and a SATCAT CSV, merges to NORAD IDs, fetches N2YO TLEs (sample), parses features, and merges back.
- Each cell focuses on one action; run the cells in order, because later cells reuse variables defined by earlier ones.

In [3]:
# 1) Imports & paths
import os
import sys
from pathlib import Path

import pandas as pd

# Project-relative paths
ROOT = Path("..")  # this notebook is inside notebooks/
UNOOSA_PATH = ROOT / "data" / "processed" / "df_unoosa.csv"
SATCAT_PATH = ROOT / "data" / "raw" / "satcat.csv"  # replace if your satcat file is named differently

# Later cells import helpers as `src.<module>`. ROOT is never added to the import
# path, so register the project root here to keep those imports working on a
# fresh kernel. Appending (rather than prepending) leaves any `src` package that
# is already importable in charge.
# NOTE(review): assumes src/ sits directly under ROOT — confirm against the repo layout.
_project_root = str(ROOT.resolve())
if _project_root not in sys.path:
    sys.path.append(_project_root)

print("UNOOSA_PATH:", UNOOSA_PATH)
print("SATCAT_PATH:", SATCAT_PATH)


UNOOSA_PATH: ../data/processed/df_unoosa.csv
SATCAT_PATH: ../data/raw/satcat.csv


In [4]:
# 2) Read the processed UNOOSA export, report its size, and show the first three records
unoosa = pd.read_csv(UNOOSA_PATH)
print("UNOOSA rows, cols:", unoosa.shape)
unoosa.iloc[:3]


UNOOSA rows, cols: (21289, 23)


Unnamed: 0,id,uri,international_designator,international_designator_off,national_designator,space_object_name,space_object_name_2,state_of_registry,state_of_registry_off,date_of_launch,...,status_off,date_of_decay,date_of_launch_off,date_of_decay_off,function,remarks,external_website,registration_doc,gso_location_off,symbol
0,"102,en,/osoindex/data/objects/2025/2025-085q_2...",/osoindex/data/objects/2025/2025-085q_24495.html,2025-085Q,False,,STARLINK 33861,,USA,False,2025-04-28,...,False,,False,False,------,Not registered with the United Nations. Date o...,,,True,
1,"102,en,/osoindex/data/objects/2025/2025-085s_2...",/osoindex/data/objects/2025/2025-085s_24497.html,2025-085S,False,,STARLINK 33887,,USA,False,2025-04-28,...,False,,False,False,------,Not registered with the United Nations. Date o...,,,True,
2,"102,en,/osoindex/data/objects/2025/2025-085t_2...",/osoindex/data/objects/2025/2025-085t_24498.html,2025-085T,False,,STARLINK 33886,,USA,False,2025-04-28,...,False,,False,False,------,Not registered with the United Nations. Date o...,,,True,


In [None]:
# 3) Read the SATCAT catalogue CSV and preview it.
#    The file must already exist at SATCAT_PATH — download it first if it is missing.
satcat = pd.read_csv(SATCAT_PATH)
print("SATCAT rows, cols:", satcat.shape)
satcat.iloc[:3]


In [None]:
# 4) Derive the INTLDES→NORAD lookup table from SATCAT and look at the first rows
from src.crosswalk import (
    build_satcat_crosswalk,
    merge_unoosa_with_crosswalk,
)

satcat_xwalk = build_satcat_crosswalk(satcat)
satcat_xwalk.head(n=5)


In [None]:
# 5) Join the UNOOSA records to the crosswalk so each row carries its NORAD ID
merged = merge_unoosa_with_crosswalk(
    unoosa,
    satcat_xwalk,
    unoosa_intldes_col="international_designator",
)

print("Merged rows, cols:", merged.shape)
# Preview only the join-related columns
merged.loc[
    :, ["international_designator", "intldes", "norad_id", "satcat_satname"]
].head(10)


In [None]:
# 6) Fetch a small sample of TLEs via N2YO (uses N2YO_API_KEY in config/.env)
from src.n2yo_client import N2YOClient

# Pick a small set to respect rate limits during exploration
sample_ids = (
    merged["norad_id"].dropna().astype(int).drop_duplicates().head(10).tolist()
)
print("Sample NORAD IDs:", sample_ids)

client = N2YOClient(cache_dir="cache_tle")

# Fixed output schema. Without it, a run where every request fails (or where no
# NORAD IDs were matched) yields a frame lacking the TLE columns, and the later
# merge cell raises KeyError when it selects them.
TLE_SAMPLE_COLUMNS = [
    "norad_id",
    "n2yo_satname",
    "tle_one_line",
    "n2yo_txn_last_60min",
    "error",
]

tle_rows = []
for nid in sample_ids:
    try:
        d = client.get_tle(int(nid))
        info = d.get("info") or {}  # tolerate a missing or null "info" section
        tle_rows.append({
            "norad_id": int(nid),
            "n2yo_satname": info.get("satname"),
            "tle_one_line": d.get("tle"),
            "n2yo_txn_last_60min": info.get("transactionscount"),
        })
    except Exception as e:
        # Best-effort sampling: keep going and record the failure text for this ID.
        tle_rows.append({"norad_id": int(nid), "error": str(e)})

# `columns=` pins both the column set and its order; keys absent from a row become NaN.
# The cast keeps the join key integer-typed even when the frame is empty.
sample_tle_df = pd.DataFrame(tle_rows, columns=TLE_SAMPLE_COLUMNS).astype(
    {"norad_id": "int64"}
)
sample_tle_df.head(5)


In [None]:
# 7) Parse TLE features for the sample
from src.tle_parse import parse_tle_fields

feat_rows = []
for _, r in sample_tle_df.iterrows():
    tle_text = r.get("tle_one_line")
    # Rows whose fetch failed hold NaN (a float) here; hand the parser None
    # instead of a float masquerading as TLE text.
    # NOTE(review): assumes parse_tle_fields accepts None for a missing TLE —
    # the lookup above already yields None when the column is absent; confirm.
    if pd.isna(tle_text):
        tle_text = None
    parsed = parse_tle_fields(tle_text)
    # Copy instead of mutating the parser's return value; tolerate a None result.
    feats = dict(parsed) if parsed is not None else {}
    feats["norad_id"] = r["norad_id"]
    feat_rows.append(feats)

sample_tle_feats = pd.DataFrame(feat_rows)
if "norad_id" not in sample_tle_feats.columns:
    # Empty sample: keep the join key so a merge on "norad_id" still works.
    sample_tle_feats["norad_id"] = pd.Series(dtype="int64")
sample_tle_feats.head(5)


In [None]:
# 8) Attach the sampled TLE payload, then its parsed features, to the merged UNOOSA table.
#    Both joins are left joins on "norad_id", so every UNOOSA row is kept.
sample_enriched = pd.merge(
    pd.merge(
        merged,
        sample_tle_df[["norad_id", "n2yo_satname", "tle_one_line", "n2yo_txn_last_60min"]],
        how="left",
        on="norad_id",
    ),
    sample_tle_feats,
    how="left",
    on="norad_id",
)

print("Sample enriched rows, cols:", sample_enriched.shape)
sample_enriched.head(n=10)


In [None]:
# 9) Full enrichment (optional) — run the whole pipeline, then persist all three frames
from src.enrich_unoosa import enrich_with_n2yo

enriched, tle_df, tle_feats_df = enrich_with_n2yo(
    unoosa_df=unoosa, satcat_csv_path=SATCAT_PATH, cache_dir="cache_tle"
)

# Results go under data/processed/; create the folder if it is not there yet
out_dir = ROOT / "data" / "processed"
out_dir.mkdir(parents=True, exist_ok=True)

# Write every frame first, then report the paths
written = []
for frame, filename in (
    (enriched, "enriched_unoosa_n2yo.csv"),
    (tle_df, "n2yo_tle_raw.csv"),
    (tle_feats_df, "n2yo_tle_features.csv"),
):
    target = out_dir / filename
    frame.to_csv(target, index=False)
    written.append(target)

print("Wrote:")
for target in written:
    print(" -", target)
