In [1]:
# Step 1: pull the NASA NEO data through the project's API client
from src.client.api_client import fetch_neo_data

data = fetch_neo_data()

In [2]:
# Step 2: keep a copy of the raw (untransformed) data in S3
from src.db.aws_client import AWSClient

aws_client = AWSClient()
aws_client.save_data_to_s3(data)

[92m[INFO] 2026-01-01 20:56:24 - Successfully uploaded data to AWS S3: s3://nasa-neo-pipeline/raw-neo/2026-01-01.json[0m


In [3]:
# Step 3: flatten and clean the fetched data so it can be stored
from src.transform.flatten_neo import extract_neo

flat_data = extract_neo(data)

[92m[INFO] 2026-01-01 20:56:25 - Successfully flattened NEO data, except for close approach data.[0m


In [4]:
# Step 4: split the flattened data into the NEO part and a separate
# list of close approaches
from src.transform.clean_approaches import get_new_neo_approaches

neo, approaches = get_new_neo_approaches(flat_data)

In [5]:
# Step 5: load the NEO data and the close-approach data into pandas DataFrames
import pandas as pd

neo_df = pd.DataFrame(data=neo["neo"])
approaches_df = pd.DataFrame(data=approaches)

In [6]:
# preview the first five NEO rows
neo_df.head(n=5)

Unnamed: 0,reference_id,neo_name,nasa_jpl_url,absolute_magnitude_h,estimated_diameter_min_km,estimated_diameter_max_km,is_potentially_hazardous,is_sentry_object
0,2259221,259221 (2003 BA21),https://ssd.jpl.nasa.gov/tools/sbdb_lookup.htm...,19.19,0.385971,0.863058,False,False
1,3022973,(1999 TN13),https://ssd.jpl.nasa.gov/tools/sbdb_lookup.htm...,23.6,0.050647,0.11325,False,False
2,3837745,(2019 AR7),https://ssd.jpl.nasa.gov/tools/sbdb_lookup.htm...,26.7,0.012149,0.027167,False,False
3,54215203,(2021 VF),https://ssd.jpl.nasa.gov/tools/sbdb_lookup.htm...,27.85,0.007154,0.015997,False,False
4,54297798,(2022 QN5),https://ssd.jpl.nasa.gov/tools/sbdb_lookup.htm...,25.83,0.018137,0.040555,False,False


In [7]:
# preview the first five close-approach rows
approaches_df.head(n=5)

Unnamed: 0,reference_id,close_approach_date_epoch,relative_velocity_kms,miss_distance_km,orbiting_body
0,2259221,1767846240000,26.110554,29738120.0,Earth
1,3022973,1767889320000,15.487924,42958190.0,Earth
2,3837745,1767876420000,16.878495,67667500.0,Earth
3,54215203,1767911460000,7.892092,35145530.0,Earth
4,54297798,1767879000000,20.944791,36219000.0,Earth


In [8]:
# standardize NEO names to a format: "YYYY identifier"
import re

# Four digits, a space, then a run of word characters (e.g. "2003 BA21").
regex = r"\d{4} [\w\d]+"
designation_re = re.compile(regex)
# The same designation wrapped in parentheses, e.g. "(2003 BA21)". It is tried
# first so that a name like "1566 Icarus (1949 MA)" yields "1949 MA" instead of
# the leading "1566 Icarus", which also fits the bare pattern.
paren_designation_re = re.compile(r"\((" + regex + r")\)")


def standardize_neo_name(name):
    """Return the "YYYY identifier" designation contained in ``name``.

    Lookup order:
      1. a designation enclosed in parentheses,
      2. the first designation-like match anywhere in the string,
      3. the value unchanged when nothing matches.

    Non-string values (e.g. None / NaN) are returned unchanged instead of
    raising ``TypeError`` inside the regex search.
    """
    if not isinstance(name, str):
        return name

    found = paren_designation_re.search(name)
    if found is not None:
        return found.group(1)

    found = designation_re.search(name)
    return found.group() if found is not None else name


neo_df["clean_neo_name"] = neo_df["neo_name"].apply(standardize_neo_name)

# Re-insert the new column at position 2 (right after "neo_name" in the
# displayed column order). pop + insert keeps the cell safe to re-run.
neo_df.insert(2, "clean_neo_name", neo_df.pop("clean_neo_name"))
neo_df.head()

Unnamed: 0,reference_id,neo_name,clean_neo_name,nasa_jpl_url,absolute_magnitude_h,estimated_diameter_min_km,estimated_diameter_max_km,is_potentially_hazardous,is_sentry_object
0,2259221,259221 (2003 BA21),2003 BA21,https://ssd.jpl.nasa.gov/tools/sbdb_lookup.htm...,19.19,0.385971,0.863058,False,False
1,3022973,(1999 TN13),1999 TN13,https://ssd.jpl.nasa.gov/tools/sbdb_lookup.htm...,23.6,0.050647,0.11325,False,False
2,3837745,(2019 AR7),2019 AR7,https://ssd.jpl.nasa.gov/tools/sbdb_lookup.htm...,26.7,0.012149,0.027167,False,False
3,54215203,(2021 VF),2021 VF,https://ssd.jpl.nasa.gov/tools/sbdb_lookup.htm...,27.85,0.007154,0.015997,False,False
4,54297798,(2022 QN5),2022 QN5,https://ssd.jpl.nasa.gov/tools/sbdb_lookup.htm...,25.83,0.018137,0.040555,False,False


In [9]:
# store both datasets in Postgres
from src.db.sql_client import SQLClient

sql_client = SQLClient()
try:
    # NEO rows are written before their approaches, matching the original order.
    sql_client.store_neo_data(neo_df)
    sql_client.store_approach_data(approaches_df)
finally:
    # Always release the client, even if one of the store calls raises;
    # otherwise it stays open in the kernel.
    sql_client.close()

[92m[INFO] 2026-01-01 20:56:28 - Updated data stored in database for NEO: 2259221[0m
[92m[INFO] 2026-01-01 20:56:28 - Updated data stored in database for NEO: 3022973[0m
[92m[INFO] 2026-01-01 20:56:28 - Updated data stored in database for NEO: 3837745[0m
[92m[INFO] 2026-01-01 20:56:28 - Updated data stored in database for NEO: 54215203[0m
[92m[INFO] 2026-01-01 20:56:28 - Updated data stored in database for NEO: 54297798[0m
[92m[INFO] 2026-01-01 20:56:28 - Updated data stored in database for NEO: 54417555[0m
[92m[INFO] 2026-01-01 20:56:28 - Updated data stored in database for NEO: 54417548[0m
[92m[INFO] 2026-01-01 20:56:28 - Updated data stored in database for NEO: 54422164[0m
[92m[INFO] 2026-01-01 20:56:28 - Updated data stored in database for NEO: 54427938[0m
[92m[INFO] 2026-01-01 20:56:28 - Updated data stored in database for NEO: 54567021[0m
[92m[INFO] 2026-01-01 20:56:28 - Updated data stored in database for NEO: 54567035[0m
[92m[INFO] 2026-01-01 20:56:28 - U