### Import Required Libraries and Set Up Environment Variables

First, we'll import the necessary libraries and load our NASA API key.

In [None]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
from datetime import datetime

# Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')

# Verify API key is loaded.
# An empty or whitespace-only value (e.g. a bare `NASA_API_KEY=` line) is
# treated as missing too; otherwise the problem would only surface later as
# a failed API request instead of a clear error here.
if NASA_API_KEY is None or not NASA_API_KEY.strip():
    raise ValueError("NASA_API_KEY not found in .env file")
print("API key loaded successfully")

### Part 1: Request CME data from the NASA API

First, we'll fetch and process the CME (Coronal Mass Ejection) data.

In [None]:
# Set the base URL to NASA's DONKI API
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs
specifier = "CME"

# Search for CMEs between a begin and end date
startDate = "2013-05-01"
endDate = "2024-05-01"

# Build URL for CME
query_url_CME = f"{base_url}{specifier}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"

# Make a GET request and store response.
# The (connect, read) timeout keeps a stalled connection from hanging the
# notebook forever, and raise_for_status() surfaces HTTP errors (bad key,
# rate limit, server error) here rather than as a confusing failure when the
# error payload is indexed like a list of events below.
cme_response = requests.get(query_url_CME, timeout=(10, 120))
cme_response.raise_for_status()
cme_json = cme_response.json()

# Preview first result (guarded so an empty result set does not raise IndexError)
print("Sample CME Data:")
if cme_json:
    print(json.dumps(cme_json[0], indent=4))
else:
    print("No CME records returned for the requested date range.")

In [None]:
# Convert to DataFrame
cme_df = pd.DataFrame(cme_json)

# Keep only required columns
cme_df = cme_df[['activityID', 'startTime', 'linkedEvents']]

# Remove rows with missing linkedEvents
cme_df = cme_df.dropna(subset=['linkedEvents'])

# Initialize empty list for expanded rows
expanded_rows = []

# Build one output row per (CME, linked event) pair.
# Iterating the three columns in lockstep avoids a separate label-based
# .loc lookup for every cell of every row.
for activityID, startTime, linkedEvents in zip(
    cme_df['activityID'], cme_df['startTime'], cme_df['linkedEvents']
):
    for event in linkedEvents:
        expanded_rows.append({
            'activityID': activityID,
            'startTime': startTime,
            'linkedEvent': event
        })

# Create DataFrame from expanded rows.
# Passing the columns explicitly guarantees they exist even when no rows
# were produced, so later cells that select 'linkedEvent' do not hit a
# KeyError on an empty, column-less frame.
cme_expanded_df = pd.DataFrame(
    expanded_rows,
    columns=['activityID', 'startTime', 'linkedEvent']
)

# Display the first few rows
print("\nExpanded CME DataFrame:")
print(cme_expanded_df.head())

In [None]:
# Function to extract activityID from dict
def extract_activityID_from_dict(input_dict):
    """Return ``input_dict['activityID']``, or ``None`` if it cannot be read.

    Args:
        input_dict: Expected to be a dict with an ``'activityID'`` key.
            Any other value (``None``, ``NaN``, a dict without the key, ...)
            is tolerated.

    Returns:
        The value stored under ``'activityID'``, or ``None`` when the key is
        absent or the input does not support string-key lookup.
    """
    try:
        return input_dict['activityID']
    # KeyError: a dict without the key. TypeError: a non-subscriptable input
    # such as None or NaN. ValueError is kept from the original handler.
    except (KeyError, TypeError, ValueError):
        return None

# Test the function on the first remaining CME row.
# Positional access (.iloc) is used because rows were removed by dropna()
# earlier, so the index *label* 0 is not guaranteed to still exist.
print("\nTesting extract_activityID_from_dict function:")
sample_events = cme_df['linkedEvents'].iloc[0] if len(cme_df) else []
if sample_events:
    print(extract_activityID_from_dict(sample_events[0]))
else:
    print("No linked events available to test with.")

# Apply function to create GST_ActivityID column
cme_expanded_df.loc[:, 'GST_ActivityID'] = cme_expanded_df['linkedEvent'].apply(extract_activityID_from_dict)

# Remove rows with missing GST_ActivityID
cme_expanded_df = cme_expanded_df.dropna(subset=['GST_ActivityID'])

# Convert columns to appropriate types
cme_expanded_df['GST_ActivityID'] = cme_expanded_df['GST_ActivityID'].astype(str)
cme_expanded_df['startTime'] = pd.to_datetime(cme_expanded_df['startTime'])

# Rename columns
cme_expanded_df = cme_expanded_df.rename(columns={
    'startTime': 'startTime_CME',
    'activityID': 'cmeID'
})

# Drop linkedEvent column
cme_expanded_df = cme_expanded_df.drop('linkedEvent', axis=1)

# Filter for GST events; .copy() makes the result an independent frame
# rather than a selection tied to cme_expanded_df
cme_final_df = cme_expanded_df[cme_expanded_df['GST_ActivityID'].str.contains('GST')].copy()

print("\nFinal CME DataFrame:")
print(cme_final_df.info())
print("\nFirst few rows:")
print(cme_final_df.head())

### Part 2: Request GST data from the NASA API

Now we'll fetch and process the GST (Geomagnetic Storm) data.

In [None]:
# Set the specifier for GSTs
specifier = "GST"

# Build URL for GST
query_url_GST = f"{base_url}{specifier}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"

# Make a GET request and store response.
# The (connect, read) timeout keeps a stalled connection from hanging the
# notebook forever, and raise_for_status() surfaces HTTP errors (bad key,
# rate limit, server error) here rather than as a confusing failure when the
# error payload is indexed like a list of events below.
gst_response = requests.get(query_url_GST, timeout=(10, 120))
gst_response.raise_for_status()
gst_json = gst_response.json()

# Preview first result (guarded so an empty result set does not raise IndexError)
print("Sample GST Data:")
if gst_json:
    print(json.dumps(gst_json[0], indent=4))
else:
    print("No GST records returned for the requested date range.")

In [None]:
# Build the GST frame in a single pipeline:
#   1. load the JSON records and keep only the three columns we need
#   2. discard storms that have no linkedEvents value
#   3. explode linkedEvents so each list element gets its own row
#   4. renumber the rows with a fresh 0..n-1 index
gst_df = (
    pd.DataFrame(gst_json)[['gstID', 'startTime', 'linkedEvents']]
    .dropna(subset=['linkedEvents'])
    .explode('linkedEvents')
    .reset_index(drop=True)
)

# Preview the exploded frame
print("\nGST DataFrame after explode:")
print(gst_df.head())

In [None]:
# Derive CME_ActivityID from each linkedEvents entry, then keep only the
# rows where an ID could actually be extracted
gst_df = gst_df.assign(
    CME_ActivityID=gst_df['linkedEvents'].apply(extract_activityID_from_dict)
)
gst_df = gst_df.dropna(subset=['CME_ActivityID'])

# Normalise dtypes: IDs as strings, start times as datetimes
gst_df = gst_df.assign(
    gstID=gst_df['gstID'].astype(str),
    startTime=pd.to_datetime(gst_df['startTime']),
)

# Give startTime a GST-specific name and discard the raw linkedEvents column
gst_df = (
    gst_df
    .rename(columns={'startTime': 'startTime_GST'})
    .drop(columns='linkedEvents')
)

# Keep only the rows whose linked activity ID refers to a CME
gst_final_df = gst_df.loc[gst_df['CME_ActivityID'].str.contains('CME')]

print("\nFinal GST DataFrame:")
print(gst_final_df.info())
print("\nFirst few rows:")
print(gst_final_df.head())

### Part 3: Merge and Clean the Data for Export

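Finally, we'll merge the CME and GST data on their linked activity IDs, calculate the time from each CME's start to the start of its linked GST (in hours), and export the result to CSV.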

In [None]:
# Merge CME and GST DataFrames.
# A row survives only when both sides reference each other: the GST's ID
# matches the CME's linked GST ID, and the GST's linked CME ID matches the
# CME's own ID.
merged_df = pd.merge(
    gst_final_df,
    cme_final_df,
    left_on=['gstID', 'CME_ActivityID'],
    right_on=['GST_ActivityID', 'cmeID']
)

# Verify row counts
print("Row counts:")
print(f"CME DataFrame: {len(cme_final_df)}")
print(f"GST DataFrame: {len(gst_final_df)}")
print(f"Merged DataFrame: {len(merged_df)}")

# Calculate time difference in hours (GST start minus CME start)
merged_df['timeDiff'] = (merged_df['startTime_GST'] - merged_df['startTime_CME']).dt.total_seconds() / 3600

# Show time difference statistics
print("\nTime Difference Statistics (hours):")
print(merged_df['timeDiff'].describe())

# Display first few rows of final dataset
print("\nFirst few rows of merged data:")
print(merged_df.head())

# Export to CSV.
# to_csv() does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
output_path = "6-output/collected_data.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
merged_df.to_csv(output_path, index=False)
print(f"\nData exported to: {output_path}")