### Import Required Libraries and Set Up Environment Variables

In [1]:
# Dependencies
import requests
import time
from dotenv import load_dotenv
import os
import pandas as pd
import json
import os
from datetime import datetime
## Load the NASA_API_KEY from the env file
load_dotenv()
NASA_API_KEY = os.getenv('NASA_API_KEY')

# Prints API key to ensure it is loaded correctly
print(os.getenv("NASA_API_KEY")) 

z3afBLJBtzh2onHmXA0KJ88TWtfVkzfEnCARzbg7


### CME Data

In [2]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for CMEs:
CME = "CME"

# Search for CMEs published between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for CME
query_url_CME = f"https://api.nasa.gov/DONKI/CME?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"


In [3]:
# Make a "GET" request for the CME URL and store it in a variable named cme_response
cme_response = requests.get(query_url_CME)

In [4]:
# Convert the response variable to json and store it as a variable named cme_json
cme_json = cme_response.json()

In [5]:
# Preview ONLY the first element from the cme_json list you created in JSON format
# Do NOT print out the entire list
# Use json.dumps with argument indent=4 to format data
print(json.dumps(cme_json[0], indent=4))

{
    "activityID": "2013-05-01T03:12:00-CME-001",
    "catalog": "M2M_CATALOG",
    "startTime": "2013-05-01T03:12Z",
    "instruments": [
        {
            "displayName": "SOHO: LASCO/C2"
        },
        {
            "displayName": "SOHO: LASCO/C3"
        },
        {
            "displayName": "STEREO A: SECCHI/COR2"
        },
        {
            "displayName": "STEREO B: SECCHI/COR2"
        }
    ],
    "sourceLocation": "",
    "activeRegionNum": null,
    "note": "",
    "submissionTime": "2013-08-07T16:54Z",
    "versionId": 1,
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/CME/2349/-1",
    "cmeAnalyses": [
        {
            "isMostAccurate": true,
            "time21_5": "2013-05-01T07:07Z",
            "latitude": 12.0,
            "longitude": -120.0,
            "halfAngle": 36.0,
            "speed": 860.0,
            "type": "C",
            "featureCode": "null",
            "imageType": null,
            "measurementTechnique": "null",
   

In [6]:
# Convert cme_json to a Pandas DataFrame 
# # Keep only the columns: activityID, startTime, linkedEvents
cme_df = pd.DataFrame(cme_json)[["activityID", "startTime", "linkedEvents"]]


In [7]:
# Notice that the linkedEvents column allows us to identify the corresponding GST
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to GSTs
cme_df.dropna(subset=["linkedEvents"], inplace=True)


In [8]:
# Initialize an empty list to store the expanded rows
expanded_rows = []

# Notice that the linkedEvents sometimes contains multiple events per row
# Write a nested for loop that iterates first over each row in the cme DataFrame (using the index)
# and then iterates over the values in 'linkedEvents' 
# and adds the elements individually to a list of dictionaries where each row is one element
# Iterate over each index in the DataFrame
# Iterate over each dictionary in the list
# Append a new dictionary to the expanded_rows list for each dictionary item and corresponding 'activityID' and 'startTime' value
# Create a new DataFrame from the expanded rows

for i in cme_df.index:
    activity_id = cme_df.loc[i, "activityID"]
    start_time = cme_df.loc[i, "startTime"]
    linked_events = cme_df.loc[i, "linkedEvents"]

    for event in linked_events:
        expanded_rows.append({
            "cmeID": activity_id,
            "startTime": start_time,
            "linkedEvent": event
        })

cme_expanded_df = pd.DataFrame(expanded_rows)

In [9]:
# Create a function called extract_activityID_from_dict that takes a dict as input such as in linkedEvents
# and verify below that it works as expected using one row from linkedEvents as an example
# Be sure to use a try and except block to handle errors
 # Log the error or print it for debugging

def extract_activityID_from_dict(input_dict):
        try:
                return input_dict.get("ActivityID", None)
        except (ValueError, TypeError, AttributeError):
                return None


print(cme_expanded_df.columns)

Index(['cmeID', 'startTime', 'linkedEvent'], dtype='object')


In [10]:
# Apply this function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'GST_ActivityID' using loc indexer:
cme_expanded_df["GST_ActivityID"] = cme_expanded_df["linkedEvent"].apply(extract_activityID_from_dict)

In [11]:
# Remove rows with missing GST_ActivityID, since we can't assign them to GSTs:
cme_expanded_df.dropna(subset=["GST_ActivityID"], inplace=True)

In [12]:
# print out the datatype of each column in this DataFrame:
print(cme_expanded_df.dtypes)


cmeID             object
startTime         object
linkedEvent       object
GST_ActivityID    object
dtype: object


In [13]:
# Convert the 'GST_ActivityID' column to string format 
cme_expanded_df["GST_ActivityID"] = cme_expanded_df["GST_ActivityID"].astype(str)

# Convert startTime to datetime format
cme_expanded_df.rename(columns={"startTime_CME": "startTime_CME", "cmeID": "cmeID"}, inplace=True)

# Rename startTime_CME to startTime and activityID to cmeID
cme_expanded_df.rename(columns={"startTime_CME": "startTime", "activityID": "cmeID"}, inplace=True)

# Drop linkedEvents
# dropped above when we created the expanded_rows list; breaks code otherwise

# Verify that all steps were executed correctly
print(cme_expanded_df.head())
print(cme_expanded_df.dtypes)

Empty DataFrame
Columns: [cmeID, startTime, linkedEvent, GST_ActivityID]
Index: []
cmeID             object
startTime         object
linkedEvent       object
GST_ActivityID    object
dtype: object


In [14]:
# We are only interested in CMEs related to GSTs so keep only rows where the GST_ActivityID column contains 'GST'
# use the method 'contains()' from the str library.  
cme_expanded_df = cme_expanded_df[cme_expanded_df["GST_ActivityID"].str.contains("GST")]

In [15]:
# Adding final check
print(f"Final CME dataframe shape: {cme_expanded_df.shape}")
print(cme_expanded_df.head())

Final CME dataframe shape: (0, 4)
Empty DataFrame
Columns: [cmeID, startTime, linkedEvent, GST_ActivityID]
Index: []


### GST Data

In [16]:
# Set the base URL to NASA's DONKI API:
base_url = "https://api.nasa.gov/DONKI/"

# Set the specifier for Geomagnetic Storms (GST):
GST = "GST"

# Search for GSTs between a begin and end date
startDate = "2013-05-01"
endDate   = "2024-05-01"

# Build URL for GST
query_url_GST = f"{base_url}{GST}?startDate={startDate}&endDate={endDate}&api_key={NASA_API_KEY}"

In [17]:
# Make a "GET" request for the GST URL and store it in a variable named gst_response
gst_response = requests.get(query_url_GST)

In [18]:
# Convert the response variable to json and store it as a variable named gst_json
gst_json = gst_response.json()

In [19]:
# Preview ONLY the first element from the gst_json list you created in JSON format
# Do NOT print out the entire list
# Use json.dumps with argument indent=4 to format data
print(json.dumps(gst_json[0], indent=4))

{
    "gstID": "2013-06-01T01:00:00-GST-001",
    "startTime": "2013-06-01T01:00Z",
    "allKpIndex": [
        {
            "observedTime": "2013-06-01T01:00Z",
            "kpIndex": 6.0,
            "source": "NOAA"
        }
    ],
    "link": "https://webtools.ccmc.gsfc.nasa.gov/DONKI/view/GST/326/-1",
    "linkedEvents": [
        {
            "activityID": "2013-05-31T15:45:00-HSS-001"
        }
    ],
    "submissionTime": "2013-07-15T19:26Z",
    "versionId": 1
}


In [20]:
# Convert gst_json to a Pandas DataFrame
# Keep only the columns: gstID, startTime, linkedEvents
gst_df = pd.DataFrame(gst_json)[["gstID", "startTime", "linkedEvents"]]  


In [21]:
# Notice that the linkedEvents column allows us to identify the corresponding CME
# Remove rows with missing 'linkedEvents' since we won't be able to assign these to CME
gst_df.dropna(subset=["linkedEvents"], inplace=True)

In [22]:
# Notice that the linkedEvents sometimes contains multiple events per row
# Use the explode method to ensure that each row is one element. Ensure to reset the index and drop missing values.
gst_df = gst_df.explode("linkedEvents").reset_index(drop=True)


In [23]:
# Apply the extract_activityID_from_dict function to each row in the 'linkedEvents' column (you can use apply() and a lambda function)
# and create a new column called 'CME_ActivityID' using loc indexer:
gst_df["CME_ActivityID"] = gst_df["linkedEvents"].apply(extract_activityID_from_dict)

# Remove rows with missing CME_ActivityID, since we can't assign them to CMEs:
gst_df.dropna(subset=["CME_ActivityID"], inplace=True)


In [24]:
# Convert the 'CME_ActivityID' column to string format 
gst_df["CME_ActivityID"] = gst_df["CME_ActivityID"].astype(str)

# Convert the 'gstID' column to string format
gst_df["gstID"] = gst_df["gstID"].astype(str) 

# Convert startTime to datetime format  
gst_df["startTime"] = pd.to_datetime(gst_df["startTime"])

# Rename startTime to startTime_GST
gst_df.rename(columns={"startTime": "startTime_GST"}, inplace=True) 

# Drop linkedEvents
if "linkedEvents" in gst_df.columns:
    gst_df.drop(columns=["linkedEvents"], inplace=True)
else:
    print("Warning: 'linkedEvents' column not found, skipping drop.")

# Verify that all steps were executed correctly
print(gst_df.head())
print(gst_df.dtypes)


Empty DataFrame
Columns: [gstID, startTime_GST, CME_ActivityID]
Index: []
gstID                     object
startTime_GST     datetime64[ns]
CME_ActivityID            object
dtype: object


In [25]:
# We are only interested in GSTs related to CMEs so keep only rows where the CME_ActivityID column contains 'CME'
# use the method 'contains()' from the str library.  
gst_df = gst_df[gst_df["CME_ActivityID"].str.contains("CME")]

### Merge both datatsets

In [32]:
# Now merge both datasets using 'gstID' and 'CME_ActivityID' for gst and 'GST_ActivityID' and 'cmeID' for cme. Use the 'left_on' and 'right_on' specifiers.
merged_df = gst_df.merge(
                    cme_expanded_df,
                    left_on="CME_ActivityID",
                    right_on="GST_ActivityID",
                    how="inner")

In [33]:
# Verify that the new DataFrame has the same number of rows as cme and gst
print(f"Merged dataset shape: {merged_df.shape}")
print(merged_df.head())

Merged dataset shape: (0, 7)
Empty DataFrame
Columns: [gstID, startTime_GST, CME_ActivityID, cmeID, startTime, linkedEvent, GST_ActivityID]
Index: []


### Computing the time it takes for a CME to cause a GST

In [28]:
# Compute the time diff between startTime_GST and startTime_CME by creating a new column called `timeDiff`.


In [29]:
# Use describe() to compute the mean and median time 
# that it takes for a CME to cause a GST. 


### Exporting data in csv format

In [30]:
# Export data to CSV without the index
