In [3]:
import requests
import pandas as pd
from datetime import datetime, timedelta

# Download one PSI snapshot per day for a fixed date range from the
# data.gov.sg API, flatten the readings into (pollutant, region, value, date)
# rows, and export the combined table to CSV.

# Step 1: Define the API endpoint for the PSI
url = "https://api.data.gov.sg/v1/environment/psi"

# Step 2: Define the date range (both ends are included by the loop below)
start_date = datetime(2023, 10, 1) #YYYY-MM-DD
end_date = datetime(2024, 9, 30) #YYYY-MM-DD

# Seconds to wait on a single request. Without a timeout, one stalled
# connection would block the whole loop indefinitely.
request_timeout = 30

# Destination of the combined CSV export
output_path = '/Users/yvonne/Downloads/psi_data.csv'

# Create an empty list to store DataFrames (one per API item with readings)
data_frames = []

# Step 3: Loop through the date range, one request per day.
# A Session reuses the underlying HTTP connection across the daily requests.
with requests.Session() as session:
    current_date = start_date
    while current_date <= end_date:
        # Format date in correct ISO format
        date_time_str = current_date.strftime("%Y-%m-%dT14:00:00")  # Using 14:00 for PSI data
        params = {
            "date_time": date_time_str
        }

        # Advance here, once, so that every `continue` below safely moves on
        # to the next day instead of risking an endless loop.
        current_date += timedelta(days=1)

        # Step 4: Send a GET request to the API
        try:
            response = session.get(url, params=params, timeout=request_timeout)
        except requests.RequestException as exc:
            # Best effort: a transient network error should cost one day of
            # data, not everything collected so far.
            print(f"Request failed for {date_time_str}: {exc}")
            continue

        # Step 5: Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to fetch data for {date_time_str}. Status code: {response.status_code}")
            continue

        try:
            json_data = response.json()
        except ValueError:
            # requests raises a ValueError subclass when the body is not JSON
            print(f"Invalid JSON returned for {date_time_str}.")
            continue

        # Check if 'items' is available to avoid missing data
        items = json_data.get("items", [])
        if not items:
            print(f"No data returned for {date_time_str}.")
            continue

        # Step 6: Parse the JSON to extract PSI readings
        for item in items:
            timestamp = item.get("timestamp")
            readings = item.get("readings", {})
            if not readings:  # Only proceed if there are readings
                continue
            if timestamp is None:
                # No timestamp means no date to attach to the rows
                print(f"Skipping readings without a timestamp for {date_time_str}.")
                continue

            # The date is the same for every row of this item: parse it once
            reading_date = pd.to_datetime(timestamp).date()

            # Create a row for each pollutant's readings per region
            psi_data = []
            for pollutant, values in readings.items():
                for region, value in values.items():
                    psi_data.append({
                        'pollutant': pollutant,
                        'region': region,
                        'value': value,
                        'date': reading_date
                    })

            # Append the DataFrame for this item to the list
            if psi_data:
                psi_df = pd.DataFrame(psi_data)
                data_frames.append(psi_df)

# Step 7: Combine all DataFrames into one
if data_frames:
    combined_df = pd.concat(data_frames, ignore_index=True)

    # Preview the DataFrame
    print(combined_df.head())
    # info() prints its own report and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    combined_df.info()
    combined_df.to_csv(output_path, index=False)  # Save to CSV file
else:
    print("No data collected.")

      pollutant    region  value        date
0  o3_sub_index      west    8.0  2023-10-01
1  o3_sub_index  national   15.0  2023-10-01
2  o3_sub_index      east    8.0  2023-10-01
3  o3_sub_index   central   15.0  2023-10-01
4  o3_sub_index     south   10.0  2023-10-01
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22272 entries, 0 to 22271
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pollutant  22272 non-null  object 
 1   region     22272 non-null  object 
 2   value      22272 non-null  float64
 3   date       22272 non-null  object 
dtypes: float64(1), object(3)
memory usage: 696.1+ KB
None
