In [11]:
# Updates in v2.0
# - Readings are now sorted by timestamp (ascending) before being loaded.

In [4]:
!pip install sqlalchemy
!pip install psycopg2
!pip install sqlalchemy_utils

Collecting sqlalchemy_utils
  Downloading SQLAlchemy_Utils-0.41.2-py3-none-any.whl.metadata (4.2 kB)
Downloading SQLAlchemy_Utils-0.41.2-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sqlalchemy_utils
Successfully installed sqlalchemy_utils-0.41.2


In [14]:
import os
from datetime import datetime, timedelta

import pandas as pd
import requests
from sqlalchemy import create_engine, text

# Configuration constants.
# Database settings can be overridden through PM25_DB_* environment variables so
# that real credentials do not have to be saved in the notebook; the fallback
# values are the original local-development settings.
API_URL = "https://api-open.data.gov.sg/v2/real-time/api/pm25"
DB_USER = os.environ.get('PM25_DB_USER', 'postgres')
DB_PASS = os.environ.get('PM25_DB_PASS', 'admin')
DB_HOST = os.environ.get('PM25_DB_HOST', 'localhost')
DB_PORT = os.environ.get('PM25_DB_PORT', '5432')
DB_NAME = os.environ.get('PM25_DB_NAME', 'data_gov_project')
START_DATE = datetime(2024, 11, 11)  # First day to fetch (inclusive)
END_DATE = datetime(2024, 11, 17)  # Last day to fetch (inclusive)

def fetch_pm25_data_for_date(api_url, date, timeout=30):
    """Fetch PM2.5 data for a specific date.

    Args:
        api_url: Base URL of the PM2.5 endpoint.
        date: Value sent as the ``date`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(items, region_metadata)`` read from the ``data`` object of
        the JSON response (each is an empty list when the key is absent), or
        ``(None, None)`` when the request fails, the status code is not 200,
        or the body is not a JSON object. Failures are reported on stdout.
    """
    try:
        # Let requests build and encode the query string. The timeout keeps a
        # stalled connection from hanging the whole run.
        response = requests.get(api_url, params={"date": date}, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching data for {date}: {e}")
        return None, None

    if response.status_code != 200:
        print(f"Failed to fetch data for {date}. Status code: {response.status_code}")
        return None, None

    try:
        json_data = response.json()
    except (ValueError, requests.RequestException) as e:
        # A 200 response whose body is not valid JSON.
        print(f"Error fetching data for {date}: {e}")
        return None, None

    if not isinstance(json_data, dict):
        print(f"Error fetching data for {date}: unexpected response format")
        return None, None

    payload = json_data.get("data") or {}
    if not isinstance(payload, dict):
        print(f"Error fetching data for {date}: unexpected response format")
        return None, None

    return payload.get("items", []), payload.get("regionMetadata", [])

def process_items(items, region_metadata):
    """Flatten fetched items into one row per (timestamp, region) reading.

    Args:
        items: Iterable of dicts, each with a 'timestamp' and a 'readings'
            dict whose 'pm25_one_hourly' entry maps region name -> value.
        region_metadata: Iterable of dicts with a 'name' and a 'labelLocation'
            dict holding 'latitude' and 'longitude'.

    Returns:
        A DataFrame with columns region, pm25_value, timestamp, latitude and
        longitude. Regions without matching metadata get missing coordinates.
        The frame is empty (but still has these columns) when there are no
        readings.
    """
    columns = ['region', 'pm25_value', 'timestamp', 'latitude', 'longitude']

    # Index the metadata by lower-cased name once instead of rescanning the
    # list for every reading. setdefault keeps the first entry for a name,
    # which is what a first-match scan would pick. Entries without a usable
    # name are skipped rather than raising.
    metadata_by_name = {}
    for entry in region_metadata or []:
        name = entry.get('name') if isinstance(entry, dict) else None
        if isinstance(name, str):
            metadata_by_name.setdefault(name.lower(), entry)

    # Collect plain records and build the DataFrame once at the end; creating
    # and concatenating one single-row DataFrame per reading is much slower.
    records = []
    for item in items or []:
        timestamp = item.get('timestamp')
        # 'or {}' also covers keys that are present but null.
        readings = (item.get('readings') or {}).get('pm25_one_hourly') or {}

        for region, value in readings.items():
            region_info = metadata_by_name.get(region.lower(), {})
            location = region_info.get('labelLocation') or {}
            records.append({
                'region': region,
                'pm25_value': value,
                'timestamp': timestamp,
                'latitude': location.get('latitude'),
                'longitude': location.get('longitude'),
            })

    return pd.DataFrame(records, columns=columns)

def load_data_to_postgres(data_frame):
    """Append the provided DataFrame to the 'pm25_readings' table.

    Args:
        data_frame: DataFrame whose rows are appended via ``to_sql``.

    Success or failure is reported on stdout; database errors are caught and
    printed rather than raised.
    """
    engine = create_engine(f'postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}')
    try:
        data_frame.to_sql('pm25_readings', engine, if_exists='append', index=False)
        print(f"Successfully loaded {len(data_frame)} records to PostgreSQL.")
    except Exception as e:
        print(f"Error loading data into PostgreSQL: {e}")
    finally:
        # A fresh Engine is built on every call, so release its connection
        # pool explicitly instead of leaving it for garbage collection.
        engine.dispose()

def verify_data_in_db():
    """
    Prints the number of rows in the 'pm25_readings' table so a load can be verified.

    Errors are caught and reported on stdout rather than raised.
    """
    engine = create_engine(f'postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}')
    try:
        with engine.connect() as connection:
            # scalar() returns the first column of the first row directly.
            count = connection.execute(text("SELECT COUNT(*) FROM pm25_readings")).scalar()
            print(f"Total records in 'pm25_readings' table: {count}")  # Show count of rows
    except Exception as e:
        print(f"Error verifying data in PostgreSQL: {e}")
    finally:
        # A fresh Engine is built on every call, so release its connection
        # pool explicitly instead of leaving it for garbage collection.
        engine.dispose()

def verify_database_connection():
    """Verifies the database connection and reports the outcome on stdout.

    Returns:
        True when a connection could be opened, False otherwise, so callers
        can decide whether it is worth continuing.
    """
    engine = create_engine(f'postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}')
    try:
        with engine.connect():
            print("Database connection successful")
        return True
    except Exception as e:
        print(f"Error connecting to PostgreSQL: {e}")
        return False
    finally:
        # A fresh Engine is built on every call, so release its connection
        # pool explicitly instead of leaving it for garbage collection.
        engine.dispose()

def create_table():
    """Create the 'pm25_readings' table in the database if it does not exist.

    Errors are caught and reported on stdout rather than raised.
    """
    engine = create_engine(f'postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}')
    try:
        # engine.begin() commits when the block exits cleanly. With a plain
        # engine.connect() and no explicit commit, SQLAlchemy 2.x rolls the
        # transaction back on close, so the CREATE TABLE would be discarded.
        with engine.begin() as connection:
            connection.execute(text("""
                CREATE TABLE IF NOT EXISTS pm25_readings (
                    id SERIAL PRIMARY KEY,
                    timestamp TIMESTAMP,
                    region VARCHAR(255),
                    pm25_value REAL,
                    latitude REAL,
                    longitude REAL
                )
            """))
        print("Table created successfully.")
    except Exception as e:
        print(f"Error creating table: {e}")
    finally:
        # A fresh Engine is built on every call, so release its connection
        # pool explicitly instead of leaving it for garbage collection.
        engine.dispose()

def main():
    """Run the PM2.5 pipeline for each day from START_DATE through END_DATE.

    Checks the database connection and ensures the target table exists, then,
    one day at a time, fetches the readings, flattens them into a DataFrame,
    orders them by timestamp and appends them to PostgreSQL. Progress and
    failures are reported on stdout.
    """
    print("Starting the script...")

    verify_database_connection()  # Confirm the database is reachable
    create_table()  # Make sure the target table is present

    # Count of whole-day steps from START_DATE that do not go past END_DATE.
    day_count = (END_DATE - START_DATE).days + 1
    for day_offset in range(day_count):
        date_str = (START_DATE + timedelta(days=day_offset)).strftime("%Y-%m-%d")
        print(f"Fetching data for {date_str}...")

        items, region_metadata = fetch_pm25_data_for_date(API_URL, date_str)
        if not (items and region_metadata):
            print(f"Failed to fetch data for {date_str}.")
            continue

        readings = process_items(items, region_metadata)
        if readings.empty:
            print(f"No data collected for {date_str}.")
            continue

        # Parse the timestamps, then load the rows in chronological order.
        readings['timestamp'] = pd.to_datetime(readings['timestamp'])
        readings = readings.sort_values(by='timestamp', ascending=True)
        load_data_to_postgres(readings)

    print("Script completed.")

# Run the pipeline only when this code is executed directly (as a script or a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Starting the script...
Database connection successful
Table created successfully.
Fetching data for 2024-11-11...
Successfully loaded 120 records to PostgreSQL.
Fetching data for 2024-11-12...
Successfully loaded 120 records to PostgreSQL.
Fetching data for 2024-11-13...
Successfully loaded 120 records to PostgreSQL.
Fetching data for 2024-11-14...
Successfully loaded 120 records to PostgreSQL.
Fetching data for 2024-11-15...
Successfully loaded 120 records to PostgreSQL.
Fetching data for 2024-11-16...
Successfully loaded 120 records to PostgreSQL.
Fetching data for 2024-11-17...
Successfully loaded 120 records to PostgreSQL.
Script completed.
