In [5]:
# Updates in v3.0
# from v2.0, uses "_links", "start" and "next" at the bottom of JSON file to get the next link and page
# can test for smallet dataset with max_pages=10

# Updates in v4.0
# removed create_table function
# edited column names to fit postgreSQL table design
# created resale_id -> combined_df['resale_id'] = range(1, len(combined_df) + 1)

In [4]:
!pip install sqlalchemy
!pip install psycopg2
!pip install sqlalchemy_utils

Collecting sqlalchemy_utils
  Downloading SQLAlchemy_Utils-0.41.2-py3-none-any.whl.metadata (4.2 kB)
Downloading SQLAlchemy_Utils-0.41.2-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: sqlalchemy_utils
Successfully installed sqlalchemy_utils-0.41.2


In [19]:
import requests  # For sending HTTP requests
import pandas as pd  # For data manipulation
from datetime import datetime, timedelta  # For handling dates
from dateutil.relativedelta import relativedelta
from sqlalchemy import create_engine, text  # For database operations

# Define constants
API_URL = "https://data.gov.sg/api/action/datastore_search?resource_id=d_8b84c4ee58e3cfc0ece0d773c8ca6abc"  # API URL
DB_USER = 'postgres'  # Update with your PostgreSQL username
DB_PASS = 'admin'  # Update with your PostgreSQL password
DB_HOST = 'localhost'  # Update with your database host
DB_PORT = '5432'  # Update with your database port
DB_NAME = 'data_gov_project'  # Update with your PostgreSQL database name
# START_DATE = datetime(2024, 9, 1)  # Adjust as needed (yyyy, mm, dd, hh, mm)
# END_DATE = datetime(2024, 10, 31)  # Adjust as needed (yyyy, mm, dd, hh, mm)

def fetch_data_from_api(api_url, max_pages=5):
    """Fetch data from the API, stopping after a set number of pages for testing."""
    data_frames = []  # List to store DataFrames for each batch of data
    next_url = api_url
    page_count = 0  # Track the number of pages fetched

    while next_url and page_count < max_pages:
        response = requests.get(next_url)
        if response.status_code == 200:
            json_data = response.json()
            records = json_data.get("result", {}).get("records", [])
            if records:
                process_items(records, data_frames)

            # Check for the next page link
            next_link = json_data.get("result", {}).get("_links", {}).get("next")
            if next_link:
                next_url = f"https://data.gov.sg{next_link}"
            else:
                next_url = None

            page_count += 1  # Increment page count
        else:
            print(f"Failed to fetch data. Status code: {response.status_code}")
            next_url = None

    return data_frames
    
def process_items(items, data_frames):
    """Process the fetched items and store data in DataFrames."""
    for item in items:
        # Extract relevant fields from the item
        record = {
            'resale_date': item.get('month'),
            'town_name': item.get('town'),
            'flat_type': item.get('flat_type'),
            'block_no': item.get('block'),
            'street_name': item.get('street_name'),
            'storey_range': item.get('storey_range'),
            'floor_area_sqm': item.get('floor_area_sqm'),
            'flat_model': item.get('flat_model'),
            'lease_commence_year': item.get('lease_commence_date'),
            'remaining_lease': item.get('remaining_lease'),
            'resale_price': item.get('resale_price'),
        }

        # Create a DataFrame from the record and append it to the list
        df = pd.DataFrame([record])
        data_frames.append(df)

def load_data_to_postgres(data_frame):
    """Load the provided DataFrame into the PostgreSQL database."""
    engine = create_engine(f'postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}')
    try:
        data_frame.to_sql('resale_flat_txn', engine, if_exists='append', index=False)
        print(f"Successfully loaded {len(data_frame)} records to PostgreSQL.")
    except Exception as e:
        print(f"Error loading data into PostgreSQL: {e}")

def verify_data_in_db():
    """Verifies the data in the PostgreSQL database."""
    engine = create_engine(f'postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}')
    try:
        with engine.connect() as connection:
            result = connection.execute(text("SELECT COUNT(*) FROM resale_flat_txn"))
            count = result.fetchone()[0]
            print(f"Total records in 'resale_flat_txn' table: {count}")
    except Exception as e:
        print(f"Error verifying data in PostgreSQL: {e}")

def main():
    """Main function to execute the script."""
    print("Starting the script...")

    data_frames = fetch_data_from_api(API_URL)  # Fetch the data
    print(f"Fetched data frames: {len(data_frames)}")

    if data_frames:
        combined_df = pd.concat(data_frames, ignore_index=True)  # Combine the DataFrames

        # Clean up DataFrame (if necessary)
        combined_df['resale_date'] = pd.to_datetime(combined_df['resale_date'], errors='coerce')  # Convert month to datetime
        combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]  # Remove duplicates
        combined_df = combined_df[['resale_date', 'town_name', 'flat_type', 'block_no', 'street_name', 
                                   'storey_range', 'floor_area_sqm', 'flat_model', 'lease_commence_year', 
                                   'remaining_lease', 'resale_price']]  # Rearrange columns

        combined_df['resale_id'] = range(1, len(combined_df) + 1)
        load_data_to_postgres(combined_df)  # Load the data into PostgreSQL

        verify_data_in_db()  # Verify the data in the database
    else:
        print("No data collected.")
    
    print("Script completed.")

if __name__ == "__main__":
    main()

Starting the script...
Fetched data frames: 500
Successfully loaded 500 records to PostgreSQL.
Total records in 'resale_flat_txn' table: 500
Script completed.
