<h1 style="font-size:20px">Team Generation</h1>
<ul style="font-size:20px">
  <li>Item One</li>
  <li>Item Two</li>
  <li>Item Three</li>
</ul>

# Import Libraries

In [1]:
import copy
from datetime import datetime
from pathlib import Path
from pprint import pprint as pp

import matplotlib.pyplot as plt
import pandas as pd
import requests
from dateutil.relativedelta import relativedelta

# Display floats with thousands separators and two decimal places in rendered frames.
pd.set_option("display.float_format", "{:,.2f}".format)

In [2]:
class Downloader():
    """Download every child dataset of a data.gov.sg collection as CSV files.

    Constructing an instance immediately fetches the collection metadata
    (a network call) and caches the child dataset ids in ``child_datasets``.
    ``nfiles`` and ``nrecords`` accumulate over every download made through
    the instance.

    Parameters
    ----------
    collection_id : int
        Id of the collection whose child datasets are downloaded (default 189).
    output_dir : str
        Directory the CSV files are written to; created on demand.
    timeout : float
        Timeout, in seconds, passed to every HTTP request.
    """

    def __init__(self, collection_id=189, output_dir="DataLake", timeout=60):
        self.collection_id = collection_id
        self.output_dir = output_dir
        self.timeout = timeout
        self.nrecords = 0
        self.nfiles = 0
        # Position of the dataset currently handled by execute(); kept as an
        # attribute for backward compatibility, no method depends on it.
        self.index = 0
        self.getMetadata()

    def getMetadata(self):
        """Fetch the collection metadata and cache its child dataset ids.

        Returns
        -------
        The ``childDatasets`` entry of the metadata response, also stored on
        ``self.child_datasets``.

        Raises
        ------
        requests.HTTPError
            If the metadata endpoint answers with an HTTP error status.
        """
        url = "https://api-production.data.gov.sg/v2/public/api/collections/{}/metadata".format(self.collection_id)

        response = requests.get(url, timeout=self.timeout)
        response.raise_for_status()
        child_datasets = response.json()['data']['collectionMetadata']['childDatasets']

        self.child_datasets = child_datasets
        return child_datasets

    @staticmethod
    def _count_records(content):
        """Return the number of data lines in raw CSV bytes (header excluded)."""
        lines = content.split(b"\n")
        # A trailing newline yields one empty final element; drop it so files
        # with and without a final newline are counted the same way.
        if not lines[-1].strip():
            lines.pop()
        return max(len(lines) - 1, 0)

    def download_dataset(self, child_id):
        """Download one child dataset and save it as ``hdb_data_<child_id>.csv``.

        Parameters
        ----------
        child_id : str
            Dataset id used in the poll-download URL and in the file name.

        Raises
        ------
        RuntimeError
            If the poll-download response carries a non-zero ``code``.
        requests.HTTPError
            If the file download answers with an HTTP error status.
        """
        poll_url = f"https://api-open.data.gov.sg/v1/public/api/datasets/{child_id}/poll-download"
        print(poll_url)

        resp = requests.get(poll_url, timeout=self.timeout)
        data = resp.json()

        if data.get("code") != 0:
            raise RuntimeError(f"Download failed: {data.get('errorMsg') or data}")

        download_url = data["data"]["url"]

        file_resp = requests.get(download_url, timeout=self.timeout)
        # Without this check an error page would be saved and counted as CSV data.
        file_resp.raise_for_status()
        content = file_resp.content

        output_dir = Path(self.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_filename = output_dir / f"hdb_data_{child_id}.csv"
        with open(output_filename, "wb") as f:
            f.write(content)

        nrecords = self._count_records(content)
        self.nrecords += nrecords
        self.nfiles += 1
        # Numbered from the running file counter so the method also works when
        # called directly rather than through execute().
        print(f"Downloading File#..: {self.nfiles}")
        print(f"Download Child ID..: {child_id}")
        print(f"Saved to...........: {output_filename}")
        print(f"Number of records..: {nrecords}")
        print("-" * 100)

    def execute(self):
        """Download every child dataset cached by the constructor, in order."""
        for index, child_id in enumerate(self.child_datasets):
            self.index = index
            self.download_dataset(child_id)


# Execution

In [3]:
# Create the downloader and download every child dataset of the collection.
downloader = Downloader()
downloader.execute()
# Totals accumulated by the downloader over all downloaded files.
print(f"Total Number of File......: {downloader.nfiles} downloaded")
print(f"Total Number of Records...: {downloader.nrecords} downloaded")

https://api-open.data.gov.sg/v1/public/api/datasets/d_8b84c4ee58e3cfc0ece0d773c8ca6abc/poll-download
Downloading File#..: 1
Download Child ID..: d_8b84c4ee58e3cfc0ece0d773c8ca6abc
Saved to...........: DataLake/hdb_data_d_8b84c4ee58e3cfc0ece0d773c8ca6abc.csv
Number of records..: 220597
----------------------------------------------------------------------------------------------------
https://api-open.data.gov.sg/v1/public/api/datasets/d_43f493c6c50d54243cc1eab0df142d6a/poll-download
Downloading File#..: 2
Download Child ID..: d_43f493c6c50d54243cc1eab0df142d6a
Saved to...........: DataLake/hdb_data_d_43f493c6c50d54243cc1eab0df142d6a.csv
Number of records..: 369651
----------------------------------------------------------------------------------------------------
https://api-open.data.gov.sg/v1/public/api/datasets/d_2d5ff9ea31397b66239f245f57751537/poll-download
Downloading File#..: 3
Download Child ID..: d_2d5ff9ea31397b66239f245f57751537
Saved to...........: DataLake/hdb_data_d_2d5ff

# System Level Verification

In [4]:
# Cross-check the record counts with shell tools (IPython `!` output capture):
# for each downloaded CSV, drop lines containing the header text and count the
# remaining lines with `wc -l`.
# NOTE(review): the dataset ids below are hard-coded rather than read from
# downloader.child_datasets — confirm they still match the collection.
f1 = !cat DataLake/hdb_data_d_8b84c4ee58e3cfc0ece0d773c8ca6abc.csv | grep -v "month,town,flat_type" | wc -l
f2 = !cat DataLake/hdb_data_d_43f493c6c50d54243cc1eab0df142d6a.csv | grep -v "month,town,flat_type" | wc -l
f3 = !cat DataLake/hdb_data_d_2d5ff9ea31397b66239f245f57751537.csv | grep -v "month,town,flat_type" | wc -l
f4 = !cat DataLake/hdb_data_d_ebc5ab87086db484f88045b47411ebc5.csv | grep -v "month,town,flat_type" | wc -l
f5 = !cat DataLake/hdb_data_d_ea9ed51da2787afaf8e51f827c304208.csv | grep -v "month,town,flat_type" | wc -l

# Each captured result is indexed at [0] for its first output line, which is
# stripped and parsed as the integer count.
counts = [f1, f2, f3, f4, f5]
counts = [int(count[0].strip()) for count in counts]
print(f"All Counts.....: ",counts)
print(f"Total Counts...: {sum(counts)} records" )

All Counts.....:  [220597, 369651, 52203, 287196, 37153]
Total Counts...: 966800 records


# Combine Raw Files From DataLake
- Write the combined data to Staging/Main.csv

In [5]:
def combineDownloadedFiles(child_datasets=None, datalake_dir="DataLake", output_path="Staging/Main.csv"):
    """Merge the downloaded per-dataset CSV files into one month-indexed frame.

    Each ``hdb_data_<child_id>.csv`` file in ``datalake_dir`` is read, its
    ``remaining_lease`` column (present only in some files) is dropped, and the
    frames are concatenated. The ``month`` column is parsed as ``%Y-%m`` into a
    ``year_month`` datetime index, rows are sorted by it, and the result is
    written to ``output_path``.

    Parameters
    ----------
    child_datasets : iterable of str, optional
        Dataset ids to combine. Defaults to ``downloader.child_datasets`` from
        the notebook namespace.
    datalake_dir : str
        Directory holding the downloaded CSV files.
    output_path : str
        Destination of the combined CSV; its parent directory is created if
        missing.

    Returns
    -------
    pandas.DataFrame
        The combined frame, indexed by ``year_month``.

    Raises
    ------
    ValueError
        If there are no child datasets to combine.
    """
    if child_datasets is None:
        child_datasets = downloader.child_datasets
    child_datasets = list(child_datasets)
    if not child_datasets:
        raise ValueError("No child datasets to combine")

    frames = []
    for child_id in child_datasets:
        frame = pd.read_csv(Path(datalake_dir) / f"hdb_data_{child_id}.csv")
        # Only some files carry this column; dropping it aligns the schemas.
        frames.append(frame.drop(columns="remaining_lease", errors="ignore"))

    total_records = sum(len(frame) for frame in frames)

    hdb_data = pd.concat(frames, ignore_index=True)
    hdb_data["year_month"] = pd.to_datetime(hdb_data["month"], format="%Y-%m")
    # A stable sort keeps the original file order within each month, so the
    # output is reproducible between runs.
    hdb_data = (
        hdb_data
        .sort_values("year_month", kind="mergesort")
        .set_index("year_month")
        .drop(columns="month")
    )

    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    hdb_data.to_csv(output_path)
    print(f"Combined File Written in {output_path}")
    print("Total: ", total_records)
    print()

    return hdb_data

# Build the combined frame (the call also writes it to disk) and keep it for the cells below.
hdb_data = combineDownloadedFiles()

Combined File Written in Staging/Main.csv
Total:  966800



# Show Sample Records

In [6]:
# Bare expression so the notebook renders the combined frame as the cell output.
hdb_data

Unnamed: 0_level_0,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.00,IMPROVED,1977,9000.00
1990-01-01,KALLANG/WHAMPOA,3 ROOM,44,BENDEMEER RD,04 TO 06,63.00,STANDARD,1981,31400.00
1990-01-01,KALLANG/WHAMPOA,3 ROOM,20,ST. GEORGE'S RD,04 TO 06,67.00,NEW GENERATION,1984,66500.00
1990-01-01,KALLANG/WHAMPOA,3 ROOM,14,KG ARANG RD,04 TO 06,103.00,NEW GENERATION,1984,77000.00
1990-01-01,KALLANG/WHAMPOA,3 ROOM,46,OWEN RD,01 TO 03,68.00,NEW GENERATION,1982,58000.00
...,...,...,...,...,...,...,...,...,...
2025-12-01,BEDOK,5 ROOM,139,BEDOK NTH AVE 3,10 TO 12,117.00,Standard,1979,710000.00
2025-12-01,CENTRAL AREA,3 ROOM,635,VEERASAMY RD,04 TO 06,72.00,Model A,1985,550000.00
2025-12-01,PUNGGOL,4 ROOM,308B,PUNGGOL WALK,10 TO 12,92.00,Premium Apartment,2016,715000.00
2025-12-01,BEDOK,5 ROOM,516,BEDOK NTH AVE 2,07 TO 09,119.00,Improved,1979,700000.00


# Save File into Staging

In [7]:
# Write to the same "Staging" directory that combineDownloadedFiles uses; the
# lowercase spelling points at a different directory on case-sensitive filesystems.
hdb_data.to_csv("Staging/Main.csv")