In [1]:
import pandas as pd
import requests
from pprint import pprint as pp
import matplotlib.pyplot as plt
import copy

from datetime import datetime
from dateutil.relativedelta import relativedelta

pd.set_option("display.float_format", "{:,.2f}".format)

In [2]:
class Downloader():
    """Download every child dataset of a data.gov.sg collection to local CSV files.

    Constructing an instance immediately fetches the collection metadata and
    stores the list of child dataset ids on ``self.child_datasets``.
    """

    # Collection whose child datasets are downloaded.
    COLLECTION_ID = 189
    METADATA_URL = "https://api-production.data.gov.sg/v2/public/api/collections/{}/metadata"
    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 60

    def __init__(self):
        # Position of the dataset currently being downloaded; set by execute().
        # Initialised here so download_dataset() can also be called on its own.
        self.index = 0
        self.getMetadata()

    def getMetadata(self):
        """Fetch the collection metadata and return the list of child dataset ids.

        Returns:
            The ``childDatasets`` list from the metadata response (also stored
            on ``self.child_datasets``).

        Raises:
            requests.HTTPError: if the metadata endpoint answers with an error status.
        """
        url = self.METADATA_URL.format(self.COLLECTION_ID)

        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        child_datasets = response.json()['data']['collectionMetadata']['childDatasets']

        self.child_datasets = child_datasets
        return child_datasets

    def download_dataset(self, child_id):
        """Download one child dataset and save it as ``hdb_data_<child_id>.csv``.

        Args:
            child_id: dataset id as listed in ``self.child_datasets``.

        Raises:
            requests.HTTPError: if any request answers with an error status.
            RuntimeError: if the poll-download response carries a non-zero ``code``.
        """
        poll_url = f"https://api-open.data.gov.sg/v1/public/api/datasets/{child_id}/poll-download"
        print(poll_url)

        resp = requests.get(poll_url, timeout=self.REQUEST_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()

        if data.get("code") != 0:
            raise RuntimeError(f"Download failed: {data.get('errorMsg') or data}")

        download_url = data["data"]["url"]

        output_filename = f"hdb_data_{child_id}.csv"
        file_resp = requests.get(download_url, timeout=self.REQUEST_TIMEOUT)
        file_resp.raise_for_status()
        with open(output_filename, "wb") as f:
            f.write(file_resp.content)

        # Every line except the header is one record. splitlines() does not
        # produce a trailing empty entry, so the count is correct whether or
        # not the file ends with a newline.
        nrecords = max(len(file_resp.content.splitlines()) - 1, 0)
        print(f"Downloading File#..: {self.index+1}")
        print(f"Download Child ID..: {child_id}")
        print(f"Saved to...........: {output_filename}")
        print(f"Number of records..: {nrecords}")
        print("-" * 80)

    def execute(self):
        """Refresh the metadata and download every child dataset in order."""
        child_datasets = self.getMetadata()
        for index, child_id in enumerate(child_datasets):
            self.index = index
            self.download_dataset(child_id)


In [3]:
def create_unemployment_rate_mapping(csv_path="unemployment_rate.csv",
                                     pad_start_years=(1990, 1991),
                                     pad_end_years=(2025,)):
    """Build a year -> unemployment-rate lookup from a wide CSV.

    The CSV is expected to have a ``DataSeries`` label column followed by one
    column per year; only the first data row is used.

    Years listed in ``pad_start_years`` that are missing from the file receive
    the rate of the earliest observed year, and years in ``pad_end_years`` that
    are missing receive the rate of the latest observed year. Years already
    present in the file are never overwritten by padding.

    Args:
        csv_path: path of the CSV file to read.
        pad_start_years: years to back-fill with the earliest observed rate.
        pad_end_years: years to forward-fill with the latest observed rate.

    Returns:
        A float Series named ``rate`` indexed by integer ``year``, sorted ascending.

    Raises:
        ValueError: if the file contains no data row or no year columns.
    """
    wide = pd.read_csv(csv_path)
    if wide.empty:
        raise ValueError(f"{csv_path} contains no data rows")

    first_row = wide.iloc[0].drop("DataSeries")
    if first_row.empty:
        raise ValueError(f"{csv_path} contains no year columns")

    # Column headers are the years; they are read as text and may carry stray spaces.
    years = [int(str(label).strip()) for label in first_row.index]
    values = first_row.astype(float).tolist()
    observed = sorted(zip(years, values))

    year_to_rate = dict(observed)
    earliest_rate = observed[0][1]
    latest_rate = observed[-1][1]

    # setdefault keeps an observed value when a padding year is already in the file.
    for year in pad_start_years:
        year_to_rate.setdefault(int(year), earliest_rate)
    for year in pad_end_years:
        year_to_rate.setdefault(int(year), latest_rate)

    map_year_to_rate = pd.Series(year_to_rate, name="rate", dtype=float).sort_index()
    map_year_to_rate.index.name = "year"
    return map_year_to_rate

In [4]:
def create_inflation_rate_mapping(csv_path="Singapore-Inflation-Rate.csv"):
    """Build a year -> inflation-rate lookup from a CSV with ``year`` and ``rate`` columns.

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        A float Series named ``rate`` indexed by integer ``year`` (ascending).
        If a year occurs more than once, the first non-null rate in file order is used.
    """
    table = pd.read_csv(csv_path)

    # Parse with an explicit format so malformed year values fail loudly.
    years = pd.to_datetime(table["year"], format="%Y").dt.year
    rates = table["rate"].astype(float)

    # groupby sorts the year keys, so the result is ordered by year.
    map_year_to_rate = rates.groupby(years).first()
    return map_year_to_rate

In [5]:
# Constructing the Downloader calls getMetadata(), which fetches the child dataset ids.
downloader = Downloader()
downloader.child_datasets
# Uncomment to (re)download every child dataset to hdb_data_<id>.csv:
#downloader.execute()

222


['d_8b84c4ee58e3cfc0ece0d773c8ca6abc',
 'd_43f493c6c50d54243cc1eab0df142d6a',
 'd_2d5ff9ea31397b66239f245f57751537',
 'd_ebc5ab87086db484f88045b47411ebc5',
 'd_ea9ed51da2787afaf8e51f827c304208']

In [6]:
# Count the data rows of every downloaded file in pure Python (no shell needed),
# driven by the ids the downloader reported instead of a hand-copied list.
HEADER_MARKER = "month,town,flat_type"  # any line containing this is a header line

counts = []
for child_id in downloader.child_datasets:
    with open(f"hdb_data_{child_id}.csv", encoding="utf-8") as csv_file:
        counts.append(sum(1 for line in csv_file if HEADER_MARKER not in line))

print("All Counts.....: ", counts)
print("Total Counts...: ", sum(counts))

All Counts.....:  [220394, 369651, 52203, 287196, 37153]
Total Counts...:  966597


In [7]:
# Load every downloaded file, drop the optional "remaining_lease" column so all
# files share one schema, then stack them into a single frame indexed by sale month.
child_datasets = downloader.child_datasets

x = []        # first five rows of each file, kept as samples
counts = []   # number of rows per file
df_hdbs = []  # one full frame per file

for collection_id in child_datasets:
    df = pd.read_csv(f"hdb_data_{collection_id}.csv")
    if "remaining_lease" in df.columns:
        df = df.drop(columns="remaining_lease")
    x.append(df.iloc[list(range(5))])
    counts.append(len(df))
    df_hdbs.append(df)
print("Total: ", sum(counts))

hdb_data = (
    pd.concat(df_hdbs)
    .assign(year_month=lambda frame: pd.to_datetime(frame["month"], format="%Y-%m"))
    .sort_values("year_month")
    .set_index("year_month")
    .drop(columns="month")
)

hdb_data.to_csv("HDB_Resale_Price.csv")

hdb_data

Total:  966597


Unnamed: 0_level_0,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.00,IMPROVED,1977,9000.00
1990-01-01,KALLANG/WHAMPOA,3 ROOM,44,BENDEMEER RD,04 TO 06,63.00,STANDARD,1981,31400.00
1990-01-01,KALLANG/WHAMPOA,3 ROOM,20,ST. GEORGE'S RD,04 TO 06,67.00,NEW GENERATION,1984,66500.00
1990-01-01,KALLANG/WHAMPOA,3 ROOM,14,KG ARANG RD,04 TO 06,103.00,NEW GENERATION,1984,77000.00
1990-01-01,KALLANG/WHAMPOA,3 ROOM,46,OWEN RD,01 TO 03,68.00,NEW GENERATION,1982,58000.00
...,...,...,...,...,...,...,...,...,...
2025-12-01,SENGKANG,4 ROOM,184B,RIVERVALE CRES,13 TO 15,90.00,Model A,2003,560000.00
2025-12-01,BUKIT PANJANG,3 ROOM,454,FAJAR RD,04 TO 06,68.00,Model A,2015,435000.00
2025-12-01,BUKIT PANJANG,3 ROOM,106,GANGSA RD,07 TO 09,73.00,Model A,1988,465000.00
2025-12-01,TAMPINES,4 ROOM,859A,TAMPINES AVE 5,10 TO 12,104.00,Model A,1988,715000.00


# Transformer

In [8]:
class HDBDataTransformer:
    """Add derived columns and normalised category labels to an HDB resale frame.

    The frame handed to the constructor is modified in place. It must be
    indexed by sale date and contain the columns ``lease_commence_date``,
    ``floor_area_sqm``, ``resale_price``, ``flat_type`` and ``flat_model``.
    """

    # Long flat-type labels -> short codes.
    FLAT_TYPE_MAPPING = {
        "1 ROOM": '1RM',
        "2 ROOM": '2RM',
        "3 ROOM": '3RM',
        "4 ROOM": '4RM',
        "5 ROOM": '5RM',
        "EXECUTIVE": "EX",
        "MULTI GENERATION": "MG",
        "MULTI-GENERATION": "MG"}

    # Flat-model labels -> one canonical upper-case spelling per model.
    FLAT_MODEL_MAPPING = {
        '2-ROOM': '2-ROOM',
        '2-room': '2-ROOM',
        '3Gen': '3Gen',
        'APARTMENT': 'APARTMENT',
        'Adjoined flat': 'Adjoined flat',
        'Apartment': 'APARTMENT',
        'DBSS': 'DBSS',
        'IMPROVED': 'IMPROVED',
        'IMPROVED-MAISONETTE': 'IMPROVED-MAISONETTE',
        'Improved': 'IMPROVED',
        'Improved-Maisonette': 'IMPROVED-MAISONETTE',
        'MAISONETTE': 'MAISONETTE',
        'MODEL A': 'MODEL A',
        'MODEL A-MAISONETTE': 'MODEL A-MAISONETTE',
        'MULTI GENERATION': 'MULTI GENERATION',
        # Fixed: these two previously mapped to themselves, leaving two
        # spellings of the same model in the data.
        'Maisonette': 'MAISONETTE',
        'Model A': 'MODEL A',
        'Model A-Maisonette': 'MODEL A-MAISONETTE',
        'Model A2': 'Model A2',
        'Multi Generation': 'MULTI GENERATION',
        'NEW GENERATION': 'NEW GENERATION',
        'New Generation': 'NEW GENERATION',
        'PREMIUM APARTMENT': 'PREMIUM APARTMENT',
        'Premium Apartment': 'PREMIUM APARTMENT',
        'Premium Apartment Loft': 'Premium Apartment Loft',
        'Premium Maisonette': 'Premium Maisonette',
        'SIMPLIFIED': 'SIMPLIFIED',
        'STANDARD': 'STANDARD',
        'Simplified': 'SIMPLIFIED',
        'Standard': 'STANDARD',
        'TERRACE': 'TERRACE',
        'Terrace': 'TERRACE',
        'Type S1': 'SIMPLIFIED',
        'Type S2': 'SIMPLIFIED'}

    def __init__(self, hdb_data):
        self.hdb_data = hdb_data

    def calculate_remaining_lease_months_row(self, row, max_lease_years: int = 99) -> int:
        """Return the remaining lease, in months, for a single row.

        The lease is taken to start on 1 January of ``lease_commence_date``
        and to run for ``max_lease_years`` years; the row's index label is the
        sale date. Kept for row-wise use; execute() uses a vectorised equivalent.
        """
        sale_date = pd.to_datetime(row.name)  # row index = sale date
        lease_year = int(row["lease_commence_date"])

        lease_start = pd.Timestamp(lease_year, 1, 1)
        lease_end = lease_start + pd.DateOffset(years=max_lease_years)

        remaining_months = (lease_end.year - sale_date.year) * 12 + (lease_end.month - sale_date.month)
        return remaining_months

    def price_per_sqmxxx(self, resale_price, floor_area_sqm):
        """Scalar price per square metre; ``None`` when the area is zero.

        Not called anywhere in this class; kept so existing callers keep working.
        """
        if floor_area_sqm == 0:
            return None
        return resale_price / floor_area_sqm

    def price_per_sqm(self, row):
        """Row-wise price per square metre; ``None`` when the area is zero."""
        if row['floor_area_sqm'] == 0:
            return None
        return row['resale_price'] / row['floor_area_sqm']

    def _remaining_lease_months(self, max_lease_years: int = 99):
        """Vectorised form of calculate_remaining_lease_months_row for the whole frame."""
        hdb_data = self.hdb_data
        sale_dates = pd.DatetimeIndex(pd.to_datetime(hdb_data.index))
        lease_end_year = hdb_data["lease_commence_date"].astype(int).to_numpy() + max_lease_years
        # The lease ends in January (month 1) of lease_end_year.
        return (lease_end_year - sale_dates.year.to_numpy()) * 12 + (1 - sale_dates.month.to_numpy())

    def _price_per_sqm(self):
        """Vectorised price per square metre; NaN where the floor area is zero."""
        hdb_data = self.hdb_data
        area = hdb_data["floor_area_sqm"]
        return (hdb_data["resale_price"] / area.where(area != 0)).to_numpy()

    @staticmethod
    def _relabel(values, mapping):
        """Apply ``mapping`` to ``values``, keeping labels that have no entry.

        A plain ``.map`` turns unmapped labels into NaN, which silently wipes
        a column when the mapping is applied a second time.
        """
        return values.map(mapping).fillna(values)

    def flat_type_mapping(self):
        """Replace long flat-type labels by their short codes (safe to re-run)."""
        hdb_data = self.hdb_data
        print("\t---> Mapping Flat Type")
        print("\tBefore Mapping: ", hdb_data.flat_type.unique())

        hdb_data['flat_type'] = self._relabel(hdb_data["flat_type"], self.FLAT_TYPE_MAPPING)
        print("\tAfter Mapping: ", hdb_data.flat_type.unique())

    def flat_model_mapping(self):
        """Collapse differently-cased flat-model labels into one spelling (safe to re-run)."""
        hdb_data = self.hdb_data

        hdb_data['flat_model'] = self._relabel(hdb_data["flat_model"], self.FLAT_MODEL_MAPPING)

        print("\t--> Mapping Flat Model")

    def populate_unemployment_rate(self):
        """Add ``unemployment_rate``, looked up by the sale year of each row."""
        hdb_data = self.hdb_data
        map_year_to_rate = create_unemployment_rate_mapping()
        hdb_data["unemployment_rate"] = pd.to_datetime(hdb_data.index).year.map(map_year_to_rate)

    def populate_inflation_rate(self):
        """Add ``inflation_rate``, looked up by the sale year of each row."""
        hdb_data = self.hdb_data
        map_year_to_rate = create_inflation_rate_mapping()
        # Fixed: this used to overwrite the "unemployment_rate" column.
        hdb_data["inflation_rate"] = pd.to_datetime(hdb_data.index).year.map(map_year_to_rate)

    # Original (misspelled) method names, kept for backward compatibility.
    popluate_unemployment_rate = populate_unemployment_rate
    popluate_inflation_rate = populate_inflation_rate

    #----------------------------------------------------------------------
    # Execute
    #----------------------------------------------------------------------
    def execute(self):
        """Run every transformation step in order on ``self.hdb_data`` (in place)."""
        hdb_data = self.hdb_data

        print("--> HDB Transformation Start")
        print("--> Calculating Remaining Months")
        # Column renamed from "remaining_years": the value is a number of months.
        hdb_data["remaining_months"] = self._remaining_lease_months()

        print("--> Calculating Per Square Meter")
        hdb_data["price_per_sqm"] = self._price_per_sqm()

        print("--> Perform Data Mapping for: Flat Type")
        self.flat_type_mapping()

        print("--> Perform Data Mapping for: Flat Model")
        self.flat_model_mapping()

        print("--> Populate Umemployement Rate")
        self.populate_unemployment_rate()

        print("--> Populate inflation Rate")
        # Fixed: the unemployment step used to be called a second time here.
        self.populate_inflation_rate()

        print("--> HDB Transformation End")
        print()
        
    

# Work on an independent copy so the transformer cannot alter hdb_data.
hdb_data_test = hdb_data.copy()
hdb_data_test

Unnamed: 0_level_0,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.00,IMPROVED,1977,9000.00
1990-01-01,KALLANG/WHAMPOA,3 ROOM,44,BENDEMEER RD,04 TO 06,63.00,STANDARD,1981,31400.00
1990-01-01,KALLANG/WHAMPOA,3 ROOM,20,ST. GEORGE'S RD,04 TO 06,67.00,NEW GENERATION,1984,66500.00
1990-01-01,KALLANG/WHAMPOA,3 ROOM,14,KG ARANG RD,04 TO 06,103.00,NEW GENERATION,1984,77000.00
1990-01-01,KALLANG/WHAMPOA,3 ROOM,46,OWEN RD,01 TO 03,68.00,NEW GENERATION,1982,58000.00
...,...,...,...,...,...,...,...,...,...
2025-12-01,SENGKANG,4 ROOM,184B,RIVERVALE CRES,13 TO 15,90.00,Model A,2003,560000.00
2025-12-01,BUKIT PANJANG,3 ROOM,454,FAJAR RD,04 TO 06,68.00,Model A,2015,435000.00
2025-12-01,BUKIT PANJANG,3 ROOM,106,GANGSA RD,07 TO 09,73.00,Model A,1988,465000.00
2025-12-01,TAMPINES,4 ROOM,859A,TAMPINES AVE 5,10 TO 12,104.00,Model A,1988,715000.00


In [None]:
# Run every transformation step; the transformer modifies hdb_data_test in place.
transformer = HDBDataTransformer(hdb_data_test)
transformer.execute()
hdb_data_test

--> HDB Transformation Start
--> Calculating Remaining Months
--> Calculating Per Square Meter


In [None]:
# Distinct flat_model labels in the untransformed frame, alphabetically.
sorted(hdb_data["flat_model"].unique())

In [None]:
# Distinct flat_model labels in the transformed copy, alphabetically.
sorted(hdb_data_test["flat_model"].unique())


In [None]:
# Number of transactions per flat type in the transformed copy.
hdb_data_test["flat_type"].value_counts()

In [None]:
# Number of transactions per flat model in the transformed copy.
hdb_data_test["flat_model"].value_counts()

In [None]:
# Long flat-type labels -> short codes.
flat_type_mapping = {
    "1 ROOM": '1RM',
    "2 ROOM": '2RM',
    "3 ROOM": '3RM',
    "4 ROOM": '4RM',
    "5 ROOM": '5RM',
    "EXECUTIVE": "EX",
    "MULTI GENERATION": "MG",
    "MULTI-GENERATION": "MG",
}

# .map() alone turns every label without an entry into NaN, so running this
# cell twice would wipe the column. fillna() keeps already-mapped or unknown
# labels unchanged, which makes the cell safe to re-run.
hdb_data['flat_type'] = hdb_data["flat_type"].map(flat_type_mapping).fillna(hdb_data["flat_type"])

In [None]:
# From/To table of the flat-type relabelling, for display only.
x = [
    [long_label, short_code]
    for long_label, short_code in (
        ("1 ROOM", "1RM"),
        ("2 ROOM", "2RM"),
        ("3 ROOM", "3RM"),
        ("4 ROOM", "4RM"),
        ("5 ROOM", "5RM"),
        ("EXECUTIVE", "EX"),
        ("MULTI GENERATION", "MG"),
        ("MULTI-GENERATION", "MG"),
    )
]
pd.DataFrame(x, columns=['From', 'To'])

In [None]:
# Mixed-case flat-model labels and the upper-case spelling they collapse into.
convert_dict = {
    '2-room': "2-ROOM",
    'Improved': 'IMPROVED',
    'Improved-Maisonette': 'IMPROVED-MAISONETTE',
    'Model A': 'MODEL A',
    'New Generation': 'NEW GENERATION',
    'Simplified': 'SIMPLIFIED',
    'Premium Apartment': 'PREMIUM APARTMENT',
    'Multi Generation': 'MULTI GENERATION',
    'Terrace': 'TERRACE',
    'Standard': 'STANDARD',
    'Apartment': 'APARTMENT',
}

# Build the mapping from the combined frame. The previous version read `df`,
# a variable left over from the file-loading loop that holds only the last
# file, so labels occurring only in other files were missed.
# dropna() keeps sorted() from failing on missing labels.
flat_model = {
    key: convert_dict.get(key, key)
    for key in sorted(hdb_data["flat_model"].dropna().unique())
}

flat_model

In [None]:
# Flat-model labels -> consolidated model groups.
flat_model_mapping = {'2-ROOM': '2-ROOM',
                     '2-room': '2-ROOM',
                     '3Gen': 'MULTI GENERATION',
                     'APARTMENT': 'APARTMENT',
                     'Adjoined flat': 'SPECIAL FLAT',
                     'Apartment': 'APARTMENT',
                     'DBSS': 'DBSS',
                     'IMPROVED': 'IMPROVED',
                     'IMPROVED-MAISONETTE': 'MAISONETTE',
                     'Improved': 'IMPROVED',
                     'Improved-Maisonette': 'MAISONETTE',
                     'MAISONETTE': 'MAISONETTE',
                     'MODEL A': 'MODEL A',
                     'MODEL A-MAISONETTE': 'MAISONETTE',
                     'MULTI GENERATION': 'MULTI GENERATION',
                     'Maisonette': 'MAISONETTE',
                     'Model A': 'MODEL A',
                     'Model A-Maisonette': 'MAISONETTE',
                     'Model A2': 'MODEL A',
                     'Multi Generation': 'MULTI GENERATION',
                     'NEW GENERATION': 'NEW GENERATION',
                     'New Generation': 'NEW GENERATION',
                     'PREMIUM APARTMENT': 'PREMIUM APARTMENT',
                     'Premium Apartment': 'PREMIUM APARTMENT',
                     'Premium Apartment Loft': 'PREMIUM APARTMENT',
                     'Premium Maisonette': 'MAISONETTE',
                     'SIMPLIFIED': 'SIMPLIFIED',
                     'STANDARD': 'STANDARD',
                     'Simplified': 'SIMPLIFIED',
                     'Standard': 'STANDARD',
                     'TERRACE': 'SPECIAL FLAT',
                     'Terrace': 'SPECIAL FLAT',
                     'Type S1': 'SIMPLIFIED',
                     'Type S2': 'SIMPLIFIED'}

# 'SPECIAL FLAT' is a target but not a key, so a bare .map() on a second run
# would turn those rows into NaN. fillna() keeps labels without an entry as
# they are, which makes the cell safe to re-run.
hdb_data['flat_model'] = hdb_data["flat_model"].map(flat_model_mapping).fillna(hdb_data["flat_model"])

In [None]:
# From/To table of the flat-model relabelling, for display only.
x = [[source_label, target_label] for source_label, target_label in flat_model_mapping.items()]
pd.DataFrame(x, columns=['From', 'To'])

In [None]:
# Frequency table of the flat_model labels, shown as a one-column frame.
hdb_data["flat_model"].value_counts().to_frame()

# Save File

In [None]:
# Persist the frame (including its year_month index) and display it.
# NOTE(review): the loading cell earlier in the notebook writes the same file
# name; this call overwrites it.
hdb_data.to_csv("HDB_Resale_Price.csv")

hdb_data

# Read The Dataset

In [None]:
# Reload the saved file, keyed and ordered by year_month.
df = (
    pd.read_csv("HDB_Resale_Price.csv", index_col="year_month")
    .sort_values("year_month")
)
df

In [None]:
# Mean resale price for every month of 2024.
df_year_period2 = hdb_data.loc['2024-01':'2024-12']
monthly_avg = df_year_period2.resample("M")["resale_price"].mean()

ax = monthly_avg.plot(figsize=(12, 5))

# Tick positions have to be set before the labels so every month gets a tick.
ax.set_xticks(monthly_avg.index)
ax.set_xticklabels(monthly_avg.index.strftime('%Y-%m'), rotation=45)

# A figure should be readable on its own when the notebook is skimmed.
ax.set_title("Mean HDB resale price per month, 2024")
ax.set_xlabel("Month")
ax.set_ylabel("Mean resale price")

plt.show()

In [None]:
# Number of resale transactions for every month of 2025.
df_period2 = hdb_data.loc['2025-01':'2025-12']
monthly_count = df_period2.resample("M").size()
print(monthly_count)
ax = monthly_count.plot(figsize=(12, 5))

ax.grid()

# Force all months to show
ax.set_xticks(monthly_count.index)
ax.set_xticklabels(monthly_count.index.strftime('%Y-%m'), rotation=45)

# A figure should be readable on its own when the notebook is skimmed.
ax.set_title("HDB resale transactions per month, 2025")
ax.set_xlabel("Month")
ax.set_ylabel("Number of transactions")

plt.show()

In [None]:
# Transactions per town in 2025.
# `town` is categorical text. A histogram with bins=20 puts several towns in
# the same bin when there are more than 20 of them, so the bars no longer
# correspond to single towns. A bar chart of value counts gives one bar per town.
town_counts_2025 = df_period2["town"].value_counts().sort_index()
ax = town_counts_2025.plot(kind="bar", figsize=(12, 5))
ax.set_title("HDB resale transactions per town, 2025")
ax.set_xlabel("Town")
ax.set_ylabel("Number of transactions")
plt.xticks(rotation=80)
plt.show()

In [None]:
# Transactions per town over the whole dataset.
# `town` is categorical text. A histogram with bins=20 puts several towns in
# the same bin when there are more than 20 of them, so the bars no longer
# correspond to single towns. A bar chart of value counts gives one bar per town.
town_counts_all = hdb_data["town"].value_counts().sort_index()
ax = town_counts_all.plot(kind="bar", figsize=(12, 5))
ax.set_title("HDB resale transactions per town, all years")
ax.set_xlabel("Town")
ax.set_ylabel("Number of transactions")
plt.xticks(rotation=80)   # rotate x labels by 80°
plt.show()

In [None]:
# Share of each flat type among transactions from June to December 2024.
df = hdb_data.loc['2024-06':'2024-12']
counts = df["flat_type"].value_counts()
counts.plot.pie(autopct="%1.1f%%", figsize=(6, 6))
plt.ylabel("")  # the default y-label adds nothing to a pie chart
plt.show()

In [None]:
# Display the current state of the combined frame.
hdb_data

In [None]:
def get_gdp(hdb_data_test,
            url_gdp="https://fred.stlouisfed.org/graph/fredgraph.csv?id=MKTGDPSGA646NWDB",
            cache_path="singapore_gdp.csv"):
    """Attach the most recent known GDP value to every row of an HDB frame.

    The GDP series is read as CSV with the columns ``observation_date`` and
    ``MKTGDPSGA646NWDB``. Each HDB row receives the GDP value of the latest
    observation on or before its sale date; rows dated before the first
    observation get NaN.

    Args:
        hdb_data_test: frame indexed by sale date (anything ``pd.to_datetime``
            accepts). It is not modified.
        url_gdp: URL, path or file-like object handed to ``pd.read_csv``.
        cache_path: where to save the downloaded GDP table; ``None`` skips saving.

    Returns:
        ``(df_gdp, enriched)``: the GDP table indexed by ``year`` with a
        ``gdp_usd`` column, and a date-sorted copy of the input with an added
        ``gdp_usd`` column.
    """
    df_gdp = (
        pd.read_csv(url_gdp, parse_dates=["observation_date"])
        .rename(columns={"observation_date": "year",
                         "MKTGDPSGA646NWDB": "gdp_usd"})
        .set_index("year")
        .sort_index()  # forward-fill lookup below needs a monotonic index
    )

    if cache_path is not None:
        df_gdp.to_csv(cache_path)

    # Work on a copy: the previous version replaced the caller's index in place.
    enriched = hdb_data_test.copy()
    enriched.index = pd.to_datetime(enriched.index)
    enriched = enriched.sort_index()

    # Reindex GDP to the HDB index, forward-filling the last known value.
    gdp_aligned = df_gdp["gdp_usd"].reindex(enriched.index, method="ffill")

    # Assign by position: the index holds many duplicate dates, and
    # label-based alignment on duplicates is fragile.
    enriched["gdp_usd"] = gdp_aligned.to_numpy()

    return df_gdp, enriched

# Fetch the GDP series and rebind hdb_data_test to the frame returned with gdp_usd added.
df_gdp, hdb_data_test = get_gdp(hdb_data_test)
hdb_data_test

In [None]:
import requests
import pandas as pd

# Query the data.gov.sg datastore for one dataset and load its records into a frame.
dataset_id = "d_b816a930bca0eb19fdf20fcbfcdd4c39"

url = "https://data.gov.sg/api/action/datastore_search"
params = {
    "resource_id": dataset_id,
    "limit": 5000,  # big enough to get all quarters
}

resp = requests.get(url, params=params, timeout=60)

# Show the diagnostics first: if the request failed or the body is not JSON,
# the lines below raise, and these prints are what explains why.
print(resp.status_code)
print(resp.text[:500])

resp.raise_for_status()
data = resp.json()

records = data["result"]["records"]
df_ue_q = pd.DataFrame(records)

print(df_ue_q.columns.tolist())

df_ue_q

In [None]:
def create_unemployment_rate_mapping():
    """Return a year -> unemployment-rate Series read from ``unemployment_rate.csv``.

    Only the first data row of the file is used; its ``DataSeries`` label
    column is dropped and the remaining columns are treated as years.
    1990 and 1991 are added with the earliest rate and 2025 with the latest.

    NOTE(review): a function with this name is already defined earlier in the
    notebook; once this cell runs, this definition replaces it.
    """
    raw = pd.read_csv("unemployment_rate.csv")

    # One row, years as columns -> one row per year.
    by_year = (
        raw.iloc[[0]]
        .drop(columns="DataSeries")
        .T
        .reset_index()
    )
    by_year.columns = ["year", "rate"]
    by_year = by_year.sort_values(by="year")

    earliest_rate = float(by_year["rate"].iloc[0])
    latest_rate = float(by_year["rate"].iloc[-1])

    head_pad = pd.DataFrame({"year": [1990, 1991],
                             "rate": [earliest_rate, earliest_rate]})
    tail_pad = pd.DataFrame({"year": [2025], "rate": [latest_rate]})

    padded = pd.concat([head_pad, by_year, tail_pad])
    padded["rate"] = padded["rate"].astype(float)
    padded["year"] = pd.to_datetime(padded["year"], format="%Y")
    padded = padded.set_index("year").sort_index(ascending=True)

    # Collapse to one value per calendar year.
    return padded["rate"].groupby(padded.index.year).first()

In [None]:



# Look up the inflation rate for each row by the year of its sale-date index.
map_year_to_rate = create_inflation_rate_mapping()
hdb_data_test["inflation_rate"] = hdb_data_test.index.year.map(map_year_to_rate)

In [None]:
# Display the frame after the inflation_rate column was added.
hdb_data_test

In [None]:
# Inspect the raw inflation file. This rebinds the notebook-level name `df`.
df = pd.read_csv("Singapore-Inflation-Rate.csv")
df

In [None]:
# NOTE(review): `df_inflation_rate` is not assigned anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel. Define it or remove the cell.
df_inflation_rate

In [None]:
# Display the enriched frame. The disabled line below would drop the unemployment_rate column.
#hdb_data_test.drop("unemployment_rate", axis=1, inplace=True)
hdb_data_test

In [None]:
# NOTE(review): `df_cpi` is not assigned anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel. Define it or remove the cell.
df_cpi