In [1]:
import pandas as pd
import requests
from pprint import pprint as pp
import matplotlib.pyplot as plt
import copy

from datetime import datetime
from dateutil.relativedelta import relativedelta

pd.set_option("display.float_format", "{:,.2f}".format)

# Transformer

In [2]:
class HDBDataTransformer:

    def __init__(self, hdb_data):
        self.hdb_data = hdb_data

    def calculate_remaining_lease_months_row(self, row, max_lease_years: int = 99) -> int:
               
        year = row.year_month.year
        
        
        remain = 99 - (year - int(row["lease_commence_date"]))
        return remain
        
        
        """
        sale_date = pd.to_datetime(row.name)  # row index = sale date
        lease_year = int(row["lease_commence_date"])
    
        lease_start = pd.Timestamp(lease_year, 1, 1)
        lease_end = lease_start + pd.DateOffset(years=max_lease_years)
    
        remaining_months = (lease_end.year - sale_date.year) * 12 + (lease_end.month - sale_date.month)
        return remaining_months
        """

        return 0

    def price_per_sqmxxx(self,resale_price, floor_area_sqm):
        if floor_area_sqm == 0:
            return None
        return resale_price / floor_area_sqm
        
    def price_per_sqm(self, row):

        if row['floor_area_sqm'] == 0:
            return None
        return row['resale_price'] / row['floor_area_sqm']

    def flat_type_mapping(self):
        hdb_data = self.hdb_data
        #print("\t---> Mapping Flat Type")
        #print("\tBefore Mapping: ",hdb_data.flat_type.unique())
        flat_type_mapping = {
            "1 ROOM": '1RM',
            "2 ROOM": '2RM',
            "3 ROOM": '3RM',
            "4 ROOM": '4RM',
            "5 ROOM": '5RM',
            "EXECUTIVE": "EX",
            "MULTI GENERATION": "MG",
            "MULTI-GENERATION": "MG"}

        hdb_data['flat_type'] = hdb_data["flat_type"].map(flat_type_mapping)
        #print("\tAfter Mapping: ", hdb_data.flat_type.unique())

    def flat_model_mapping(self):
        hdb_data = self.hdb_data

        flat_model_mapping = {'2-ROOM': '2-ROOM',
         '2-room': '2-ROOM',
         '3Gen': '3Gen',
         'APARTMENT': 'APARTMENT',
         'Adjoined flat': 'Adjoined flat',
         'Apartment': 'APARTMENT',
         'DBSS': 'DBSS',
         'IMPROVED': 'IMPROVED',
         'IMPROVED-MAISONETTE': 'IMPROVED-MAISONETTE',
         'Improved': 'IMPROVED',
         'Improved-Maisonette': 'IMPROVED-MAISONETTE',
         'MAISONETTE': 'MAISONETTE',
         'MODEL A': 'MODEL A',
         'MODEL A-MAISONETTE': 'MODEL A-MAISONETTE',
         'MULTI GENERATION': 'MULTI GENERATION',
         'Maisonette': 'Maisonette',
         'Model A': 'MODEL A',
         'Model A-Maisonette': 'Model A-Maisonette',
         'Model A2': 'Model A2',
         'Multi Generation': 'MULTI GENERATION',
         'NEW GENERATION': 'NEW GENERATION',
         'New Generation': 'NEW GENERATION',
         'PREMIUM APARTMENT': 'PREMIUM APARTMENT',
         'Premium Apartment': 'PREMIUM APARTMENT',
         'Premium Apartment Loft': 'Premium Apartment Loft',
         'Premium Maisonette': 'Premium Maisonette',
         'SIMPLIFIED': 'SIMPLIFIED',
         'STANDARD': 'STANDARD',
         'Simplified': 'SIMPLIFIED',
         'Standard': 'STANDARD',
         'TERRACE': 'TERRACE',
         'Terrace': 'TERRACE',
         'Type S1': 'SIMPLIFIED',
         'Type S2': 'SIMPLIFIED'}

        hdb_data['flat_model'] = hdb_data["flat_model"].map(flat_model_mapping)
        
        #print("\t--> Mapping Flat Model")

    def popluate_unemployment_rate(self):
        hdb_data = self.hdb_data
        map_year_to_rate = create_unemployment_rate_mapping()
        hdb_data["unemployment_rate"] = hdb_data.index.year.map(map_year_to_rate)
    
    def popluate_inflation_rate(self):
        hdb_data = self.hdb_data
        map_year_to_rate = create_inflation_rate_mapping()
        hdb_data["inflation_rate"] = hdb_data.index.year.map(map_year_to_rate)
    
    def popluate_gdp(self):
        hdb_data = self.hdb_data
        gdp_aligned = get_gdp(hdb_data)
        hdb_data_test["gdp_usd"] = gdp_aligned
    
    #----------------------------------------------------------------------
    # Execute
    #----------------------------------------------------------------------
    def execute(self):
        
        hdb_data = self.hdb_data   

        print("--> Start of HDB Transformation")
        print("--> Calculating Remaining Months")
        hdb_data["remaining_years"] = hdb_data.apply(self.calculate_remaining_lease_months_row,axis=1)

        print("--> Calculating Per Square Meter")
        hdb_data["price_per_sqm"] = hdb_data.apply(self.price_per_sqm, axis=1)

        print("--> Perform Data Mapping for: Flat Type")
        self.flat_type_mapping()

        print("--> Perform Data Mapping for: Flat Model")
        self.flat_model_mapping()

        """
        print("--> Populate Umemployement Rate")
        self.popluate_unemployment_rate()

        print("--> Populate inflation Rate")
        self.popluate_inflation_rate()

        print("--> Populate GD{")
        self.popluate_gdp()
        """
        
        hdb_data.set_index('year_month', inplace=True)
        hdb_data.sort_index(ascending=True, inplace=True)
        self.hdb_data = hdb_data.loc['2000-01-01':'2023-12-31']
        
        
        print("--> End of HDB Transformation")
        print()

    def get_sample(self):
        return pd.concat([self.hdb_data.head(), self.hdb_data.tail()])

# Execution

In [3]:
hdb_data = pd.read_csv('datasets/Main.csv', low_memory=False, usecols=lambda col: not col.startswith("Unnamed"))
hdb_data = hdb_data.rename(columns={'month': 'year_month'})
hdb_data["year_month"] = pd.to_datetime(hdb_data["year_month"], format="%Y-%m")

In [4]:
transformer = HDBDataTransformer(hdb_data)
transformer.execute()

--> Start of HDB Transformation
--> Calculating Remaining Months
--> Calculating Per Square Meter
--> Perform Data Mapping for: Flat Type
--> Perform Data Mapping for: Flat Model
--> End of HDB Transformation



In [5]:
hdb_data.head()

Unnamed: 0_level_0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,lease_commence_date,storey_range,block,remaining_lease,address,full_address,lat,long,nearest_mrt,nearest_distance_to_mrt,remaining_years,price_per_sqm
year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1990-01-01,JURONG EAST,4RM,NEW GENERATION,97.0,PANDAN GDNS,60000.0,1978,01 TO 03,408,87,408 PANDAN GDNS,408 PANDAN GARDENS SINGAPORE 600408,1.32,103.75,jurong east,1.81,87,618.56
1990-01-01,GEYLANG,2RM,STANDARD,40.0,GEYLANG SERAI,18900.0,1971,04 TO 06,2,80,2 GEYLANG SERAI,2 GEYLANG SERAI SRI GEYLANG SERAI SINGAPORE 40...,1.32,103.9,paya lebar,0.6,80,472.5
1990-01-01,KALLANG/WHAMPOA,3RM,IMPROVED,65.0,WHAMPOA DR,46500.0,1974,07 TO 09,75,83,75 WHAMPOA DR,75 WHAMPOA DRIVE WHAMPOA COURT SINGAPORE 320075,1.32,103.86,toa payoh,1.5,83,715.38
1990-01-01,HOUGANG,EX,APARTMENT,147.0,HOUGANG ST 21,182000.0,1984,04 TO 06,237,93,237 HOUGANG ST 21,237 HOUGANG STREET 21 SINGAPORE 530237,1.36,103.89,paya lebar,4.31,93,1238.1
1990-01-01,HOUGANG,EX,APARTMENT,147.0,HOUGANG ST 21,215000.0,1984,10 TO 12,237,93,237 HOUGANG ST 21,237 HOUGANG STREET 21 SINGAPORE 530237,1.36,103.89,paya lebar,4.31,93,1462.59


<hr style="border: 2px solid red;" />
<hr style="border: 2px solid blue;" />
<hr style="border: 2px solid green;" />