## 📘 HDB Resale Flat Prices

### 📌 Notebook Description

- **Team:** Team A  
- **Members:** Ben, Shazlin, Alan  
- **Project Name:** HDB Resale Flat Data Engineering Pipeline
- **Description:** Implements automated data ingestion from data.gov.sg and performs dataset merging to produce a unified, analysis-ready dataset.
- **Data Artifacts:**  
    - `/DataLake/<raw files>`  
    - `/Staging/Main.csv`

### 📦 Import Required Libraries

In [1]:
import pandas as pd
import copy as copy

import control_output

#---Customized-----------------------------------------
import control_output
# Render floats with thousands separators and two decimals in all pandas output.
pd.options.display.float_format = "{:,.2f}".format
# Bare expression: as the last statement of the cell, the notebook displays its value.
# NOTE(review): presumably a CSS/HTML object from the project-local control_output
# module that restyles cell output -- confirm against that module.
control_output.css

### 🧩 Define Class: FileProcessor

In [2]:
class FileProcessor:
    """Load a raw statistics CSV, index it by date, trim it to a year window and save it.

    Each supported file has an entry in ``self.config`` with two settings:

    * ``transpose`` -- True when the raw file stores one period per *column*
      (the frame is transposed so that each period becomes a row).
    * ``format`` -- the ``strftime`` pattern used to parse the period labels.

    After :meth:`execute` runs, these attributes are available:

    * ``filename``    -- name of the file processed last.
    * ``df_original`` -- the frame exactly as read from disk.
    * ``df_xxx``      -- the date-indexed, sorted frame *before* the year slice.
    * ``df``          -- the date-indexed frame restricted to the year window.
    * ``df_sample``   -- first five and last five rows of ``df``.
    """

    def __init__(self,
                 input_folder='../Project-HDB-Store/datasets',
                 output_folder='../Project-HDB-Store/working',
                 start_year='2000',
                 end_year='2023'):
        """Set up the per-file configuration and the I/O locations.

        Args:
            input_folder: Directory the raw CSV files are read from.
            output_folder: Directory the processed CSV files are written to.
                It must already exist.
            start_year: First period kept (inclusive), as a partial date string.
            end_year: Last period kept (inclusive), as a partial date string.
        """
        self.config = {
            "unemployment.csv": {
                'transpose': True,
                'format': '%Y',
            },
            "gdp.csv": {
                'transpose': False,
                'format': '%Y-%m-%d',
            },
            "Inflation.csv": {
                'transpose': False,
                'format': '%Y',
            },
            "Births.csv": {
                'transpose': True,
                'format': '%Y %b',
            },
            'Divorces.csv': {
                'transpose': True,
                'format': '%Y',
            },
            'Marriages.csv': {
                'transpose': True,
                'format': '%Y',
            },
            'ResidentHousehold.csv': {
                'transpose': True,
                'format': '%Y',
            },
        }
        self.input_folder = input_folder
        self.output_folder = output_folder
        self.start_year = start_year
        self.end_year = end_year

    def execute(self, filename, nrows=None, skiprows=9, index_name='year_month'):
        """Process one configured file and return the date-indexed, trimmed frame.

        Args:
            filename: Key of ``self.config``; also the file name inside the
                input and output folders.
            nrows: Number of data rows to read (``None`` reads everything).
            skiprows: Number of leading lines to skip before the header row.
            index_name: Name given to the resulting date index. For files that
                are not transposed it is also the name of the date column
                in the raw file.

        Returns:
            The frame indexed by a sorted ``DatetimeIndex`` and limited to
            ``start_year`` .. ``end_year``. The same frame is written to
            ``<output_folder>/<filename>`` and stored on ``self.df``.

        Raises:
            KeyError: If ``filename`` has no entry in ``self.config``.
        """
        self.filename = filename

        # Fail before touching the disk, and say which names are supported.
        try:
            settings = self.config[filename]
        except KeyError:
            raise KeyError(
                f"No configuration for {filename!r}; "
                f"known files: {sorted(self.config)}"
            ) from None

        input_file = f"{self.input_folder}/{filename}"
        output_file = f"{self.output_folder}/{filename}"

        df = pd.read_csv(input_file, skiprows=skiprows, header=0, nrows=nrows)
        date_format = settings['format']

        if settings['transpose']:
            df_t = df.transpose()

            # After transposing, the first row holds the series labels:
            # promote it to column names and drop it from the data.
            df_t.columns = df_t.iloc[0]
            df_t = df_t.iloc[1:]
            df_t.columns.name = None

            # The remaining index holds the period labels (e.g. "2000" or
            # "2000 Jan"); parse them into timestamps.
            df_t.index = pd.to_datetime(df_t.index.astype(str), format=date_format)
            df_t.index.name = index_name
        else:
            df_t = df.copy()
            df_t[index_name] = pd.to_datetime(df_t[index_name], format=date_format)
            df_t = df_t.set_index(index_name)

        # Label-based slicing below needs a monotonic index.
        df_t = df_t.sort_index(ascending=True)

        # Keep the full-range frame available before slicing.
        self.df_xxx = df_t

        # Partial-string slicing on a DatetimeIndex is inclusive on both ends.
        df_t = df_t.loc[self.start_year:self.end_year]

        # Persist with the date index as the first column.
        df_t.to_csv(output_file, index=True)

        self.df_original = df
        self.df = df_t
        self.df_sample = pd.concat([df_t.head(5), df_t.tail(5)])

        return self.df



### 🧩 Initialize Class Instance: FileProcessor

In [3]:
# One shared processor instance; every later cell calls execute() on it, and each
# call overwrites the result attributes (df, df_original, df_sample) on this object.
fileProcessor = FileProcessor()

### ▶️ Execute File Processor: **Unemployment Rate**

In [4]:
# The header is on the first line of the file (skiprows=0); only the first 15 data
# rows are read. The file is configured as transposed, so periods become the index.
df_unemployment_final = fileProcessor.execute('unemployment.csv', skiprows=0, nrows=15, index_name='year')
# Number of periods left after the year-range slice inside execute().
print(len(fileProcessor.df))
# Keep only the 'Total' series; the name is rebound from a DataFrame to a Series.
df_unemployment_final = df_unemployment_final['Total']
df_unemployment_final.head()

24


year
2000-01-01   3.60
2001-01-01   3.70
2002-01-01   4.80
2003-01-01   5.30
2004-01-01   4.50
Name: Total, dtype: object

### ▶️ Execute File Processor: **GDP**

In [5]:
# gdp.csv is configured as not transposed: 'year' is read as a column, parsed as a
# date and then used as the index.
df = fileProcessor.execute('gdp.csv', skiprows=0, index_name='year')
print(len(df))
# execute() returns the same frame it stores on fileProcessor.df, so df and
# df_gdp_final refer to one object.
df_gdp_final = fileProcessor.df
df_gdp_final.head()

24


Unnamed: 0_level_0,gdp_usd
year,Unnamed: 1_level_1
2000-01-01,96076539925.74
2001-01-01,89793790669.65
2002-01-01,92538372869.69
2003-01-01,97646401095.64
2004-01-01,115033593101.05


### ▶️ Execute File Processor: **Inflation**

In [6]:
# Inflation.csv is configured as not transposed, with year-only labels ('%Y').
df = fileProcessor.execute('Inflation.csv', skiprows=0, index_name='year')
print(len(df))
# Same object as df: execute() returns the frame it stores on fileProcessor.df.
df_inflation_final = fileProcessor.df
df_inflation_final.head()

24


Unnamed: 0_level_0,rate
year,Unnamed: 1_level_1
2000-01-01,1.34
2001-01-01,-0.36
2002-01-01,-1.39
2003-01-01,0.9
2004-01-01,1.15


### ▶️ Execute File Processor: **Births**

In [7]:
# Skip the first 10 lines of the raw file, then read the header and 15 data rows.
# Births.csv is configured as transposed with monthly labels parsed as '%Y %b'.
df = fileProcessor.execute('Births.csv', nrows=15, skiprows=10, index_name='year_month')
# Keep only the overall total series (a Series indexed by month).
df_birth_final = df['Total Live-Births By Ethnic Group Of Father']
df_birth_final.head()

year_month
2000-01-01    3585
2000-02-01    3636
2000-03-01    3916
2000-04-01    3642
2000-05-01    4004
Name: Total Live-Births By Ethnic Group Of Father, dtype: object

### ▶️ Extra Processing - For **Divorces** and **Marriages**

In [8]:
def convert_yearmonth(df, indexes):
    """Reshape a year-by-month table into a single monthly time series.

    Args:
        df: Frame with a 'year' column (anything ``pd.to_datetime`` can parse)
            and one column per month, labelled with the full English month
            name; surrounding whitespace in the labels is tolerated.
        indexes: Column labels to use, exactly as they appear in ``df``.
            Must contain 'year'; every other entry is treated as a month column.

    Returns:
        A new frame with a single 'value' column, indexed by 'year_month'
        (first day of each month) in ascending order. ``df`` is not modified.
    """
    # Wide -> long: one row per (year, month column) pair.
    long_form = df[indexes].melt(
        id_vars=['year'],
        var_name='month',
        value_name='value',
    )

    # Year as text, e.g. "2000".
    year_text = pd.to_datetime(long_form['year']).dt.year.astype(str)

    # Month name to zero-padded month number, e.g. "  March" -> "03".
    month_names = long_form['month'].str.strip()
    month_text = (
        pd.to_datetime(month_names, format="%B").dt.month.astype(str).str.zfill(2)
    )

    # Combine both parts into a first-of-month timestamp.
    monthly = pd.DataFrame({
        'year_month': pd.to_datetime(year_text + '-' + month_text, format="%Y-%m"),
        'value': long_form['value'],
    })

    return monthly.sort_values('year_month').set_index('year_month')

### ▶️ Extra Processing - For **Divorces - Yearly**

In [9]:
# Skip the first 10 lines of the raw file, then read the header and 39 data rows.
# The transposed result has one row per year; as the output below shows, the
# month names occur more than once among its columns.
df_divorces_yearly = fileProcessor.execute('Divorces.csv', nrows=39, skiprows=10,index_name='year')
df_divorces_yearly.head()

Unnamed: 0_level_0,Total Divorces,January,February,March,April,May,June,July,August,September,...,March,April,May,June,July,August,September,October,November,December
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-01,4920,396,458,441,448,466,431,359,375,321,...,161,139,162,131,113,146,114,115,130,66
2001-01-01,4819,411,513,389,383,419,352,426,347,424,...,113,110,132,64,103,102,101,151,127,99
2002-01-01,5522,474,499,555,434,489,443,516,458,406,...,102,121,133,103,191,116,122,147,119,102
2003-01-01,6100,539,399,497,410,486,468,612,625,534,...,91,146,174,182,216,182,210,156,134,170
2004-01-01,5850,417,454,458,433,513,501,534,481,478,...,137,156,165,154,127,142,123,135,116,116


### ▶️ Extra Processing - For **Divorces - Reload Saved CSV**

In [10]:
# Read back the file that execute('Divorces.csv', ...) wrote to the working folder.
# On reload the repeated month columns get numeric suffixes (e.g. 'March.2' in the
# output below), so the first occurrence of each month can be selected by label.
df = pd.read_csv("../Project-HDB-Store/working/Divorces.csv")
df.head()

Unnamed: 0,year,Total Divorces,January,February,March,April,May,June,July,August,...,March.2,April.2,May.2,June.2,July.2,August.2,September.2,October.2,November.2,December.2
0,2000-01-01,4920,396,458,441,448,466,431,359,375,...,161,139,162,131,113,146,114,115,130,66
1,2001-01-01,4819,411,513,389,383,419,352,426,347,...,113,110,132,64,103,102,101,151,127,99
2,2002-01-01,5522,474,499,555,434,489,443,516,458,...,102,121,133,103,191,116,122,147,119,102
3,2003-01-01,6100,539,399,497,410,486,468,612,625,...,91,146,174,182,216,182,210,156,134,170
4,2004-01-01,5850,417,454,458,433,513,501,534,481,...,137,156,165,154,127,142,123,135,116,116


### ▶️ Extra Processing - For **Divorces - Monthly**

In [11]:
# Column labels to melt. The month labels carry two leading spaces and must match
# the frame's columns exactly; convert_yearmonth strips the whitespace afterwards.
indexes = ['year','  January', '  February', '  March', '  April', '  May', '  June',	'  July', '  August', 
        '  September', '  October', '  November', '  December']
# df is the Divorces frame reloaded from the working CSV in the previous cell.
df_divorces_final = convert_yearmonth(df, indexes)
df_divorces_final.head()

Unnamed: 0_level_0,value
year_month,Unnamed: 1_level_1
2000-01-01,396
2000-02-01,458
2000-03-01,441
2000-04-01,448
2000-05-01,466


### ▶️ Extra Processing - For **Marriages**

In [12]:
# Column labels to melt. The month labels carry two leading spaces and must match
# the frame's columns exactly; convert_yearmonth strips the whitespace afterwards.
indexes = ['year','  January', '  February', '  March', '  April', '  May', '  June',	'  July', '  August', 
        '  September', '  October', '  November', '  December']
df = fileProcessor.execute('Marriages.csv', nrows=39, skiprows=10, index_name='year')
# Turn the 'year' index back into a regular column so it can serve as the melt id.
df = df.reset_index()
# Keep the first 14 columns only: year, the yearly total and the twelve months.
df = copy.copy(df.iloc[:, :14])

df_marriages_final = convert_yearmonth(df, indexes)
df_marriages_final.head()

Unnamed: 0_level_0,value
year_month,Unnamed: 1_level_1
2000-01-01,1602
2000-02-01,1838
2000-03-01,2062
2000-04-01,1268
2000-05-01,1941


### 🔗 Combine Datasets
- Two files expected
    - Monthly
    - Yearly

### 🔗 Combine Datasets: Monthly

In [13]:
# Align the three monthly series side by side on their shared date index.
combined_monthly = pd.concat([df_birth_final, df_marriages_final, df_divorces_final], axis=1)
# Columns are renamed by position, so the order here must match the concat order.
combined_monthly.columns = ['birth', 'marriages', 'divorces']
# Use the same folder spelling ("Staging") as the listing cell and the notebook
# description; a lowercase "staging" is a different directory on case-sensitive
# filesystems.
combined_monthly.to_csv('../Project-HDB-Store/Staging/stat_monthly.csv')

### 🔗 Combine Datasets: Yearly

In [14]:
# Align the three yearly series side by side on their shared date index.
combined_yearly = pd.concat([df_unemployment_final, df_inflation_final, df_gdp_final], axis=1)
# Columns are renamed by position, so the order here must match the concat order.
combined_yearly.columns = ['unemployment', 'inflation', 'gdp']
# Use the same folder spelling ("Staging") as the listing cell and the notebook
# description; a lowercase "staging" is a different directory on case-sensitive
# filesystems.
combined_yearly.to_csv('../Project-HDB-Store/Staging/stat_yearly.csv')

<hr />
<hr />
<hr />

In [15]:
import os

folder = "../Project-HDB-Store/Staging"

# Regular files only; sub-directories are left out.
files = [
    entry
    for entry in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, entry))
]

# Print every file name except hidden ones (names starting with a dot).
for file in files:
    if not file.startswith("."):
        print(file)

stat_yearly.csv
stat_monthly.csv
Main_final.csv
Main.csv


<hr />
<hr />
<hr />

In [16]:
#dfx = fileProcessor.execute('ResidentHousehold.csv', nrows=15, skiprows=10, index_name='year')
#dfx_birth_final = df['Total Live-Births By Ethnic Group Of Father']
#dfx_birth_final.head()

In [17]:
# Display whatever `df` currently holds. When the notebook is run top to bottom
# this is the 14-column Marriages frame assigned in the Marriages cell.
df

Unnamed: 0,year,Total Marriages,January,February,March,April,May,June,July,August,September,October,November,December
0,2000-01-01,22561,1602,1838,2062,1268,1941,2389,1657,988,2885,1743,2166,2022
1,2001-01-01,22280,1794,1779,2057,1162,1930,2230,1654,1466,1871,2193,1904,2240
2,2002-01-01,23198,1826,2304,1997,1355,2055,2314,1537,1268,2137,2031,1951,2423
3,2003-01-01,21962,1690,1543,2507,1103,2135,2100,1369,1134,2489,1575,1762,2555
4,2004-01-01,22189,1604,1773,1795,1577,1872,2082,1857,1332,1846,2011,1877,2563
5,2005-01-01,22992,1896,1556,1909,1459,2003,2055,1660,1231,2764,1766,1958,2735
6,2006-01-01,23706,1534,1652,2123,1497,2139,2928,1656,1093,1613,2283,2173,3015
7,2007-01-01,23966,1803,1448,2045,1409,1757,2242,2748,1508,1988,1835,2335,2848
8,2008-01-01,24596,1963,1492,2041,1314,2034,2220,1814,1849,2315,2046,2641,2867
9,2009-01-01,26081,1844,1832,2040,1456,2253,2263,1783,1873,2431,2852,2286,3168
