In [5]:
from sklearn.neighbors import BallTree
from sklearn.preprocessing import StandardScaler
from bs4 import BeautifulSoup
from datetime import datetime
import geopy.distance
import geopandas as gpd
import pandas as pd
import numpy as np
import requests
import json
import re

from IPython.display import display, HTML


# Load API credentials from the local creds.json file
with open('creds.json') as creds_file:
    api_key = json.load(creds_file)

ModuleNotFoundError: No module named 'geopy'

## 1. Getting distance data (Nearest MRT, Nearest Shopping Mall)

Using the coordinates of the HDB flats, we can get the distance to the nearest MRT station and nearest shopping mall. The ball tree algorithm with the haversine metric is used to find k_neighbors closest points to the query point. 

The ball tree is a data structure for organizing points in a k-dimensional space so that nearest-neighbor queries can be performed quickly. The haversine metric measures the great-circle distance between two points on a sphere from their latitude and longitude (given in radians). The query returns this distance as an angle in radians, which is then multiplied by the Earth's radius to obtain the distance in meters.

In [None]:
# Full street word -> abbreviation used in the HDB street names.
# get_full_addr inverts this mapping to expand abbreviated street names.
map_abbrv = {
    "AVENUE": "AVE",
    "STREET": "ST",
    "ROAD": "RD",
    "DRIVE": "DR",
    "JALAN": "JLN",
    "PLACE": "PL",
    "COMMONWEALTH": "C'WEALTH",
    "BUKIT": "BT",
    "CENTRAL": "CTRL",
    "NORTH": "NTH",
    "LORONG": "LOR",
    "GARDENS": "GDNS",
    "UPPER": "UPP",
    "HEIGHTS": "HTS",
    "TANJONG": "TG",
    "MARKET": "MKT",
    "SOUTH": "STH",
    "SAINT": "ST.",
    "PARK": "PK",
    "KAMPONG": "KG",
    "CRESCENT": "CRES",
    "CLOSE": "CL",
    "TERRACE": "TER",
}

# Mall name -> street address to search for instead of the mall name
# (looked up by get_addr_coords before querying the API).
manual_addr = {
    "Clarke Quay Central": "6 EU TONG SEN STREET",
    "City Gate Mall": "371 BEACH ROAD",
    "Velocity@Novena Square": "238 THOMSON ROAD",
    "Singpost Centre": "10 EUNOS ROAD 8",
    "Holland Village Shopping Mall": "118 HOLLAND AVENUE",
    "Mustafa Shopping Centre": "145 SYED ALWI ROAD",
    "PoMo": "1 SELEGIE ROAD",  # Reopened as GR.iD Singapore
    "Shaw House and Centre": "1 SCOTTS ROAD",  # Shaw House and Centre are next to each other
    "KINEX (formerly OneKM)": "11 TANJONG KATONG ROAD",
    "Paya Lebar Quarter (PLQ)": "10 PAYA LEBAR ROAD",
    "OD Mall": "200 TURF CLUB ROAD",
}

# Scraped entries that should not be treated as shopping malls
manual_remove = [
    "Tekka Centre",  # Wet Market
]

In [3]:
def get_full_addr(map_abbrv, street_name):
    """Expand abbreviated words in a street name to their full form.

    map_abbrv maps full word -> abbreviation; it is inverted here so that each
    space-separated token of street_name that equals an abbreviation is replaced
    by the full word. Tokens without a mapping are kept unchanged.
    """
    expansions = dict(zip(map_abbrv.values(), map_abbrv.keys()))
    expanded_tokens = []
    for token in street_name.split(' '):
        expanded_tokens.append(expansions.get(token, token))
    return ' '.join(expanded_tokens)

def get_addr_coords(addr):
    """Look up the latitude/longitude of an address or place name via the OneMap search API.

    If addr is a key of manual_addr, the mapped street address is searched instead.

    Returns:
        (lat, lon) as returned by the API for the first result whose ADDRESS
        contains the search text (case-insensitive); the first result overall
        when none contains it; (None, None) when the API reports no results.

    Raises:
        requests.HTTPError / requests.Timeout when the request fails, instead of
        hanging or failing later while decoding an error page.
    """
    # NOTE(review): confirm this endpoint is still the current OneMap search URL.
    url = "https://developers.onemap.sg/commonapi/search"
    addr = manual_addr.get(addr, addr)

    params = {
        "searchVal":addr,
        "returnGeom":"Y",
        "getAddrDetails":"Y"
    }

    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    results = response.json()
    if results['found'] == 0:
        print(f"Results not found for {addr}")
        return None, None

    result_df = pd.DataFrame(results['results'])
    # Literal (non-regex) match: addresses and mall names may contain characters
    # such as "(", "+" or "." that would otherwise be interpreted as a pattern.
    # na=False keeps the mask boolean if an ADDRESS value is missing.
    is_match = result_df['ADDRESS'].str.contains(addr, case=False, regex=False, na=False)
    row = result_df.loc[is_match]
    if len(row.index) == 0:
        print(f"Location {addr} not in this area, using first available")
        return result_df.loc[0, "LATITUDE"], result_df.loc[0, "LONGITUDE"]
    return result_df.loc[row.index[0], "LATITUDE"], result_df.loc[row.index[0], "LONGITUDE"]

'''
def get_distance_between_coords(origin_coords, dest_coords):
    url = "https://developers.onemap.sg/privateapi/routingsvc/route"
    params={
        "start":f"{origin_coords[0]},{origin_coords[1]}",
        "end":f"{dest_coords[0]},{dest_coords[1]}",
        "routeType":"walk",
        "token": api_key["ONE_MAP_API_KEY"]
    }
    
    results = requests.get(url, params=params).json()["route_summary"]
    
    if results["status"] == 0:
        return results["total_time"], results["total_distance"]
    else:
        return None, None
'''

def get_nearest(src_points, candidates, k_neighbors=1):
    """Find the nearest candidate point for every source point.

    Builds a haversine BallTree over candidates and queries it with src_points
    for k_neighbors neighbours each. The caller in this file passes both arrays
    as (lat, lon) pairs in radians.

    Returns:
        (closest, closest_dist): for each source point, the index into
        candidates of its nearest neighbour and the haversine distance to it
        (not yet scaled by the Earth radius).
    """
    # Create the tree from the candidate points
    ball_tree = BallTree(candidates, leaf_size=10, metric='haversine')

    # Each result has one row per source point and one column per neighbour
    neighbor_dist, neighbor_idx = ball_tree.query(src_points, k=k_neighbors)

    # Column 0 is the closest neighbour (column 1 would be the second closest, etc.)
    return (neighbor_idx[:, 0], neighbor_dist[:, 0])

def nearest_neighbor(left_gdf, right_gdf, return_dist=True):
    """
    For each point in left_gdf, return the row of right_gdf holding the closest point.
    NOTICE: Assumes that the input Points are in WGS84 projection (lat/lon).

    The result has one row per row of left_gdf, in the same order, with a fresh
    0..n-1 index. When return_dist is True a 'distance' column (meters) is added.
    """
    def _to_radian_pairs(gdf):
        # (lat, lon) of every point geometry, converted from degrees to radians
        geom_col = gdf.geometry.name
        pairs = gdf[geom_col].apply(lambda geom: (geom.y * np.pi / 180, geom.x * np.pi / 180))
        return np.array(pairs.to_list())

    query_radians = _to_radian_pairs(left_gdf)
    candidate_radians = _to_radian_pairs(right_gdf)

    # Sequential index so that positions returned by the tree can be used as labels
    candidates = right_gdf.copy().reset_index(drop=True)

    # nearest_idx  ==> position in right_gdf of the closest point
    # angular_dist ==> haversine distance to it, in radians
    nearest_idx, angular_dist = get_nearest(src_points=query_radians, candidates=candidate_radians)

    # Pick the matching rows and renumber them to line up with left_gdf row order
    nearest_rows = candidates.loc[nearest_idx].reset_index(drop=True)

    if return_dist:
        # Convert to meters from radians
        earth_radius = 6371000  # meters
        nearest_rows['distance'] = angular_dist * earth_radius

    return nearest_rows

 


### Clean HDB Data

In [4]:
# Load the HDB resale transactions and drop exact duplicate rows
hdb_df = pd.read_csv("data/cleaned_final_data.csv") # OR USE searchData/limitData
hdb_df = hdb_df.drop_duplicates()
# hdb_df = hdb_df.sample(frac=0.0005, random_state=1) # GETS SMALL SAMPLE OF ROWS
# reset_index() keeps the old labels in an 'index' column; later cells rename and drop it
hdb_df = hdb_df.reset_index()

# Report incomplete rows instead of replacing hdb_df with them: the previous
# `hdb_df = hdb_df[hdb_df.isnull().any(axis=1)]` kept ONLY the rows containing
# nulls, discarding every complete transaction the later cells need.
rows_with_nulls = hdb_df[hdb_df.isnull().any(axis=1)]
print(hdb_df.dtypes)
print("Length:", len(hdb_df))
print("Rows with missing values:", len(rows_with_nulls))
hdb_df.head(2)

NameError: name 'pd' is not defined

In [53]:
def _lease_to_months(lease_text):
    # "61 years 04 months" -> 61 * 12 + 4; "61 years" -> 61 * 12
    parts = lease_text.split(' ')
    total_months = int(parts[0]) * 12
    if len(parts) > 2:
        total_months += int(parts[2])
    return total_months

# Split the "YYYY-MM" month column into integer year and month
year_month = hdb_df.month.str.split("-", expand = True)
hdb_df[['year', 'month']] = year_month
hdb_df['year'] = hdb_df['year'].astype(int)
hdb_df['month'] = hdb_df['month'].astype(int)

# Remaining lease expressed in months
hdb_df['month_remaining_lease'] = hdb_df['remaining_lease'].apply(_lease_to_months)

# Lower and upper floor of the "NN TO MM" storey range
storey_bounds = hdb_df['storey_range'].apply(lambda label: label.split('TO'))
hdb_df['storey_range_low'] = storey_bounds.apply(lambda bounds: int(bounds[0]))
hdb_df['storey_range_high'] = storey_bounds.apply(lambda bounds: int(bounds[1]))

# Encode flat_model as integer category codes
hdb_df['flat_model'] = hdb_df['flat_model'].astype('category').cat.codes

# Expand abbreviated street names so they can be geocoded
hdb_df['street_name'] = [get_full_addr(map_abbrv, name) for name in hdb_df['street_name']]

'''
# Add lat lon columns
hdb_df['lat'] = 0.0
hdb_df['lon'] = 0.0

# Add old data if it exist
try:
    old_hdb_db = pd.read_csv("data/clean_hdb_df.csv")
    print(F"Old data found! Rows: {str(len(old_hdb_db.index))}")
    if len(old_hdb_db.index) > 0:
        hdb_df = pd.concat([old_hdb_db, hdb_df])
        hdb_df = hdb_df.drop_duplicates()
        hdb_df = hdb_df.reset_index()
        del old_hdb_db
    print("Added old data SUCCESSFULLY!")
except FileNotFoundError:
    print("File not available")
'''

'\n# Add lat lon columns\nhdb_df[\'lat\'] = 0.0\nhdb_df[\'lon\'] = 0.0\n\n# Add old data if it exist\ntry:\n    old_hdb_db = pd.read_csv("data/clean_hdb_df.csv")\n    print(F"Old data found! Rows: {str(len(old_hdb_db.index))}")\n    if len(old_hdb_db.index) > 0:\n        hdb_df = pd.concat([old_hdb_db, hdb_df])\n        hdb_df = hdb_df.drop_duplicates()\n        hdb_df = hdb_df.reset_index()\n        del old_hdb_db\n    print("Added old data SUCCESSFULLY!")\nexcept FileNotFoundError:\n    print("File not available")\n'

In [54]:
# Get coords of each flat.
# Many transactions share the same block and street, so each distinct address is
# looked up once and reused instead of calling the API for every row.
coords_by_addr = {}
total_rows = len(hdb_df)
for row_num, (idx, row) in enumerate(hdb_df.iterrows(), start=1):
    addr = f"{row['block']} {row['street_name']}"
    if addr not in coords_by_addr:
        coords_by_addr[addr] = get_addr_coords(addr)
    lat, lon = coords_by_addr[addr]
    hdb_df.loc[idx, 'lat'] = lat
    hdb_df.loc[idx, 'lon'] = lon

    # Count rows by position so progress stays correct even if the index has gaps
    if row_num % 500 == 0:
        print(f"Processing {row_num}/{total_rows}")
        # Checkpoint everything geocoded so far
        hdb_df.loc[:idx].to_csv("data/clean_hdb_df_2015.csv", index=False)

# Final checkpoint so the last partial batch (< 500 rows) is saved as well
hdb_df.to_csv("data/clean_hdb_df_2015.csv", index=False)

Processing 500/147831
Processing 1000/147831
Location 216 CHOA CHU KANG CENTRAL not in this area, using first available
Processing 1500/147831
Processing 2000/147831
Processing 2500/147831
Processing 3000/147831
Processing 3500/147831
Processing 4000/147831
Processing 4500/147831
Processing 5000/147831
Processing 5500/147831
Processing 6000/147831
Processing 6500/147831
Processing 7000/147831
Processing 7500/147831
Processing 8000/147831
Processing 8500/147831
Processing 9000/147831
Processing 9500/147831
Processing 10000/147831
Processing 10500/147831
Processing 11000/147831
Processing 11500/147831
Processing 12000/147831
Processing 12500/147831
Processing 13000/147831
Processing 13500/147831
Processing 14000/147831
Processing 14500/147831
Processing 15000/147831
Processing 15500/147831
Processing 16000/147831
Processing 16500/147831
Processing 17000/147831
Processing 17500/147831
Processing 18000/147831
Processing 18500/147831
Processing 19000/147831
Processing 19500/147831
Processin

In [55]:
# Preview the cleaned rows, now including the geocoded lat/lon columns
hdb_df.head(5)

Unnamed: 0,index,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,year,month_remaining_lease,storey_range_low,storey_range_high,lat,lon
0,0,1,ANG MO KIO,2 ROOM,406,ANG MO KIO AVENUE 10,10 TO 12,44.0,5,1979,61 years 04 months,232000.0,2017,736,10,12,1.36200453938712,103.853879910407
1,1,1,ANG MO KIO,3 ROOM,108,ANG MO KIO AVENUE 4,01 TO 03,67.0,12,1978,60 years 07 months,250000.0,2017,727,1,3,1.37094273993861,103.837974822369
2,2,1,ANG MO KIO,3 ROOM,602,ANG MO KIO AVENUE 5,01 TO 03,67.0,12,1980,62 years 05 months,262000.0,2017,749,1,3,1.38070883044887,103.835368226602
3,3,1,ANG MO KIO,3 ROOM,465,ANG MO KIO AVENUE 10,04 TO 06,68.0,12,1980,62 years 01 month,265000.0,2017,745,4,6,1.3662010408294,103.857200967235
4,4,1,ANG MO KIO,3 ROOM,601,ANG MO KIO AVENUE 5,01 TO 03,67.0,12,1980,62 years 05 months,265000.0,2017,749,1,3,1.38104134784496,103.835131744823


### Clean MRT Data

In [56]:
# Read the station polygons and reduce each one to its centroid while still in
# the shapefile's own CRS, then reproject the points to WGS84 (lat/lon).
station_shapes = gpd.read_file("data/TrainStation_Feb2023/RapidTransitSystemStation.shp")
station_shapes["geometry"] = station_shapes["geometry"].centroid
station_points = station_shapes.to_crs(epsg=4326)

# Keep only the station name and its coordinates
mrt_stations = station_points.assign(
    lat=station_points.geometry.y,
    lon=station_points.geometry.x,
)[['STN_NAM_DE', 'lat', 'lon']]
mrt_stations.to_csv("data/mrt_stations_coords_2015.csv", index=False)
mrt_stations.head(5)

Unnamed: 0,STN_NAM_DE,lat,lon
0,ESPLANADE MRT STATION,1.29326,103.855612
1,PAYA LEBAR MRT STATION,1.317369,103.892272
2,DHOBY GHAUT MRT STATION,1.299044,103.845833
3,DAKOTA MRT STATION,1.308375,103.888668
4,LAVENDER MRT STATION,1.307372,103.862838


### Clean Shopping Mall Data

In [57]:
# Scrape the list of shopping malls from Wikipedia
url = "https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore"
response = requests.get(url, timeout = 5)
# Fail loudly on an error response instead of parsing an error page into an empty list
response.raise_for_status()
content = BeautifulSoup(response.content, "html.parser")

results = []
for block in content.find_all("div", class_="div-col"):
    for item in block.find_all("li"):
        # Non-greedy match removes each "[n]" citation marker on its own; the
        # greedy form would also delete any text between two markers.
        # Stripping lets names match manual_remove / manual_addr exactly.
        mall_name = re.sub(r"\[.*?\]", "", item.text).strip()
        if mall_name and mall_name not in manual_remove:
            results.append(mall_name)

mall_df = pd.DataFrame(results, columns=["mall_name"])
# Geocode every mall; malls that cannot be found get missing lat/lon
mall_df[['lat', 'lon']] = mall_df.apply(lambda x: pd.Series(get_addr_coords(x['mall_name'])), axis=1)
mall_df.to_csv("data/mall_coords_2015.csv", index=False)
mall_df.head(5)

Unnamed: 0,mall_name,lat,lon
0,100 AM,1.27468281482263,103.843488359469
1,313@Somerset,1.30100656917241,103.838246592796
2,Aperia,1.3104736675734,103.86431321816
3,Balestier Hill Shopping Centre,1.32559594839311,103.842571612968
4,Bugis Cube,1.2981408343975,103.855635339249


### Calculate and combine distance data

In [58]:
# Convert all hdb data to a geodataframe of WGS84 points (x = lon, y = lat)
hdb_points = gpd.points_from_xy(hdb_df.lon, hdb_df.lat)
block_locations = (
    gpd.GeoDataFrame(hdb_df.copy(), geometry=hdb_points, crs="EPSG:4326")
    .reset_index()
    .rename(columns={"index":"id"})
)

In [59]:
# Convert all mrt data to a geodataframe and find the nearest station for every flat
# NOTE(review): the recorded output lists 'BISHAN DEPOT' as a nearest_mrt, so the
# station layer appears to include depots - confirm whether they should be filtered out.
mrt_points = gpd.points_from_xy(mrt_stations.lon, mrt_stations.lat)
mrt_stations_final = gpd.GeoDataFrame(mrt_stations.copy(), geometry=mrt_points)
nn_mrt = nearest_neighbor(block_locations, mrt_stations_final)

# Meters -> kilometers
nn_mrt['distance'] = nn_mrt['distance'] / 1000
mrt_column_names = {"STN_NAM_DE": 'nearest_mrt', 'distance': 'min_dist_mrt'}
nn_mrt = nn_mrt[["STN_NAM_DE", "distance"]].rename(columns=mrt_column_names)

In [60]:
# Convert all mall data to a geodataframe and find the nearest mall for every flat.
# Malls that could not be geocoded have missing lat/lon; they are left out here
# because NaN coordinates make the nearest-neighbour tree fail to build.
mall_located = mall_df.dropna(subset=['lat', 'lon'])
mall_final = gpd.GeoDataFrame(mall_located.copy(), geometry=gpd.points_from_xy(mall_located.lon, mall_located.lat))
nn_mall = nearest_neighbor(block_locations, mall_final)

# Meters -> kilometers
nn_mall['distance'] = nn_mall['distance'] / 1000
nn_mall = nn_mall[["mall_name", "distance"]].rename(columns={"mall_name": 'nearest_mall', 'distance': 'min_dist_mall'})

In [61]:
# Put the flat data side by side with its nearest-MRT and nearest-mall columns.
# The three frames are joined on their index (axis=1).
final_df = pd.concat([block_locations, nn_mrt, nn_mall], axis=1)
# The helper id, geometry and raw coordinate columns are not needed in final_df
final_df.drop(columns=["id", "geometry", "lat", "lon"], inplace=True)
final_df.head(5)

Unnamed: 0,level_0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price,year,month_remaining_lease,storey_range_low,storey_range_high,nearest_mrt,min_dist_mrt,nearest_mall,min_dist_mall
0,0,1,ANG MO KIO,2 ROOM,406,ANG MO KIO AVENUE 10,10 TO 12,44.0,5,1979,61 years 04 months,232000.0,2017,736,10,12,BISHAN DEPOT,0.926375,AMK Hub,1.00161
1,1,1,ANG MO KIO,3 ROOM,108,ANG MO KIO AVENUE 4,01 TO 03,67.0,12,1978,60 years 07 months,250000.0,2017,727,1,3,MAYFLOWER MRT STATION,0.184322,Broadway Plaza,0.893355
2,2,1,ANG MO KIO,3 ROOM,602,ANG MO KIO AVENUE 5,01 TO 03,67.0,12,1980,62 years 05 months,262000.0,2017,749,1,3,LENTOR MRT STATION,0.48196,Broadway Plaza,1.52802
3,3,1,ANG MO KIO,3 ROOM,465,ANG MO KIO AVENUE 10,04 TO 06,68.0,12,1980,62 years 01 month,265000.0,2017,745,4,6,ANG MO KIO MRT STATION,0.94049,myVillage At Serangoon Garden,0.892903
4,4,1,ANG MO KIO,3 ROOM,601,ANG MO KIO AVENUE 5,01 TO 03,67.0,12,1980,62 years 05 months,265000.0,2017,749,1,3,LENTOR MRT STATION,0.453153,Broadway Plaza,1.5719


## 2. Get Distance from HDB to Town (City Hall MRT)

In [62]:
def get_distance(left_series, right_series):
    """Return the distance in kilometers between two records.

    Both arguments must provide "lat" and "lon" entries convertible to float.
    The distance is computed with geopy.distance.distance.
    """
    origin = (float(left_series["lat"]), float(left_series["lon"]))
    destination = (float(right_series["lat"]), float(right_series["lon"]))
    return geopy.distance.distance(origin, destination).km

In [63]:
# Distance from every flat to City Hall MRT station, used as the "town" reference point
city_hall_rows = mrt_stations.loc[mrt_stations["STN_NAM_DE"]=="CITY HALL MRT STATION"]
if city_hall_rows.empty:
    raise ValueError("CITY HALL MRT STATION not found in mrt_stations")
# Take the first match explicitly: squeeze() would return a DataFrame rather
# than a single record if the name matched more than one row.
city_hall_mrt_series = city_hall_rows.iloc[0]

town_dist_km = hdb_df.apply(lambda row: get_distance(row, city_hall_mrt_series), axis=1)
# Assign by position: final_df rows are in hdb_df row order but carry a fresh
# 0..n-1 index, so label alignment would silently produce NaN whenever the
# hdb_df index differs. A length mismatch raises instead of misaligning.
final_df["min_dist_town_in_km"] = town_dist_km.to_numpy()
final_df.head(5)

Unnamed: 0,level_0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,...,resale_price,year,month_remaining_lease,storey_range_low,storey_range_high,nearest_mrt,min_dist_mrt,nearest_mall,min_dist_mall,min_dist_town_in_km
0,0,1,ANG MO KIO,2 ROOM,406,ANG MO KIO AVENUE 10,10 TO 12,44.0,5,1979,...,232000.0,2017,736,10,12,BISHAN DEPOT,0.926375,AMK Hub,1.00161,7.638861
1,1,1,ANG MO KIO,3 ROOM,108,ANG MO KIO AVENUE 4,01 TO 03,67.0,12,1978,...,250000.0,2017,727,1,3,MAYFLOWER MRT STATION,0.184322,Broadway Plaza,0.893355,8.778283
2,2,1,ANG MO KIO,3 ROOM,602,ANG MO KIO AVENUE 5,01 TO 03,67.0,12,1980,...,262000.0,2017,749,1,3,LENTOR MRT STATION,0.48196,Broadway Plaza,1.52802,9.893601
3,3,1,ANG MO KIO,3 ROOM,465,ANG MO KIO AVENUE 10,04 TO 06,68.0,12,1980,...,265000.0,2017,745,4,6,ANG MO KIO MRT STATION,0.94049,myVillage At Serangoon Garden,0.892903,8.117683
4,4,1,ANG MO KIO,3 ROOM,601,ANG MO KIO AVENUE 5,01 TO 03,67.0,12,1980,...,265000.0,2017,749,1,3,LENTOR MRT STATION,0.453153,Broadway Plaza,1.5719,9.934792


## 3. Get population size in Singapore for that year

In [64]:
# Yearly total population of Singapore
population_df = pd.read_excel('data/sg_population_size_from_singstat.gov.sg.xlsx')
# .copy() so that the column assignment below does not write into a slice
population_df = population_df[["year","total_population"]].copy()

# Convert to a 2D array (n_years, 1), the shape StandardScaler expects
population_arr = np.asarray(population_df["total_population"]).reshape(-1,1)

# Standardization (Z-score normalization) over all years in the sheet
scaling=StandardScaler()
population_arr = scaling.fit_transform(population_arr)

# Write the scaled values back by position (independent of the frame's index)
population_df["total_population"] = population_arr.ravel()

# Merge on year explicitly: relying on "all common columns" would also pick up
# total_population as a key if this cell is run again.
final_df = pd.merge(final_df, population_df, on='year', how='left')

final_df.head(5)

Unnamed: 0,level_0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,...,year,month_remaining_lease,storey_range_low,storey_range_high,nearest_mrt,min_dist_mrt,nearest_mall,min_dist_mall,min_dist_town_in_km,total_population
0,0,1,ANG MO KIO,2 ROOM,406,ANG MO KIO AVENUE 10,10 TO 12,44.0,5,1979,...,2017,736,10,12,BISHAN DEPOT,0.926375,AMK Hub,1.00161,7.638861,1.653498
1,1,1,ANG MO KIO,3 ROOM,108,ANG MO KIO AVENUE 4,01 TO 03,67.0,12,1978,...,2017,727,1,3,MAYFLOWER MRT STATION,0.184322,Broadway Plaza,0.893355,8.778283,1.653498
2,2,1,ANG MO KIO,3 ROOM,602,ANG MO KIO AVENUE 5,01 TO 03,67.0,12,1980,...,2017,749,1,3,LENTOR MRT STATION,0.48196,Broadway Plaza,1.52802,9.893601,1.653498
3,3,1,ANG MO KIO,3 ROOM,465,ANG MO KIO AVENUE 10,04 TO 06,68.0,12,1980,...,2017,745,4,6,ANG MO KIO MRT STATION,0.94049,myVillage At Serangoon Garden,0.892903,8.117683,1.653498
4,4,1,ANG MO KIO,3 ROOM,601,ANG MO KIO AVENUE 5,01 TO 03,67.0,12,1980,...,2017,749,1,3,LENTOR MRT STATION,0.453153,Broadway Plaza,1.5719,9.934792,1.653498


## 4. Get Cost of living in Singapore by month and year

The Consumer Price Index (CPI) is designed to measure the average price changes in a fixed basket of consumption goods and services commonly purchased by the resident households over time. It is widely used as a measure of the consumer price inflation.

The base year is the period with which all the other periods are compared. The base year for the current Consumer Price Index (CPI) is 2019.

Data Source: https://data.gov.sg/api/action/datastore_search?resource_id=e7485f4b-eb19-45f2-a853-ec0e97ac1939

In [65]:
# Three-letter month abbreviation -> month number (1-12), used to parse the CPI dates
months_to_integer = {
    abbreviation: number
    for number, abbreviation in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}

In [66]:
# The CSV holds one column per month; transpose it so that every month becomes a
# row, then keep the date label and CPI value columns (skipping the first row).
cpi_wide = pd.read_csv('data/cpi_data.csv')
df_cpi = cpi_wide.transpose().reset_index().iloc[1:, [1,2]]
df_cpi.columns = ['date', 'cpi']

# Split each date label into its first two space-separated parts: year and month
date_parts = [str(label).split(" ")[:2] for label in df_cpi['date']]
df_cpi['year'] = [parts[0] for parts in date_parts]
# Change the month abbreviation to an integer
df_cpi['month'] = [months_to_integer[parts[1]] for parts in date_parts]
df_cpi = df_cpi.drop(columns='date')

# Convert to correct data type
df_cpi = df_cpi.astype({'year': int, 'month': int, 'cpi': float})

df_cpi.head(5)

Unnamed: 0,cpi,year,month
1,112.019,2023,2
2,111.397,2023,1
3,111.186,2022,12
4,110.959,2022,11
5,109.893,2022,10


In [67]:
# Attach the CPI of the transaction's year and month to every row of the base df
cpi_keys = ['year', 'month']
final_df = pd.merge(left=final_df, right=df_cpi, how='left', on=cpi_keys)
final_df.head(5)

Unnamed: 0,level_0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,...,month_remaining_lease,storey_range_low,storey_range_high,nearest_mrt,min_dist_mrt,nearest_mall,min_dist_mall,min_dist_town_in_km,total_population,cpi
0,0,1,ANG MO KIO,2 ROOM,406,ANG MO KIO AVENUE 10,10 TO 12,44.0,5,1979,...,736,10,12,BISHAN DEPOT,0.926375,AMK Hub,1.00161,7.638861,1.653498,99.026
1,1,1,ANG MO KIO,3 ROOM,108,ANG MO KIO AVENUE 4,01 TO 03,67.0,12,1978,...,727,1,3,MAYFLOWER MRT STATION,0.184322,Broadway Plaza,0.893355,8.778283,1.653498,99.026
2,2,1,ANG MO KIO,3 ROOM,602,ANG MO KIO AVENUE 5,01 TO 03,67.0,12,1980,...,749,1,3,LENTOR MRT STATION,0.48196,Broadway Plaza,1.52802,9.893601,1.653498,99.026
3,3,1,ANG MO KIO,3 ROOM,465,ANG MO KIO AVENUE 10,04 TO 06,68.0,12,1980,...,745,4,6,ANG MO KIO MRT STATION,0.94049,myVillage At Serangoon Garden,0.892903,8.117683,1.653498,99.026
4,4,1,ANG MO KIO,3 ROOM,601,ANG MO KIO AVENUE 5,01 TO 03,67.0,12,1980,...,749,1,3,LENTOR MRT STATION,0.453153,Broadway Plaza,1.5719,9.934792,1.653498,99.026


## 5. Get PSI reading of the area for month and year

In [68]:
# PSI readings endpoint
api = "https://api.data.gov.sg/v1/environment/psi"

# Full month name -> month number (1-12).
# Note: this rebinds months_to_integer, replacing the abbreviation-keyed
# mapping that the CPI section defined under the same name.
months_to_integer = {
    month_name: number
    for number, month_name in enumerate(
        ['January', 'February', 'March', 'April', 'May', 'June',
         'July', 'August', 'September', 'October', 'November', 'December'],
        start=1,
    )
}

# Region -> towns lookup built from the "town_region" entries of the JSON file
with open("data/sorted_towns.json") as towns_file:
    towns = json.load(towns_file)["town_region"]
town_dict = {entry["region"]: entry["towns"] for entry in towns}

In [69]:
def find_region(town_dict, town):
    """Return the region whose town collection contains town (compared in lowercase).

    town_dict maps region -> collection of town names. The first region, in
    dictionary order, that contains the lowercased town is returned. When no
    region matches, a message is printed and None is returned.
    """
    not_found = object()
    region = next(
        (name for name, towns in town_dict.items() if town.lower() in towns),
        not_found,
    )
    if region is not_found:
        print("Region not found for town: ", town)
        return None
    return region

# Add region column: look up the region of every row's town
final_df['region'] = [find_region(town_dict, town_name.lower()) for town_name in final_df['town']]

In [70]:
# Fetch the 'psi_twenty_four_hourly' reading of every region at 00:00 on the
# first day of each month, from January 2017 up to and including March 2023.
psi_frames = []
for year in range(2017, 2024):
    for month in range(1, 13):
        # Stop once April 2023 is reached
        if year == 2023 and month >= 4:
            break
        # Timestamp of the first day of the month in YYYY-MM-DD[T]HH:mm:ss (SGT) format
        date = datetime(year, month, 1)
        timestamp = date.strftime('%Y-%m-%dT%H:%M:%S')

        # Time out and fail on HTTP errors rather than hanging or parsing an error body
        response = requests.get(api, params={'date_time': timestamp}, timeout=30)
        response.raise_for_status()
        data = response.json()

        # A timestamp without data would otherwise fail with an opaque IndexError
        items = data.get('items') or []
        if not items or not items[0].get('readings'):
            print(f"No PSI reading returned for {timestamp}, skipping")
            continue
        readings = items[0]['readings']

        # One row per region, keeping only the 24-hourly PSI renamed to 'psi'
        temp_df = pd.DataFrame(readings)
        temp_df = temp_df[['psi_twenty_four_hourly']].rename(columns={'psi_twenty_four_hourly': 'psi'})
        temp_df['region'] = temp_df.index
        # Use the loop values directly; no month-name lookup (locale-dependent) needed
        temp_df['month'] = month
        temp_df['year'] = year
        psi_frames.append(temp_df)

# Concatenate once instead of growing the frame inside the loop
if psi_frames:
    psi_df = pd.concat(psi_frames, axis=0)
else:
    psi_df = pd.DataFrame(columns=['psi', 'region', 'month', 'year'])

# Save the data to a file in 'data' directory as csv
psi_df.to_csv('data/psi_data_2015.csv', index=False)

In [71]:
# Preview the collected PSI readings (one row per region, month and year)
psi_df.head(5)

Unnamed: 0,psi,region,month,year
national,39,national,1,2017
south,33,south,1,2017
north,36,north,1,2017
east,39,east,1,2017
central,29,central,1,2017


In [72]:
# Attach the PSI reading of the row's region, year and month to the base df
psi_keys = ['year', 'month', 'region']
final_df = pd.merge(left=final_df, right=psi_df, how='left', on=psi_keys)
final_df.head(5)

Unnamed: 0,level_0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,...,storey_range_high,nearest_mrt,min_dist_mrt,nearest_mall,min_dist_mall,min_dist_town_in_km,total_population,cpi,region,psi
0,0,1,ANG MO KIO,2 ROOM,406,ANG MO KIO AVENUE 10,10 TO 12,44.0,5,1979,...,12,BISHAN DEPOT,0.926375,AMK Hub,1.00161,7.638861,1.653498,99.026,central,29
1,1,1,ANG MO KIO,3 ROOM,108,ANG MO KIO AVENUE 4,01 TO 03,67.0,12,1978,...,3,MAYFLOWER MRT STATION,0.184322,Broadway Plaza,0.893355,8.778283,1.653498,99.026,central,29
2,2,1,ANG MO KIO,3 ROOM,602,ANG MO KIO AVENUE 5,01 TO 03,67.0,12,1980,...,3,LENTOR MRT STATION,0.48196,Broadway Plaza,1.52802,9.893601,1.653498,99.026,central,29
3,3,1,ANG MO KIO,3 ROOM,465,ANG MO KIO AVENUE 10,04 TO 06,68.0,12,1980,...,6,ANG MO KIO MRT STATION,0.94049,myVillage At Serangoon Garden,0.892903,8.117683,1.653498,99.026,central,29
4,4,1,ANG MO KIO,3 ROOM,601,ANG MO KIO AVENUE 5,01 TO 03,67.0,12,1980,...,3,LENTOR MRT STATION,0.453153,Broadway Plaza,1.5719,9.934792,1.653498,99.026,central,29


## FINAL DATA FORMAT

In [73]:
# Columns of the exported dataset, in output order (resale_price last)
output_columns = [
    'year', 'month', 'region', 'town', 'block', 'street_name',
    'lease_commence_date', 'month_remaining_lease',
    'flat_type', 'flat_model', 'floor_area_sqm',
    'storey_range_low', 'storey_range_high',
    'nearest_mrt', 'min_dist_mrt',
    'nearest_mall', 'min_dist_mall',
    'min_dist_town_in_km', 'total_population', 'cpi', 'psi',
    'resale_price',
]

# Drop exact duplicate rows first, then keep only the output columns
final_df = final_df.drop_duplicates()
final_df = final_df[output_columns]
final_df.to_csv('data/cleaned_final_data_2015.csv', index=False)
final_df.head(5)

Unnamed: 0,year,month,region,town,block,street_name,lease_commence_date,month_remaining_lease,flat_type,flat_model,...,storey_range_high,nearest_mrt,min_dist_mrt,nearest_mall,min_dist_mall,min_dist_town_in_km,total_population,cpi,psi,resale_price
0,2017,1,central,ANG MO KIO,406,ANG MO KIO AVENUE 10,1979,736,2 ROOM,5,...,12,BISHAN DEPOT,0.926375,AMK Hub,1.00161,7.638861,1.653498,99.026,29,232000.0
1,2017,1,central,ANG MO KIO,108,ANG MO KIO AVENUE 4,1978,727,3 ROOM,12,...,3,MAYFLOWER MRT STATION,0.184322,Broadway Plaza,0.893355,8.778283,1.653498,99.026,29,250000.0
2,2017,1,central,ANG MO KIO,602,ANG MO KIO AVENUE 5,1980,749,3 ROOM,12,...,3,LENTOR MRT STATION,0.48196,Broadway Plaza,1.52802,9.893601,1.653498,99.026,29,262000.0
3,2017,1,central,ANG MO KIO,465,ANG MO KIO AVENUE 10,1980,745,3 ROOM,12,...,6,ANG MO KIO MRT STATION,0.94049,myVillage At Serangoon Garden,0.892903,8.117683,1.653498,99.026,29,265000.0
4,2017,1,central,ANG MO KIO,601,ANG MO KIO AVENUE 5,1980,749,3 ROOM,12,...,3,LENTOR MRT STATION,0.453153,Broadway Plaza,1.5719,9.934792,1.653498,99.026,29,265000.0
