In [1]:
# ANSELLIM
# 4 NOV 2021

Given the HDB dataset (with latitude and longitude for each block) and coordinate data for various places of interest (taxi stands, schools, MRT stations, hawker centers, etc.), calculate the number of places of interest within a 1 km radius *of each HDB block*.

In [2]:
# Search radius around each HDB block, in kilometers.
RADIUS = 1.0

In [3]:
import os
import re
import time
from datetime import datetime

import pandas as pd
from geopy.distance import geodesic

In [4]:
# Record when this run was started (naive clock of the kernel host).
timestamp = datetime.strftime(datetime.now(), "%d/%m/%Y %H:%M:%S")
print(timestamp)  # GMT

04/11/2021 04:15:46


In [5]:
# Mount Google Drive at /content/gdrive so the project files stored there can be
# read and written. This cell only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
# Make the project data folder the working directory; the relative "./..." paths
# used by the read_csv / to_csv calls in this notebook resolve against it.
%cd /content/gdrive/MyDrive/6242_PROJECT/data

/content/gdrive/MyDrive/6242_PROJECT/data


In [7]:
# HDB blocks table -- always load the aggregated file (hdb_aggregated).
hdb = pd.read_csv("./hdb_aggregated.csv")

# Places-of-interest tables, one CSV per category.
malls = pd.read_csv("./data_malls.csv")
taxi_stands = pd.read_csv("./taxi_stands.csv")
primary_schools = pd.read_csv("./data_prischools.csv")
mrt = pd.read_csv("./data_MRT.csv")
hawker = pd.read_csv("./data_hawker.csv")
carparks = pd.read_csv("./carparks.csv")
bus_stops = pd.read_csv("./bus_stops.csv")
# `amenities` mixes several facility types; it is split by type further down.
amenities = pd.read_csv("./amenities.csv")
supermarkets = pd.read_csv("./supermarkets.csv")
secondary_schools = pd.read_csv("./secondary_schools.csv")
eating_establishments = pd.read_csv("./eating_establishments.csv")
parks = pd.read_csv("./parks.csv")

In [8]:
# Number of rows in the HDB table as loaded (before any filtering).
print(len(hdb))

9148


In [9]:
# Preview the loaded HDB table: block name, aggregated size/price columns,
# lat/long and the isLatLongAvailable flag that the next cell filters on.
hdb

Unnamed: 0.1,Unnamed: 0,block,floor_area_sqm,resale_price,price_per_sqm,lat,long,isLatLongAvailable
0,0,1 BEACH RD,68.312500,3.424305e+05,5015.688209,1.3036713506088,103.864478660925,True
1,1,1 BEDOK STH AVE 1,60.882353,2.887059e+05,4735.584292,1.32085208990817,103.933721096337,True
2,2,1 CHAI CHEE RD,129.818182,5.250909e+05,4048.298804,1.32796879176302,103.922716018139,True
3,3,1 CHANGI VILLAGE RD,67.625000,2.868750e+05,4243.898204,1.3886100383707,103.988093482829,True
4,4,1 DELTA AVE,119.000000,8.192800e+05,6884.705882,1.2920752508431,103.828584077626,True
...,...,...,...,...,...,...,...,...
9143,9143,99A LOR 2 TOA PAYOH,144.500000,8.667500e+05,6001.842396,1.33878534348177,103.846873055688,True
9144,9144,99B LOR 2 TOA PAYOH,144.333333,9.300000e+05,6444.204981,1.33874496491842,103.84725279468,True
9145,9145,99C LOR 2 TOA PAYOH,148.500000,9.365000e+05,6305.369128,1.33901612961753,103.84744935603,True
9146,9146,9A BOON TIONG RD,101.090909,1.019358e+06,10088.400551,1.2868271524179,103.828659487514,True


In [10]:
# Keep only the blocks that have coordinates available.
# drop=True discards the old row labels instead of inserting them as an extra
# "index" column (which became "level_0", then an error, on repeated runs of
# this cell), so the table keeps its schema, gets a clean 0..n-1 index, and the
# cell can be re-run safely. Building a new frame instead of resetting in place
# also avoids mutating a filtered slice.
hdb = hdb[hdb["isLatLongAvailable"] == True].reset_index(drop=True)

In [11]:
# Number of rows left after keeping only blocks with coordinates.
print(len(hdb))

9126


In [12]:
# Optional debugging aid (disabled): uncomment to run the rest of the notebook
# on a random sample of SUBSET blocks instead of the full HDB table.
# import numpy as np
# SUBSET=2
# np.random.permutation(hdb.shape[0])[:SUBSET]
# hdb=hdb.iloc[np.random.permutation(hdb.shape[0])[:SUBSET],:]
# hdb.reset_index(inplace=True)

In [13]:
# Report how many blocks go into the distance computation.
print("Number of HDB blocks with lat & long info:", len(hdb))

Number of HDB blocks with lat & long info: 9126


In [14]:
# Work on a deep copy so the feature columns are added without touching `hdb`.
df = hdb.copy(deep=True)

# Give every places-of-interest table the same "lat" / "long" column names.
taxi_stands["lat"], taxi_stands["long"] = taxi_stands["Latitude"], taxi_stands["Longitude"]
carparks['lat'], carparks['long'] = carparks['latitude'], carparks['longitude']
bus_stops['lat'], bus_stops['long'] = bus_stops['Latitude'], bus_stops['Longitude']

# These tables hold the position as one comma-separated string, longitude first.
# n=1 is passed by keyword: pandas 2.x no longer accepts it positionally.
primary_schools[['long', 'lat']] = primary_schools['coordinates'].str.split(',', n=1, expand=True)
mrt[['long', 'lat']] = mrt['Coordinates'].str.split(',', n=1, expand=True)
hawker[['long', 'lat']] = hawker['Coordinates'].str.split(',', n=1, expand=True)

# The hawker strings carry a trailing third component (e.g. ",0.0") that ends up
# in 'lat' after the split above. Keep only the text before the next comma:
# rstrip(",0.0") strips a *set of characters*, not a suffix, so it only worked
# when the trailing component happened to consist of '0', '.' and ','.
hawker['lat'] = hawker['lat'].str.split(',', n=1).str[0]

In [15]:
# Split the combined amenities table into three tables by facility type.
facility_type = amenities['facility_type']

# Every facility type other than the two removed here counts as a sports facility.
sports_facility_types = list(facility_type.unique())
for other_type in ('CHAS Clinic', 'Community Centre'):
  sports_facility_types.remove(other_type)  # raises ValueError if the type is absent

is_chas_clinic = facility_type == 'CHAS Clinic'
is_sports_facility = facility_type.isin(sports_facility_types)
is_community_center = facility_type == 'Community Centre'

# reset_index() keeps the original row labels in an "index" column.
chas_clinics = amenities[is_chas_clinic].reset_index()
sports_facilities = amenities[is_sports_facility].reset_index()
community_centers = amenities[is_community_center].reset_index()

In [16]:
# Feature tables in processing order, paired with the name used for each output
# column ("num_<name>"). `places` and `places_names` are parallel lists.
_named_places = [
  ('malls', malls),
  ('taxi_stands', taxi_stands),
  ('primary_schools', primary_schools),
  ('mrt', mrt),
  ('hawker', hawker),
  ('carparks', carparks),
  ('bus_stops', bus_stops),
  ('chas_clinics', chas_clinics),
  ('sports_facilities', sports_facilities),
  ('community_centers', community_centers),
  ('supermarkets', supermarkets),
  ('secondary_schools', secondary_schools),
  ('eating_establishments', eating_establishments),
  ('parks', parks),
]
places = [table for _, table in _named_places]
places_names = [name for name, _ in _named_places]

In [17]:
def _parse_coordinates(frame):
  """Return the (lat, long) pairs of `frame` as floats, in row order.

  Rows whose "lat"/"long" values cannot be converted to float are skipped, so
  they are never counted as being within the radius of any block.
  """
  pairs = []
  for lat, lon in zip(frame["lat"], frame["long"]):
    try:
      pairs.append((float(lat), float(lon)))
    except (TypeError, ValueError):
      continue
  return pairs


def _count_within_radius(origin, targets, radius_km):
  """Count how many `targets` lie within `radius_km` (geodesic) of `origin`.

  `origin` is a (lat, long) tuple of floats and `targets` a list of such tuples.
  Targets that geodesic() rejects with ValueError are skipped.
  """
  origin_lat = origin[0]
  # Cheap pre-filter: one degree of latitude is never shorter than ~110.57 km,
  # so two points more than radius_km/110 degrees apart in latitude cannot be
  # within radius_km of each other. The exact geodesic distance is only
  # computed for the remaining candidates, so the counts are unchanged.
  max_lat_gap = radius_km / 110.0
  count = 0
  for target in targets:
    # Written as "not (<=)" so NaN latitudes (comparison is False) are skipped.
    if not abs(target[0] - origin_lat) <= max_lat_gap:
      continue
    try:
      if geodesic(origin, target).km <= radius_km:
        count += 1
    except ValueError:
      continue
  return count


# For every feature table, add a "num_<name>" column to `df` holding the number
# of places within RADIUS km of each HDB block, checkpointing after each table.

# Make sure the checkpoint folder exists before doing any expensive work.
os.makedirs('./processed', exist_ok=True)

start = time.time()
interm = time.time()
for dataframe, place_name in zip(places, places_names):
  print("-------------------------")
  print("Working on feature dataframe {}".format(place_name))
  print("Number of places of interest:", dataframe.shape[0])
  print("Estimated number of pairwise computations:", dataframe.shape[0] * df.shape[0])
  new_column_name = "num_" + place_name

  # Parse the coordinates once per table instead of once per (block, place)
  # pair; the feature table itself is left unmodified.
  targets = _parse_coordinates(dataframe)

  counts = []
  for block_lat, block_long in zip(df["lat"], df["long"]):
    try:
      origin = (float(block_lat), float(block_long))
    except (TypeError, ValueError):
      # A block without usable coordinates has no measurable neighbours.
      counts.append(0)
      continue
    counts.append(_count_within_radius(origin, targets, RADIUS))
  df[new_column_name] = counts

  df.to_csv('./processed/df_hdb.csv') # Checkpointed
  print("Saved checkpoint")
  print("Completed working on feature dataframe {}".format(place_name))
  print("Time taken for this feature dataframe (seconds):", time.time() - interm)
  interm = time.time()
  print("Total time elapsed since start (seconds):", (time.time() - start))
end = time.time()
print("---------------------------------------")
print("END OF ALL TASKS")
print("time taken (seconds): {}".format(end - start))

-------------------------
Working on feature dataframe malls
Number of places of interest: 169
Time elapsed (seconds): 395.63158202171326
-------------------------
Working on feature dataframe taxi_stands
Number of places of interest: 279
Time elapsed (seconds): 1057.5534782409668
-------------------------
Working on feature dataframe primary_schools
Number of places of interest: 186
Time elapsed (seconds): 1500.5730938911438
-------------------------
Working on feature dataframe mrt
Number of places of interest: 189
Time elapsed (seconds): 1961.7290267944336
-------------------------
Working on feature dataframe hawker
Number of places of interest: 119
Time elapsed (seconds): 2257.433905363083
-------------------------
Working on feature dataframe carparks
Number of places of interest: 500
Time elapsed (seconds): 3471.4846992492676
-------------------------
Working on feature dataframe bus_stops
Number of places of interest: 500
Time elapsed (seconds): 4650.53542637825
---------------

In [None]:
# Final save of the HDB table with all "num_<place>" count columns, written to
# the same file the per-feature checkpoints use.
df.to_csv('./processed/df_hdb.csv')