# Preparing new data's geographical values

We downloaded the new 'April HDB.csv' resale data file from data.gov.sg to test our model.

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

import re

In [2]:
# Load the newly downloaded resale transactions that the rest of the notebook enriches.
df = pd.read_csv('April HDB.csv')

In [3]:
# Inspect the raw file: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 931 entries, 0 to 930
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   month                931 non-null    object 
 1   town                 931 non-null    object 
 2   flat_type            931 non-null    object 
 3   block                931 non-null    object 
 4   street_name          931 non-null    object 
 5   storey_range         931 non-null    object 
 6   floor_area_sqm       931 non-null    float64
 7   flat_model           931 non-null    object 
 8   lease_commence_date  931 non-null    int64  
 9   remaining_lease      931 non-null    int64  
 10  resale_price         931 non-null    float64
dtypes: float64(2), int64(2), object(7)
memory usage: 80.1+ KB


In [7]:
# Start with an all-missing Latitude column; the geocoding loop below uses
# "Latitude is null" as its marker for rows that still need a lookup.
df['Latitude'] = np.nan

# Extracting Lat & Long data

In [8]:
%%time
import json
import requests
import urllib.parse
import time

# Geocode every flat that does not yet have a latitude using the OneMap search API.
# The loop is resumable: rows that already hold a Latitude are skipped, so the cell
# can simply be re-run after a failure without repeating finished lookups.
# NOTE(review): OneMap has been moving its APIs to the onemap.gov.sg domain —
# confirm this endpoint is still served before relying on it.
ONEMAP_TIMEOUT_SECONDS = 30  # fail fast instead of hanging forever on a stalled connection

# Snapshot the missing-latitude mask once. Row i is only written after it has been
# checked, so the snapshot stays valid and the column is not rescanned for every row.
needs_geocoding = df['Latitude'].isnull()

for i in range(df.shape[0]):
    if not needs_geocoding.iloc[i]:
        continue

    # Search string is "<street name> <block>", URL-encoded for the query parameter.
    address = df['street_name'][i]+" "+df['block'][i]
    query_address = urllib.parse.quote(address)
    query_string = 'https://developers.onemap.sg/commonapi/search?searchVal='+str(query_address)+'&returnGeom=Y&getAddrDetails=Y&pageNum=1'
    resp = requests.get(query_string, timeout=ONEMAP_TIMEOUT_SECONDS)
    # Surface HTTP failures (rate limiting, outages) as a clear HTTPError here rather
    # than as a confusing JSON/KeyError further down.
    resp.raise_for_status()

    # Convert JSON into Python Object
    data = json.loads(resp.content)

    if data['found'] == 0:
        print("no data in row:", i)
    else:
        # The last hit of the first result page is used. Values are stored exactly as
        # returned, without numeric conversion, because the 'LatLong' key built later
        # relies on their original text form.
        match = data['results'][-1]
        df.loc[i, 'Longitude'] = match['LONGITUDE']
        df.loc[i, 'Latitude'] = match['LATITUDE']
        df.loc[i, 'Address'] = match['ADDRESS']
        df.loc[i, 'Postal code'] = match['POSTAL']

    # Pause after every 250th row to stay under the API's rate limit.
    if (i+1) % 250 == 0:
        print("Sleeping...", i)
        time.sleep(15)

Sleeping... 249
Sleeping... 499
Sleeping... 749
CPU times: user 16.5 s, sys: 1.35 s, total: 17.8 s
Wall time: 3min 17s


# Extracting malls data

In [9]:
# Mall reference table; the cells below read its 'Mall', 'Latitude' and 'Longitude' columns.
malls_df = pd.read_csv('malls_data.csv')

In [10]:
%%time
from geopy.distance import geodesic

# Function to find nearest mall and distance to nearest mall
def find_nearest_mall(lat, long):
    """Find the mall closest to a point.

    Parameters:
        lat, long: coordinates of the point, in whatever form `geodesic` accepts.

    Returns:
        pd.Series with 'nearest mall' (value of the 'Mall' column of the closest
        row in `malls_df`) and 'mall nearest distance' (geodesic distance in km).
    """
    origin = (lat, long)

    def km_to(mall):
        # Geodesic distance in kilometres from the query point to one mall row.
        return geodesic(origin, (mall['Latitude'], mall['Longitude'])).km

    distances = malls_df.apply(km_to, axis=1)
    nearest_mall_index = distances.idxmin()
    return pd.Series({
        'nearest mall': malls_df.loc[nearest_mall_index, 'Mall'],
        'mall nearest distance': float(distances.min()),
    })

# Function to find number of malls within a certain radius
def count_malls_within_radius(lat, long, radius):
    """Count the rows of `malls_df` whose geodesic distance from (lat, long) is at most `radius` km."""
    origin = (lat, long)

    def km_to(mall):
        # Geodesic distance in kilometres from the query point to one mall row.
        return geodesic(origin, (mall['Latitude'], mall['Longitude'])).km

    is_within = malls_df.apply(km_to, axis=1).le(radius)
    return is_within.sum()

# Apply the functions to create the new columns
# Nearest mall (name + distance in km) for every flat.
nearest_mall_info = df.apply(
    lambda flat: find_nearest_mall(flat['Latitude'], flat['Longitude']),
    axis=1,
)
df[['nearest mall', 'mall nearest distance']] = nearest_mall_info

# Mall counts inside each radius (km); one output column per radius.
for column_name, radius_km in (('mall within 500m', 0.5), ('mall within 1km', 1)):
    df[column_name] = df.apply(
        lambda flat, r=radius_km: count_malls_within_radius(flat['Latitude'], flat['Longitude'], r),
        axis=1,
    )


CPU times: user 2min 10s, sys: 282 ms, total: 2min 10s
Wall time: 2min 11s


# Extracting hawkers data

In [11]:
# Hawker centre reference table; the cell below reads its 'Latitude' and 'Longitude' columns.
df_hawkers = pd.read_csv('hawkers_data.csv')

In [12]:
%%time
from geopy.distance import geodesic

# Function to find number of hawkers within a certain radius
def count_hawkers_within_radius(lat, long, radius):
    """Count the rows of `df_hawkers` whose geodesic distance from (lat, long) is at most `radius` km."""
    origin = (lat, long)

    def km_to(hawker):
        # Geodesic distance in kilometres from the query point to one hawker row.
        return geodesic(origin, (hawker['Latitude'], hawker['Longitude'])).km

    is_within = df_hawkers.apply(km_to, axis=1).le(radius)
    return is_within.sum()

# Apply the function to create the new column
hawkers_within_1km = df.apply(
    lambda flat: count_hawkers_within_radius(flat['Latitude'], flat['Longitude'], 1),
    axis=1,
)
df['hawkers within 1km'] = hawkers_within_1km

CPU times: user 32.2 s, sys: 53.8 ms, total: 32.3 s
Wall time: 32.3 s


# Extracting schools data

In [13]:
# School reference table; the cell below reads its 'Latitude' and 'Longitude' columns.
df_schools = pd.read_csv('schools_data.csv')

In [14]:
%%time
from geopy.distance import geodesic

# Create a list of tuples containing the school coordinates
# Pair up the two coordinate columns once, so the per-flat loop works on plain tuples.
school_coords = list(zip(df_schools['Latitude'], df_schools['Longitude']))

# Function to find number of schools within a certain radius
def count_schools_within_radius(lat, long, radius):
    """Count the entries of `school_coords` whose geodesic distance from (lat, long) is at most `radius` km."""
    origin = (lat, long)
    return sum(geodesic(origin, school).km <= radius for school in school_coords)

# Apply the function to create the new column
flat_points = zip(df['Latitude'], df['Longitude'])
df['schools within 1km'] = [
    count_schools_within_radius(flat_lat, flat_long, 1)
    for flat_lat, flat_long in flat_points
]

CPU times: user 1min 18s, sys: 121 ms, total: 1min 18s
Wall time: 1min 18s


# Extracting MRT data

In [15]:
# MRT/LRT station reference table; the cell below reads its 'lat', 'lng' and 'station_name' columns.
mrt_df = pd.read_csv('mrt_lrt_data.csv')

In [16]:
from geopy.distance import geodesic

# Row-wise helper for DataFrame.apply: nearest station and its distance for one flat
def find_nearest_mrt(row):
    """Find the station in `mrt_df` closest to the flat described by `row`.

    Parameters:
        row: mapping/Series with 'Latitude' and 'Longitude' entries.

    Returns:
        pd.Series with 'nearest MRT' (the station's 'station_name') and
        'distance to nearest MRT' (geodesic distance in km). When `mrt_df` has no
        rows the result is an empty name and an infinite distance.
    """
    flat_location = (row['Latitude'], row['Longitude'])

    # Lazily pair each station's distance with its name; the first smallest distance wins.
    candidates = (
        (geodesic(flat_location, (station['lat'], station['lng'])).km, station['station_name'])
        for _, station in mrt_df.iterrows()
    )
    best_distance, best_station = min(
        candidates,
        key=lambda candidate: candidate[0],
        default=(float('inf'), ''),
    )

    return pd.Series({'nearest MRT': best_station, 'distance to nearest MRT': best_distance})

nearest_mrt_info = df.apply(find_nearest_mrt, axis=1)
df[['nearest MRT', 'distance to nearest MRT']] = nearest_mrt_info

In [18]:
# Re-inspect the frame after the geocoding and amenity columns have been added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 931 entries, 0 to 930
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   month                    931 non-null    object 
 1   town                     931 non-null    object 
 2   flat_type                931 non-null    object 
 3   block                    931 non-null    object 
 4   street_name              931 non-null    object 
 5   storey_range             931 non-null    object 
 6   floor_area_sqm           931 non-null    float64
 7   flat_model               931 non-null    object 
 8   lease_commence_date      931 non-null    int64  
 9   remaining_lease          931 non-null    int64  
 10  resale_price             931 non-null    float64
 11  Latitude                 931 non-null    object 
 12  Longitude                931 non-null    object 
 13  Address                  931 non-null    object 
 14  Postal code              9

In [19]:
# Text key of the form "<latitude>,<longitude>" for each flat.
latitude_text = df['Latitude'].astype(str)
longitude_text = df['Longitude'].astype(str)
df['LatLong'] = latitude_text.str.cat(longitude_text, sep=',')

In [21]:
# Lookup table that is merged onto df by 'LatLong' in the next cell.
# NOTE(review): presumably produced by an earlier notebook — confirm provenance.
counts_df = pd.read_csv('counts_df.csv')

In [22]:
# Drop the lookup columns that are not wanted on df; 'nearest MRT' would otherwise
# collide with the column of the same name already computed on df.
# NOTE(review): this cell is not idempotent — running it a second time raises a
# KeyError on the drop, and a repeated merge would add suffixed duplicate columns.
counts_df = counts_df.drop(columns=['Count', 'nearest MRT'])
# Left merge keeps every flat; flats whose LatLong is absent from counts_df get
# missing values in the merged-in columns.
df = pd.merge(df, counts_df, on='LatLong', how='left')
df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,...,nearest mall,mall nearest distance,mall within 500m,mall within 1km,hawkers within 1km,schools within 1km,nearest MRT,distance to nearest MRT,LatLong,travel time to RP in minutes
0,2023-04,ANG MO KIO,3 ROOM,220,ANG MO KIO AVE 1,01 TO 03,82.0,New Generation,1977,53,...,Broadway Plaza,0.926337,0,2,5,3,Ang Mo Kio,1.122197,"1.36558833593063,103.840518883254",53.0
1,2023-04,ANG MO KIO,3 ROOM,542,ANG MO KIO AVE 10,01 TO 03,68.0,New Generation,1981,56,...,Jubilee Square,0.952383,0,1,2,1,Ang Mo Kio,0.872607,"1.37473841331656,103.855876768354",44.0
2,2023-04,ANG MO KIO,3 ROOM,541,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1981,56,...,Jubilee Square,0.897888,0,2,2,2,Ang Mo Kio,0.797843,"1.37392238703482,103.855621370524",42.0
3,2023-04,ANG MO KIO,3 ROOM,405,ANG MO KIO AVE 10,01 TO 03,82.0,New Generation,1979,55,...,AMK Hub,1.030574,0,0,3,3,Ang Mo Kio,1.045207,"1.36157912717139,103.853805386957",40.0
4,2023-04,ANG MO KIO,3 ROOM,456,ANG MO KIO AVE 10,04 TO 06,89.0,New Generation,1980,55,...,myVillage At Serangoon Garden,0.758023,0,1,4,3,Ang Mo Kio,1.057960,"1.36716071992678,103.858658520644",44.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
926,2023-04,YISHUN,5 ROOM,850,YISHUN ST 81,07 TO 09,122.0,Improved,1988,64,...,Wisteria Mall,0.598721,0,1,0,6,Khatib,0.392469,"1.41603095097186,103.836239123144",42.0
927,2023-04,YISHUN,EXECUTIVE,258,YISHUN ST 22,01 TO 03,154.0,Maisonette,1985,61,...,Junction Nine,0.309598,1,2,0,7,Yishun,0.805676,"1.43515573230958,103.839804271982",53.0
928,2023-04,YISHUN,EXECUTIVE,293,YISHUN ST 22,01 TO 03,169.0,Apartment,1992,68,...,Junction Nine,0.526966,0,2,0,8,Yishun,0.758752,"1.43591598491658,103.837858420722",54.0
929,2023-04,YISHUN,EXECUTIVE,723,YISHUN ST 71,10 TO 12,142.0,Apartment,1986,62,...,Northpoint City,0.719731,0,1,1,5,Yishun,0.696611,"1.42601954522146,103.82993946372",47.0


In [23]:
# Check the column list and non-null counts after the merge.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 931 entries, 0 to 930
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   month                         931 non-null    object 
 1   town                          931 non-null    object 
 2   flat_type                     931 non-null    object 
 3   block                         931 non-null    object 
 4   street_name                   931 non-null    object 
 5   storey_range                  931 non-null    object 
 6   floor_area_sqm                931 non-null    float64
 7   flat_model                    931 non-null    object 
 8   lease_commence_date           931 non-null    int64  
 9   remaining_lease               931 non-null    int64  
 10  resale_price                  931 non-null    float64
 11  Latitude                      931 non-null    object 
 12  Longitude                     931 non-null    object 
 13  Addre

In [24]:
# Checkpoint: persist the enriched frame (without the index) before the travel-time lookups.
df.to_csv('AprilDataProcessed.csv',index=False) #Checkpoint

In [26]:
# Show the flats that still have no travel time after the left merge.
df[df['travel time to RP in minutes'].isnull()]

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,...,nearest mall,mall nearest distance,mall within 500m,mall within 1km,hawkers within 1km,schools within 1km,nearest MRT,distance to nearest MRT,LatLong,travel time to RP in minutes
48,2023-04,BEDOK,3 ROOM,33,BEDOK STH AVE 2,01 TO 03,73.0,New Generation,1978,54,...,Bedok Mall,1.117644,0,0,4,5,Tanah Merah,0.956256,"1.32283703302242,103.939124525951",
49,2023-04,BEDOK,3 ROOM,33,BEDOK STH AVE 2,04 TO 06,88.0,New Generation,1978,54,...,Bedok Mall,1.117644,0,0,4,5,Tanah Merah,0.956256,"1.32283703302242,103.939124525951",
56,2023-04,BEDOK,4 ROOM,133,BEDOK NTH AVE 3,13 TO 15,92.0,New Generation,1978,54,...,Djitsun Mall Bedok,0.706048,0,3,6,7,Bedok,0.728124,"1.32756257230307,103.93573462699",
63,2023-04,BEDOK,4 ROOM,187B,BEDOK NTH ST 4,10 TO 12,93.0,Model A,2018,94,...,Djitsun Mall Bedok,1.258851,0,0,3,9,Tanah Merah,0.803122,"1.3304987981324,103.939995686405",
64,2023-04,BEDOK,4 ROOM,187B,BEDOK NTH ST 4,07 TO 09,93.0,Model A,2018,94,...,Djitsun Mall Bedok,1.258851,0,0,3,9,Tanah Merah,0.803122,"1.3304987981324,103.939995686405",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
865,2023-04,YISHUN,3 ROOM,101,YISHUN AVE 5,01 TO 03,68.0,New Generation,1978,54,...,Northpoint City,0.977055,0,1,1,4,Yishun,0.831863,"1.43053231016112,103.827618665577",
871,2023-04,YISHUN,3 ROOM,165,YISHUN RING RD,07 TO 09,64.0,Simplified,1988,64,...,Canberra Plaza,0.868914,0,2,1,9,Yishun,0.766457,"1.43577180513264,103.831783821967",
876,2023-04,YISHUN,3 ROOM,235,YISHUN ST 21,04 TO 06,67.0,New Generation,1985,61,...,Junction Nine,0.368952,1,2,0,7,Yishun,0.637393,"1.43422889283755,103.838543994351",
880,2023-04,YISHUN,3 ROOM,761,YISHUN ST 72,04 TO 06,64.0,Simplified,1986,62,...,Northpoint City,0.433026,1,1,1,9,Yishun,0.511991,"1.42539842441665,103.833259035976",


In [57]:
%%time
import googlemaps
import time
from datetime import datetime
import urllib.parse
import gmap_key

# Define your API key and initialize the client
gmaps = googlemaps.Client(key=gmap_key.key)

# Request parameters shared by every lookup (hoisted out of the loop).
destination = 'Raffles Place MRT Station, Singapore'
mode = 'transit'
# Fixed departure time so every flat is queried under the same conditions.
now = datetime(2023,4,14,12,0,0)


def _trip_minutes(directions_result):
    """Return the duration of the first leg of the first route, in whole minutes.

    Uses the numeric `duration['value']` (seconds) rather than the display string
    `duration['text']`, so the target column stays numeric like the merged-in values.
    Raises IndexError/KeyError when the response holds no usable route.
    """
    seconds = directions_result[0]['legs'][0]['duration']['value']
    return round(seconds / 60)


# Fill in the travel time for every flat that did not get one from the merge.
for i in list(df[df['travel time to RP in minutes'].isnull()].index):

    # `i` is an index label, so read with .loc to match the .loc writes below.
    origin = df.loc[i, 'LatLong']

    try:
        # Call the Directions API to get the directions and travel time
        directions_result = gmaps.directions(origin, destination, mode=mode, departure_time=now)
        df.loc[i, 'travel time to RP in minutes'] = _trip_minutes(directions_result)
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
        try:
            # Fallback: route between Raffles Place and the flat's nearest station
            # instead (destination and origin swapped, as in the original approach).
            origin = df.loc[i, 'nearest MRT'] + ' MRT Station'
            directions_result = gmaps.directions(destination, origin, mode=mode, departure_time=now)
            df.loc[i, 'travel time to RP in minutes'] = _trip_minutes(directions_result)
        except Exception as fallback_error:
            # Leave the value missing, but say why so the row can be investigated.
            print('error', i, repr(fallback_error))

    # Brief pause after every 300th index label to avoid hammering the API.
    if (i+1) % 300 == 0:
        print("Sleeping...", i)
        time.sleep(10)

CPU times: user 233 ms, sys: 40.9 ms, total: 274 ms
Wall time: 7.7 s


In [60]:
# Checkpoint: overwrite the saved file now that the travel-time lookups have run.
df.to_csv('AprilDataProcessed.csv',index=False) #Checkpoint