In [1]:
import pandas as pd
from geopy.distance import geodesic
import numpy as np
np.random.seed(42)

In [2]:
# Load the two input tables:
#   df     - HDB resale transactions, each with an address and its latitude/longitude
#   df_mrt - MRT stations
# NOTE(review): the stray text printed under this cell looks like the tail of a
# pandas DtypeWarning (mixed-type column) - confirm, and pin dtypes via dtype= if so.
df = pd.read_csv('data/df_gps.csv')
df_mrt = pd.read_csv('data/mrt.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the transaction table to confirm the address and coordinate columns loaded.
df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,date,year_sold,month_sold,remaining_lease,address,latitude,longitude
0,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,improved,1977,9000.0,1990-01-01,1990,1,86,309 ang mo kio ave 1,1.366045,103.83697
1,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,improved,1977,6000.0,1990-01-01,1990,1,86,309 ang mo kio ave 1,1.366045,103.83697
2,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,improved,1977,8000.0,1990-01-01,1990,1,86,309 ang mo kio ave 1,1.366045,103.83697
3,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,improved,1977,6000.0,1990-01-01,1990,1,86,309 ang mo kio ave 1,1.366045,103.83697
4,1990-02,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,improved,1977,8000.0,1990-02-01,1990,2,86,309 ang mo kio ave 1,1.366045,103.83697


In [4]:
# Keep one row per distinct (address, latitude, longitude) so the nearest-MRT
# search runs once per address instead of once per transaction (the full table
# is close to 1m rows).
# Selecting the three columns first and then de-duplicating yields the same rows
# and index as de-duplicating on that subset and selecting afterwards.
# .copy() makes df_unique an independent frame: new columns are assigned to it
# later, and assigning into the result of a chained selection can trigger
# pandas' SettingWithCopyWarning.
df_unique = df[['address','latitude','longitude']].drop_duplicates().copy()

In [5]:
# Preview the de-duplicated address table.
df_unique.head()

Unnamed: 0,address,latitude,longitude
0,309 ang mo kio ave 1,1.366045,103.83697
231,216 ang mo kio ave 1,1.366197,103.841505
414,211 ang mo kio ave 3,1.369197,103.841667
693,202 ang mo kio ave 3,1.368446,103.844516
860,235 ang mo kio ave 3,1.366824,103.836491


In [6]:
import json
import requests

def get_gps(df,col,timeout=10):
    '''
    Look up the latitude and longitude of every value in df[col] with the
    OneMap search API and store them in the 'latitude' / 'longitude' columns.

    Keyword arguments:
    df : DataFrame to geocode; it is modified in place and also returned
    col : name of the column holding the text to search for
    timeout : seconds to wait on each HTTP request before giving up

    Returns:
    The same df, with 'latitude' and 'longitude' set to the values of the first
    search hit for each row (as returned by the API), or None for both when the
    response is not valid JSON, has no hits, or lacks either coordinate.

    Raises:
    Network-level failures (connection errors, timeouts) are not swallowed;
    they propagate as requests exceptions.
    '''
    url = "https://developers.onemap.sg/commonapi/search"
    latitude_list = []
    longitude_list = []
    # One session for the whole loop so the connection can be reused.
    with requests.Session() as session:
        for location in df[col]:
            # Let requests build the query string so characters such as
            # spaces, '&' or '#' in the search text are percent-encoded
            # instead of corrupting the URL.
            query = {
                "returnGeom": "Y",
                "getAddrDetails": "Y",
                "pageNum": 1,
                "searchVal": location,
            }
            response = session.get(url, params=query, timeout=timeout)
            try:
                first_hit = response.json()['results'][0]
                # Read both coordinates before appending either, so a hit
                # missing one of them cannot leave the two lists misaligned.
                latitude = first_hit['LATITUDE']
                longitude = first_hit['LONGITUDE']
            except (ValueError, KeyError, IndexError, TypeError):
                # Non-JSON body, unexpected payload shape, or no hits.
                latitude, longitude = None, None
            latitude_list.append(latitude)
            longitude_list.append(longitude)

    df["latitude"] = latitude_list
    df["longitude"] = longitude_list

    return df

In [7]:
# Preview the MRT station table (station name plus coordinates).
df_mrt.head()

Unnamed: 0,MRT,latitude,longitude
0,jurong east mrt,1.334009,103.741735
1,bukit batok mrt,1.349361,103.749966
2,bukit gombak mrt,1.358694,103.752085
3,choa chu kang mrt,1.38497,103.744592
4,yew tee mrt,1.397535,103.747405


In [8]:
# Geocode every MRT station by name (one HTTP request per station) and save
# the geocoded table to disk.
df_mrt = get_gps(df_mrt,'MRT')
df_mrt.to_csv('data/mrt_done.csv',index=False)
# Kept as the last expression of the cell so the preview is actually rendered;
# a bare expression in the middle of a cell is evaluated and discarded.
df_mrt.head()

In [12]:
def nearest_loc(lat,long,df_loc_compare,name_col='MRT'):
    '''
    Find the target location closest to a point and count how many targets
    lie within 0.5 km and within 1 km of it.

    Keyword arguments:
    lat -- latitude of the location
    long -- longitude of the location
    df_loc_compare -- (target) df of locations to compare against (i.e mrt or
                      shopping mall); columns 'latitude' and 'longitude'
                      must exist
    name_col -- column of df_loc_compare holding each target's name
                (default 'MRT')

    Returns a 4-tuple:
    nearest location name, distance to it in km,
    number of loc that is less than half km,
    number of loc that is less than one km

    Raises ValueError when df_loc_compare has no rows.
    '''
    flat_loc = (lat,long)
    # Geodesic distance in km from the point to every target, keyed by name.
    # Because the dict is keyed by name, rows that repeat a name collapse into
    # a single entry (the last such row wins).
    loc_dict = {
        name: geodesic(flat_loc,(target_lat,target_long)).km
        for name,target_lat,target_long in zip(
            df_loc_compare[name_col],
            df_loc_compare['latitude'],
            df_loc_compare['longitude'],
        )
    }
    if not loc_dict:
        raise ValueError("df_loc_compare has no rows to compare against")
    less_than_half_km = sum(1 for distance in loc_dict.values() if distance < 0.5)
    less_than_one_km = sum(1 for distance in loc_dict.values() if distance < 1)
    # Named differently from the function itself to avoid shadowing it.
    nearest_name = min(loc_dict, key=loc_dict.get)
    nearest_loc_distance = loc_dict[nearest_name]
    return nearest_name,nearest_loc_distance,less_than_half_km,less_than_one_km

In [13]:
# Run the nearest-MRT search once per unique address, then split the resulting
# 4-tuples into one sequence per output column.
(
    df_unique['mrt'],
    df_unique['nearest_mrt_distance'],
    df_unique['mrt_less_than_half_km'],
    df_unique['mrt_less_than_one_km'],
) = zip(*(
    nearest_loc(flat_lat, flat_long, df_mrt)
    for flat_lat, flat_long in zip(df_unique['latitude'], df_unique['longitude'])
))

In [14]:
# Inspect the newly added nearest-MRT columns.
df_unique.head()

Unnamed: 0,address,latitude,longitude,mrt,nearest_mrt_distance,mrt_less_than_half_km,mrt_less_than_one_km
0,309 ang mo kio ave 1,1.366045,103.83697,bright hill mrt,0.546874,0,2
231,216 ang mo kio ave 1,1.366197,103.841505,mayflower mrt,0.800629,0,2
414,211 ang mo kio ave 3,1.369197,103.841667,mayflower mrt,0.620303,0,2
693,202 ang mo kio ave 3,1.368446,103.844516,ang mo kio mrt,0.580039,0,2
860,235 ang mo kio ave 3,1.366824,103.836491,mayflower mrt,0.513092,0,2


In [15]:
# Count missing values per column to verify every address received MRT features.
df_unique.isnull().sum()

address                  0
latitude                 0
longitude                0
mrt                      0
nearest_mrt_distance     0
mrt_less_than_half_km    0
mrt_less_than_one_km     0
dtype: int64

In [16]:
# Save the per-address MRT features (coordinates are still included at this point).
df_unique.to_csv('data/unique_address_mrt.csv',index=False)

In [17]:
# Columns to carry into the merge: the join key ('address') plus the MRT features.
# latitude/longitude are left out because df already has them.
columns = ['address','mrt','nearest_mrt_distance', 'mrt_less_than_half_km','mrt_less_than_one_km']

In [18]:
# Narrow df_unique to the columns listed in `columns`.
df_unique=df_unique[columns]

In [19]:
# Confirm only the address key and the MRT feature columns remain.
df_unique.head()

Unnamed: 0,address,mrt,nearest_mrt_distance,mrt_less_than_half_km,mrt_less_than_one_km
0,309 ang mo kio ave 1,bright hill mrt,0.546874,0,2
231,216 ang mo kio ave 1,mayflower mrt,0.800629,0,2
414,211 ang mo kio ave 3,mayflower mrt,0.620303,0,2
693,202 ang mo kio ave 3,ang mo kio mrt,0.580039,0,2
860,235 ang mo kio ave 3,mayflower mrt,0.513092,0,2


In [20]:
# Look at the transaction table once more before the MRT features are merged in.
df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,date,year_sold,month_sold,remaining_lease,address,latitude,longitude
0,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,improved,1977,9000.0,1990-01-01,1990,1,86,309 ang mo kio ave 1,1.366045,103.83697
1,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,improved,1977,6000.0,1990-01-01,1990,1,86,309 ang mo kio ave 1,1.366045,103.83697
2,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,improved,1977,8000.0,1990-01-01,1990,1,86,309 ang mo kio ave 1,1.366045,103.83697
3,1990-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,07 TO 09,31.0,improved,1977,6000.0,1990-01-01,1990,1,86,309 ang mo kio ave 1,1.366045,103.83697
4,1990-02,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,04 TO 06,31.0,improved,1977,8000.0,1990-02-01,1990,2,86,309 ang mo kio ave 1,1.366045,103.83697


In [21]:
# Attach the per-address MRT features to every transaction. Computing them once
# per unique address and joining back avoids repeating the distance search for
# each sale.
# validate='many_to_one' makes pandas raise MergeError if an address occurs more
# than once in df_unique (it was de-duplicated on address AND coordinates, not
# on address alone) instead of silently duplicating transaction rows.
df = pd.merge(df, df_unique, on='address', how='left', validate='many_to_one')

In [22]:
# Count missing values per column. After a left merge, a non-zero count in the
# MRT columns would mean some transaction address had no match in df_unique.
df.isnull().sum()

month                    0
town                     0
flat_type                0
block                    0
street_name              0
storey_range             0
floor_area_sqm           0
flat_model               0
lease_commence_date      0
resale_price             0
date                     0
year_sold                0
month_sold               0
remaining_lease          0
address                  0
latitude                 0
longitude                0
mrt                      0
nearest_mrt_distance     0
mrt_less_than_half_km    0
mrt_less_than_one_km     0
dtype: int64

In [23]:
# Save the transaction table enriched with the MRT features.
df.to_csv('data/address_mrt.csv',index=False)

In [None]:
#end