# Constructing Kuala Lumpur Main Area

In [139]:
!pip install beautifulsoup4
##parse html
!pip install lxml 
!pip install html5lib
#request library
!pip install requests
!pip install bingmaps
!pip install geopy
!pip install folium



### 1. Import the necessary packages

In [36]:
import json
import os

import requests
from bs4 import BeautifulSoup
import pandas as pd
from bingmaps.apiservices import LocationByAddress
from bingmaps.apiservices import LocationByPoint
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium # for map randering
import numpy as np # library to handle data in a vectorized manner
from sklearn.cluster import KMeans
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

For geocoding, we will use the Bing Maps API.

In [140]:
# Bing Maps API key.
# Read from the BING_MAPS_KEY environment variable so a real key never has to be
# saved inside the notebook; falls back to the 'xxxx' placeholder when unset.
bmkey = os.environ.get('BING_MAPS_KEY', 'xxxx')

### 2. Set parameters and load the source table using pandas

For this purpose, we will use a dataset published by a GitHub user, which contains a list of 2,884 roads in Kuala Lumpur. However, we only want to find the main areas in Kuala Lumpur, so this data needs to be trimmed down.

In [141]:
# Raw CSV of Kuala Lumpur postcodes and addresses, hosted in the heiswayi/malaysia-postcodes GitHub repository
url='https://raw.githubusercontent.com/heiswayi/malaysia-postcodes/master/kuala_lumpur.csv'

In [142]:
# Download the CSV into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=url)
print(df.shape)
# Leaving the frame as the final expression renders it as the cell output.
df

(2884, 4)


Unnamed: 0,POSTCODE,ADDRESS,CITY,STATE
0,50000,Bangunan Bangkok Bank,Kuala Lumpur,KUL
1,50000,Jalan Balai Polis,Kuala Lumpur,KUL
2,50000,Jalan Bandar,Kuala Lumpur,KUL
3,50000,Jalan Cheng Lock,Kuala Lumpur,KUL
4,50000,Jalan Hang Lekir,Kuala Lumpur,KUL
...,...,...,...,...
2879,60000,Medan Burhanuddin Helmi,Kuala Lumpur,KUL
2880,60000,Persiaran Abang Haji Openg,Kuala Lumpur,KUL
2881,60000,Persiaran Burhanuddin Helmi,Kuala Lumpur,KUL
2882,60000,Pinggir Zaaba,Kuala Lumpur,KUL


### 3. Cleanup the data

We only take one postcode for each area to avoid duplication

In [143]:
# Keep the first row seen for each postcode, discard the unused ADDRESS
# column, and renumber the remaining rows from zero.
df = (
    df
    .drop_duplicates(subset=['POSTCODE'])
    .drop(columns=['ADDRESS'])
    .reset_index(drop=True)
)
df

Unnamed: 0,POSTCODE,CITY,STATE
0,50000,Kuala Lumpur,KUL
1,50050,Kuala Lumpur,KUL
2,50088,Kuala Lumpur,KUL
3,50100,Kuala Lumpur,KUL
4,50150,Kuala Lumpur,KUL
...,...,...,...
272,59200,Kuala Lumpur,KUL
273,59700,Kuala Lumpur,KUL
274,59800,Kuala Lumpur,KUL
275,59990,Kuala Lumpur,KUL


We will find the location (latitude and longitude) of each postcode using the Bing Maps API.

In [144]:
# Look up a latitude/longitude for every postcode with the Bing Maps
# "location by address" service.
countryRegion = 'Malaysia'
# NOTE(review): the same district string (which embeds postcode 50050) is sent
# with every query — confirm this is intended.
adminDistrict = '50050 Kuala Lumpur'

code = df['POSTCODE']
code_lat = []
code_long = []
for i in code:
    data = {'adminDistrict': adminDistrict, 'postalCode': i, 'countryRegion': countryRegion, 'key': bmkey}
    loc_by_address = LocationByAddress(data)
    coords = loc_by_address.get_coordinates
    # Only the first candidate is used. When the service yields nothing, a NaN
    # placeholder is recorded so both lists stay exactly as long as `df`;
    # without it every later coordinate would be attached to the wrong postcode.
    first_match = next(iter(coords), None) if coords is not None else None
    if first_match is None:
        code_lat.append(float('nan'))
        code_long.append(float('nan'))
    else:
        code_lat.append(first_match.latitude)
        code_long.append(first_match.longitude)

# Attach the coordinates row by row. `assign` overwrites existing columns, so
# re-running this cell does not fail on overlapping column names.
df = df.assign(Latitude=code_lat, Longitude=code_long)
# Postcodes that could not be geocoded are removed and the index renumbered,
# because later cells join on a fresh 0..n-1 index.
df = df.dropna(subset=['Latitude', 'Longitude']).reset_index(drop=True)
df

Unnamed: 0,POSTCODE,CITY,STATE,Latitude,Longitude
0,50000,Kuala Lumpur,KUL,3.142934,101.696373
1,50050,Kuala Lumpur,KUL,3.147618,101.696281
2,50088,Kuala Lumpur,KUL,3.155141,101.711540
3,50100,Kuala Lumpur,KUL,3.152210,101.696510
4,50150,Kuala Lumpur,KUL,3.191760,101.658691
...,...,...,...,...,...
272,59200,Kuala Lumpur,KUL,3.105824,101.666428
273,59700,Kuala Lumpur,KUL,3.143822,101.689590
274,59800,Kuala Lumpur,KUL,3.143822,101.689590
275,59990,Kuala Lumpur,KUL,3.117759,101.667549


In [145]:
# Build a "latitude,longitude" string per row for the reverse-geocoding request.
# The coordinate columns are selected by name: slicing by position
# (df.columns[3:]) would also pick up LongLat itself if this cell is re-run.
df['LongLat'] = df[['Latitude', 'Longitude']].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)
df

Unnamed: 0,POSTCODE,CITY,STATE,Latitude,Longitude,LongLat
0,50000,Kuala Lumpur,KUL,3.142934,101.696373,"3.1429340839385986,101.69637298583984"
1,50050,Kuala Lumpur,KUL,3.147618,101.696281,"3.147618055343628,101.69628143310547"
2,50088,Kuala Lumpur,KUL,3.155141,101.711540,"3.1551411151885986,101.71154022216797"
3,50100,Kuala Lumpur,KUL,3.152210,101.696510,"3.152209997177124,101.6965103149414"
4,50150,Kuala Lumpur,KUL,3.191760,101.658691,"3.1917600631713867,101.65869140625"
...,...,...,...,...,...,...
272,59200,Kuala Lumpur,KUL,3.105824,101.666428,"3.1058239936828613,101.66642761230469"
273,59700,Kuala Lumpur,KUL,3.143822,101.689590,"3.143821954727173,101.68959045410156"
274,59800,Kuala Lumpur,KUL,3.143822,101.689590,"3.143821954727173,101.68959045410156"
275,59990,Kuala Lumpur,KUL,3.117759,101.667549,"3.1177589893341064,101.66754913330078"


After getting the latitude and longitude, we would like to increase the accuracy of the address given earlier. Thus, we do a reverse geocode based on the latitude and longitude to find the location.

In [146]:
def _intersection_display_name(addresses):
    """Return the intersection display name of the first address, or "NA".

    addresses: the value read from ``LocationByPoint(...).get_address``;
        presumably an indexable collection of address mappings (assumption —
        confirm against the bingmaps documentation).

    Any missing piece (no addresses, no 'intersection' entry, no or empty
    'displayName') yields the "NA" sentinel that a later cell filters on.
    """
    try:
        name = addresses[0]['intersection'].get('displayName')
    except (LookupError, TypeError, AttributeError):
        # Only lookup-style failures are treated as "no result"; interrupts
        # and unrelated errors are no longer silently swallowed.
        return "NA"
    return name if name else "NA"


# Reverse-geocode every "latitude,longitude" string into a street-intersection name.
Location = []
for coordinate in df.LongLat:
    data1 = {'point': coordinate, 'includeEntityTypes': 'Address', 'key': bmkey}
    loc_by_point = LocationByPoint(data1)
    Location2 = _intersection_display_name(loc_by_point.get_address)
    print(Location2)
    Location.append(Location2)

Jalan Tun H.S. Lee and Jalan Sultan
Jalan Tun Tan Siew Tin, between Medan Pasar Besar and Jalan Tun H.S. Lee
NA
Lorong Tuanku Abdul Rahman Kiri 2 and Lorong Tuanku Abdul Rahman
NA
Jalan Imbi and Jalan Sultan Ismail
NA
NA
Lorong Kolam Air 7 and Jalan Kolam Air 6
Jalan Pangkur, between Jalan Cemur and Jalan 1/68D
Jalan Ampang, between Jalan Binjai and Jalan Tun Razak
NA
Jalan Tun Sambanthan and Jalan Stesen Sentral 5
Jalan Serene Kiara 8 and Jalan Serene Kiara 6
Lorong Derumun and Persiaran Derumun
Jalan Syed Putra and Jalan Istana
Jalan Tuanku Abdul Rahman, between Jalan Mara and Jalan Dang Wangi
Jalan Raja Laut and Jalan Sri Amar
NA
NA
NA
Jalan Gereja, between Jalan Melaka and Lorong Ampang
Jalan Tangsi, between Jalan Parlimen and Jalan Kinabalu
Jalan Kampung Attap, between Jalan Tuba and Jalan Manau
Jalan Merpati, between Jalan Tuanku Abdul Rahman and Jalan Raja Laut
NA
NA
NA
Jalan Semantan, between E23 and Lebuhraya Sprint
Jalan Ampang, between Jalan Ampang Tengah and Jalan Jelatek
J

In [147]:
# Wrap the collected names in a single-column frame so they can be joined onto df.
df5 = pd.DataFrame({'Location': Location})
df5.shape

(277, 1)

In [148]:
# Attach the reverse-geocoded names to the postcode table.
print(df.shape,df5.shape)
# The join aligns on the row index, so both frames must hold the same number of rows.
assert len(df) == len(df5), "Location list is out of step with the postcode table"
df = df.join(df5) #join the location with existing dataframe
print(df.shape)
# Shown last so the preview is actually rendered as the cell output.
df.head()

(277, 6) (277, 1)
(277, 7)


In [149]:
# Keep only rows whose reverse geocode produced a name (anything other than the
# 'NA' sentinel) and renumber them from zero. Rows sharing the same Location
# are all retained; no de-duplication on Location is applied.
df = df.loc[df['Location'].ne('NA')].reset_index(drop=True)
df

Unnamed: 0,POSTCODE,CITY,STATE,Latitude,Longitude,LongLat,Location
0,50000,Kuala Lumpur,KUL,3.142934,101.696373,"3.1429340839385986,101.69637298583984",Jalan Tun H.S. Lee and Jalan Sultan
1,50050,Kuala Lumpur,KUL,3.147618,101.696281,"3.147618055343628,101.69628143310547","Jalan Tun Tan Siew Tin, between Medan Pasar Be..."
2,50100,Kuala Lumpur,KUL,3.152210,101.696510,"3.152209997177124,101.6965103149414",Lorong Tuanku Abdul Rahman Kiri 2 and Lorong T...
3,50200,Kuala Lumpur,KUL,3.143709,101.711952,"3.1437089443206787,101.71195220947266",Jalan Imbi and Jalan Sultan Ismail
4,50350,Kuala Lumpur,KUL,3.173631,101.685585,"3.173630952835083,101.68558502197266",Lorong Kolam Air 7 and Jalan Kolam Air 6
...,...,...,...,...,...,...,...
74,57100,Kuala Lumpur,KUL,3.115582,101.721519,"3.11558198928833,101.72151947021484","Lorong Ikan Emas, between Jalan Ikan Emas and ..."
75,58000,Kuala Lumpur,KUL,3.114526,101.682671,"3.1145260334014893,101.68267059326172","Jalan Teluk Batu, between Jalan Teluk Pulai an..."
76,59000,Kuala Lumpur,KUL,3.128453,101.675003,"3.128453016281128,101.67500305175781","Lorong Maarof, between Jalan Bukit Bangsar and..."
77,59200,Kuala Lumpur,KUL,3.105824,101.666428,"3.1058239936828613,101.66642761230469",Jalan Pantai Permai 12 and Jalan Pantai Permai 7


Final list has been trimmed down to 78 rows only!

### 4. Visualize the Location in Folium

In [150]:
# Geocode the city centre with Nominatim to get a starting point for the map.
address = 'Kuala Lumpur, MY'

geolocator = Nominatim(user_agent="my_explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode {!r} with Nominatim'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Kuala Lumput are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Kuala Lumput are 3.1516964, 101.6942371.


In [151]:
# create map of Kuala Lumpur using latitude and longitude values
map_KL = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per row, with a popup reading "<location>, <postcode>".
# zip() walks the four columns in lockstep and stops at the shortest one.
# The loop variable is named `place` so it does not overwrite the geocoder
# result stored in `location` by the previous cell.
for lat, lng, place, postcode in zip(df['Latitude'], df['Longitude'], df['Location'], df['POSTCODE']):
    label = '{}, {}'.format(place, postcode)
    label = folium.Popup(label, parse_html=True)
    # NOTE(review): parse_html is passed to CircleMarker as well; it is kept
    # as-is — confirm whether CircleMarker actually uses it.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_KL)

map_KL

### 5. Export to CSV

In [152]:
# Write the result relative to the current working directory instead of a
# user-specific absolute Windows path, so the export works on any machine.
output_csv = 'KL_Main_Area.csv'
df.to_csv(output_csv)
print("Export Completed")

Export Completed
