In [1]:
import pandas as pd
import geopandas as gpd
from bs4 import BeautifulSoup

In [2]:
# NOTE(review): absolute, machine-specific path - the notebook only runs where
# this exact file exists; consider a configurable data directory.
fname = "/home/prasanna/Downloads/hawker-centres/hawker-centres-geojson.geojson"
# Load the GeoJSON file with geopandas; later cells read the 'Description'
# and 'geometry' columns of the result.
gdf = gpd.read_file(fname)

def extract_lat_long_from_point(p):
    """Return the latitude/longitude of a row's point geometry as a Series.

    ``p`` is indexed with ``"geometry"``; the ``y`` attribute of that value is
    reported as ``lat`` and its ``x`` attribute as ``long``.
    """
    point = p["geometry"]
    latitude, longitude = point.y, point.x
    return pd.Series({"lat": latitude, "long": longitude})

# Latitude is the point's y and longitude its x. The vectorized GeoSeries
# accessors produce both columns directly, instead of building one Series per
# row through gdf.apply(extract_lat_long_from_point, axis=1).
gdf["lat"] = gdf["geometry"].y
gdf["long"] = gdf["geometry"].x

### Take a look at the content HTML
- parse it with Beautiful Soup
- turn the HTML table into a dict
---

In [3]:
# Keep the first row's Description markup as a sample to experiment with.
content = gdf["Description"].iloc[0]
content

'<center><table><tr><th colspan=\'2\' align=\'center\'><em>Attributes</em></th></tr><tr bgcolor="#E3E3F3"> <th>ADDRESSBLOCKHOUSENUMBER</th> <td>85</td> </tr><tr bgcolor=""> <th>LATITUDE</th> <td></td> </tr><tr bgcolor="#E3E3F3"> <th>EST_ORIGINAL_COMPLETION_DATE</th> <td>30/6/1977</td> </tr><tr bgcolor=""> <th>STATUS</th> <td>Existing</td> </tr><tr bgcolor="#E3E3F3"> <th>CLEANINGSTARTDATE</th> <td></td> </tr><tr bgcolor=""> <th>ADDRESSUNITNUMBER</th> <td></td> </tr><tr bgcolor="#E3E3F3"> <th>ADDRESSFLOORNUMBER</th> <td></td> </tr><tr bgcolor=""> <th>NO_OF_FOOD_STALLS</th> <td></td> </tr><tr bgcolor="#E3E3F3"> <th>HYPERLINK</th> <td></td> </tr><tr bgcolor=""> <th>REGION</th> <td></td> </tr><tr bgcolor="#E3E3F3"> <th>APPROXIMATE_GFA</th> <td></td> </tr><tr bgcolor=""> <th>LONGITUDE</th> <td></td> </tr><tr bgcolor="#E3E3F3"> <th>INFO_ON_CO_LOCATORS</th> <td></td> </tr><tr bgcolor=""> <th>NO_OF_MARKET_STALLS</th> <td></td> </tr><tr bgcolor="#E3E3F3"> <th>AWARDED_DATE</th> <td></td> </tr><tr

In [4]:
# Dump the parsed sample markup to a scratch file for inspection.
# The parser is named explicitly so the output does not depend on which
# optional parsers happen to be installed (and no parser-guessing warning is
# emitted); the encoding is fixed so non-ASCII text does not depend on locale.
with open("/tmp/out.html", "w", encoding="utf-8") as f:
    f.write(str(BeautifulSoup(content, "html.parser")))
    f.write("\n")

In [5]:
# Render the sample Description markup as a table in the cell output.
# HTML and display come from the public IPython.display module; importing
# them from IPython.core.display is deprecated, and display is imported
# explicitly rather than relying on the kernel injecting it.
from IPython.display import HTML, display
display(HTML(content))

Attributes,Attributes.1
ADDRESSBLOCKHOUSENUMBER,85
LATITUDE,
EST_ORIGINAL_COMPLETION_DATE,30/6/1977
STATUS,Existing
CLEANINGSTARTDATE,
ADDRESSUNITNUMBER,
ADDRESSFLOORNUMBER,
NO_OF_FOOD_STALLS,
HYPERLINK,
REGION,


### Parse HTML content with bs4
---

In [30]:
def html_table_to_dict(content):
    """Convert an HTML attribute table into a ``{header: value}`` dict.

    Each ``<tr>`` contributes one entry: the text of its first ``<th>`` is the
    key and the text of its first ``<td>`` is the value. A row that has a
    ``<th>`` but no ``<td>`` (such as a title row) maps to ``None``. Rows
    without any ``<th>`` are skipped. If a header text repeats, the last row
    wins.

    Parameters
    ----------
    content : str
        HTML markup containing the table rows.

    Returns
    -------
    dict
        Mapping of header text to cell text (or ``None``).
    """
    output_dct = {}
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, warns about the guess, and may build a different
    # tree on another machine.
    soup = BeautifulSoup(content, "html.parser")
    for tr in soup.find_all("tr"):
        header = tr.find("th")
        if header is None:
            # A data-only row has no key to store it under; skipping it avoids
            # an AttributeError on ``None.text``.
            continue
        cell = tr.find("td")
        output_dct[header.text] = cell.text if cell is not None else None
    return output_dct

In [37]:
def extract_cols(cont_dct, keys):
    """Pick the given keys out of a dict and return them as a Series.

    The Series is indexed by the requested keys; a key that is absent from
    ``cont_dct`` gets the value ``cont_dct.get`` returns for it (``None``).
    """
    selected = {}
    for name in keys:
        selected[name] = cont_dct.get(name)
    return pd.Series(selected)

In [36]:
# Spot-check the parser on the second row's Description markup.
html_table_to_dict(gdf['Description'].iloc[1])

{'Attributes': None,
 'ADDRESSBLOCKHOUSENUMBER': '85',
 'LATITUDE': '',
 'EST_ORIGINAL_COMPLETION_DATE': '4/4/1972',
 'STATUS': 'Existing',
 'CLEANINGSTARTDATE': '',
 'ADDRESSUNITNUMBER': '',
 'ADDRESSFLOORNUMBER': '',
 'NO_OF_FOOD_STALLS': '',
 'HYPERLINK': '',
 'REGION': '',
 'APPROXIMATE_GFA': '',
 'LONGITUDE': '',
 'INFO_ON_CO_LOCATORS': '',
 'NO_OF_MARKET_STALLS': '',
 'AWARDED_DATE': '',
 'LANDYADDRESSPOINT': '29972.02',
 'CLEANINGENDDATE': '',
 'PHOTOURL': 'http://www.nea.gov.sg/images/default-source/Hawker-Centres-Division/resize_1262153717849.jpg',
 'DESCRIPTION': 'HUP Reconfiguration',
 'NAME': 'Redhill Lane Blk 85 (Redhill Food Centre)',
 'ADDRESSTYPE': 'I',
 'RNR_STATUS': '',
 'ADDRESSBUILDINGNAME': '',
 'HUP_COMPLETION_DATE': '17/6/2005',
 'LANDXADDRESSPOINT': '26332.89',
 'ADDRESSSTREETNAME': 'Redhill Lane',
 'ADDRESSPOSTALCODE': '150085',
 'DESCRIPTION_MYENV': '',
 'IMPLEMENTATION_DATE': '',
 'ADDRESS_MYENV': 'Blk 85, Redhill Lane, Singapore 150085',
 'INC_CRC': '1D515CA

In [45]:
# Spot-check: parse the sample `content` markup and keep only NAME and PHOTOURL.
extract_cols(html_table_to_dict(content), ["NAME", "PHOTOURL"])

NAME         Bedok North Street 4 Blk 85 (85 Fengshan Centre)
PHOTOURL    http://www.nea.gov.sg/images/default-source/Ha...
dtype: object

In [42]:
cols = ['NAME','PHOTOURL','ADDRESS_MYENV']


def _description_to_cols(description_html):
    """Parse one Description cell and keep only the fields listed in ``cols``."""
    return extract_cols(cont_dct=html_table_to_dict(description_html), keys=cols)


# One parsed row of selected fields per Description value, stored as columns.
gdf.loc[:, cols] = gdf["Description"].apply(_description_to_cols)

In [43]:
# Display the frame to inspect the NAME, PHOTOURL and ADDRESS_MYENV columns.
gdf

Unnamed: 0,Name,Description,geometry,lat,long,NAME,PHOTOURL,ADDRESS_MYENV
0,kml_1,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.93873 1.33199 0.00000),1.331987,103.938733,Bedok North Street 4 Blk 85 (85 Fengshan Centre),http://www.nea.gov.sg/images/default-source/Ha...,"Blk 85, Bedok North Street 4, Singapore 460085"
1,kml_2,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.81834 1.28733 0.00000),1.287331,103.818339,Redhill Lane Blk 85 (Redhill Food Centre),http://www.nea.gov.sg/images/default-source/Ha...,"Blk 85, Redhill Lane, Singapore 150085"
2,kml_3,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.82899 1.37238 0.00000),1.372385,103.828994,Sembawang Hills Food Centre (Jalan Leban Food ...,http://www.nea.gov.sg/images/default-source/Ha...,"590, Upper Thomson Road, Singapore 574419"
3,kml_4,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.86674 1.36316 0.00000),1.363157,103.866737,Serangoon Garden Market,http://www.nea.gov.sg/images/default-source/Ha...,"49A, Serangoon Garden Way, Singapore 555945"
4,kml_5,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.83703 1.35201 0.00000),1.352007,103.837032,Shunfu Road Blk 320 (Shunfu Mart),http://www.nea.gov.sg/images/default-source/Ha...,"Blk 320, Shunfu Road, Singapore 570320"
...,...,...,...,...,...,...,...,...
120,kml_121,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.71845 1.34964 0.00000),1.349642,103.718448,Jurong West Street 52 Blk 505,http://www.nea.gov.sg/images/default-source/Ha...,"Blk 505, Jurong West Street 52, Singapore 640505"
121,kml_122,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.88413 1.30740 0.00000),1.307396,103.884130,Kallang Estate Fresh Market and Food Centre,http://www.nea.gov.sg/images/default-source/Ha...,"17, Old Airport Road, Singapore 397972"
122,kml_123,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.80070 1.43993 0.00000),1.439933,103.800696,Kampung Admiralty Hawker Centre,http://www.nea.gov.sg/images/default-source/Ha...,"Blk 676, Woodlands Drive 71, Singapore 730676"
123,kml_124,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.90634 1.30229 0.00000),1.302285,103.906339,Marine Parade Central Blk 84 (84 Marine Parade...,http://www.nea.gov.sg/images/default-source/Ha...,"Blk 84, Marine Parade Central, Singapore 440084"


In [47]:
# Export the extracted fields plus coordinates to CSV, leaving out the index.
(
    gdf
    .loc[:, ["NAME", "PHOTOURL", "ADDRESS_MYENV", "lat", "long"]]
    .to_csv("/tmp/out.csv", index=False)
)

### END
---