In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

# Web Scraping - HK Population 2007-2018

Webpage:
https://www.ceicdata.com/en/indicator/hong-kong/population

The page contains a graph of Hong Kong's population growth.
Inspecting the page source shows that the graph is an image loaded from another URL.

Steps:
1. Extract the link to the other webpage 
2. Extract the numbers of the graph (population) from the other webpage
3. Convert into DataFrame 

### 1. Extract the link to the other webpage 

In [2]:
# Download the CEIC Hong Kong population page and parse it so the chart
# link can be located in the next step.
url = 'https://www.ceicdata.com/en/indicator/hong-kong/population'
# requests applies no timeout by default; set one so the cell cannot hang.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page.
response.raise_for_status()
html_bytes = response.content
soup = BeautifulSoup(html_bytes, 'html.parser')


In [3]:
# Walk down from the page content wrapper to the chart container and read
# the `src` attribute of its <img>, which points at the chart itself.
content_div = soup.find('div', class_='datapage-content')
chart_div = content_div.find('div', class_='charts-container has-loading')
hidden_link = chart_div.img['src']
hidden_link

'https://www.ceicdata.com/datapage/charts/ipc_hong-kong_population/?type=area&from=2007-12-01&to=2018-12-01&lang=en'

### 2. Extract the numbers of the graph (population) from the other webpage

In [4]:
# Download the chart document behind `hidden_link` and parse its markup.
# requests applies no timeout by default; set one so the cell cannot hang.
response_1 = requests.get(hidden_link, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page.
response_1.raise_for_status()
html_bytes_1 = response_1.content
soup_1 = BeautifulSoup(html_bytes_1, 'html.parser')
soup_1

<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg class="highcharts-root" height="500" style="font-family: 'Lucida Grande', 'Lucida Sans Unicode', Arial, Helvetica, sans-serif; font-size: 12px; background-color: rgba(0, 0, 0, 0);" version="1.1" width="1200" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><desc>Created with Highcharts 6.2.0</desc><defs><clippath id="highcharts-io67g3j-1"><rect fill="none" height="398" width="1113" x="0" y="0"></rect></clippath></defs><rect class="highcharts-background" fill="rgba(255, 255, 255, 0.0)" height="500" rx="0" ry="0" width="1180" x="0" y="0"></rect><rect class="highcharts-plot-background" fill="none" height="398" width="1113" x="57" y="10"></rect><g class="highcharts-grid highcharts-xaxis-grid" data-z-index="1"><path class="highcharts-grid-line" d="M 67.5 10 L 67.5 408" data-z-index="1" fill="none" opacity="1" stroke-dasharr

In [5]:
# Sanity check: read the text of the first data label only.
series_labels = soup_1.find(
    'g', class_="highcharts-data-labels highcharts-series-0 highcharts-area-series")
first_label = series_labels.find(
    'g', class_="highcharts-label highcharts-data-label highcharts-data-label-color-undefined")
first_label.tspan.contents

['6.938']

In [6]:
# Locate the data-label group of series 0, then collect every label inside it.
series_group = soup_1.find(
    'g', class_="highcharts-data-labels highcharts-series-0 highcharts-area-series")
broth = series_group.find_all(
    'g', class_="highcharts-label highcharts-data-label highcharts-data-label-color-undefined")

# Keep the first child of each label's <tspan> (the label text).
population = [label.tspan.contents[0] for label in broth]

In [7]:
# Display the scraped label texts (still strings at this point).
population

['6.938',
 '6.964',
 '6.996',
 '7.052',
 '7.110',
 '7.171',
 '7.211',
 '7.253',
 '7.310',
 '7.377',
 '7.413',
 '7.486']

### 3. Convert to DataFrame

In [8]:
# One label per year is expected; the range mirrors the from=2007 / to=2018
# dates in the chart link.
years = range(2007, 2019)

# Check the count up front so a changed chart produces a clear error instead
# of an IndexError part-way through or silently dropped labels.
if len(population) != len(years):
    raise ValueError(
        'expected {} population labels, got {}'.format(len(years), len(population))
    )

# Pair each year with its label by position.
population_dict = dict(zip(years, population))

In [9]:
# Display the year -> label mapping built from the scraped values.
population_dict

{2007: '6.938',
 2008: '6.964',
 2009: '6.996',
 2010: '7.052',
 2011: '7.110',
 2012: '7.171',
 2013: '7.211',
 2014: '7.253',
 2015: '7.310',
 2016: '7.377',
 2017: '7.413',
 2018: '7.486'}

In [12]:
# Turn the mapping into (year, population) rows and load them as a
# two-column frame.
year_population_pairs = list(population_dict.items())
hk_pop = pd.DataFrame(year_population_pairs, columns=['year', 'population'])
hk_pop

Unnamed: 0,year,population
0,2007,6.938
1,2008,6.964
2,2009,6.996
3,2010,7.052
4,2011,7.11
5,2012,7.171
6,2013,7.211
7,2014,7.253
8,2015,7.31
9,2016,7.377


In [13]:
# Convert the scraped labels to whole-person counts.
# The labels are decimal strings such as '6.938', read here as millions
# (6.938 -> 6,938,000). Parsing them as numbers and scaling avoids two traps
# of stripping the decimal point as text: the result no longer depends on the
# label having exactly three decimals, nor on the default of the `regex`
# argument of `Series.str.replace`, which differs between pandas versions.
# The dtype guard makes re-running this cell a no-op rather than an error or
# a second rescaling.
if not pd.api.types.is_numeric_dtype(hk_pop['population']):
    hk_pop['population'] = (
        hk_pop['population']
        .astype(float)
        .mul(1000000)
        .round()  # absorb float noise before the integer cast
        .astype(int)
    )

In [14]:
# Tag every row with the city name; `assign` returns a new frame, which is
# rebound to `hk_pop`.
city_label = 'hong kong'
hk_pop = hk_pop.assign(city=city_label)
hk_pop

Unnamed: 0,year,population,city
0,2007,6938000,hong kong
1,2008,6964000,hong kong
2,2009,6996000,hong kong
3,2010,7052000,hong kong
4,2011,7110000,hong kong
5,2012,7171000,hong kong
6,2013,7211000,hong kong
7,2014,7253000,hong kong
8,2015,7310000,hong kong
9,2016,7377000,hong kong


In [15]:
# Persist the final frame as a pickle; the path is relative to the
# notebook's working directory.
pickle_path = '.././pickles/hk_population.pkl'
hk_pop.to_pickle(pickle_path)