### Question 1
Try to harvest the historical weather trend from the Hong Kong Observatory
(https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal10.htm#) [80%]

In [28]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd
import numpy as np

In [10]:
def to_matrix(alist, length):
    """Fold a flat sequence into consecutive rows of at most ``length`` items.

    The rows are slices of ``alist`` taken in order; the final row is shorter
    when ``len(alist)`` is not a multiple of ``length``.
    """
    rows = []
    for start in range(0, len(alist), length):
        rows.append(alist[start:start + length])
    return rows

In [36]:
# Daily-normals page number 10 (the October page, per the dates in the output below).
url = "https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal10.htm"
# A timeout keeps the cell from hanging indefinitely if the server stalls.
req = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
req.raise_for_status()
# Only affects req.text; the parsing cell reads the raw bytes in req.content.
req.encoding = "utf-8"

In [37]:
# NOTE(review): no parser is named here, so bs4 picks whichever one is installed;
# consider pinning one (e.g. "lxml" or "html.parser") after checking it against this page.
soup = BeautifulSoup(req.content)
table_class = "data_table border-1_table"
weather_tables = soup.find_all("table", attrs={"class": table_class})
# weather_tables is expected to have length 2: the page's two data sub-tables.
# find_all() returns a list of BeautifulSoup objects, whereas find() returns only one object.
# ("class" and "object" were not covered in the lecture; as a short analogy, "AIDM" can be
# regarded as a class and each "AIDMer" as an object of the "AIDM" class.)

In [41]:
# In the first table every figure and label sits inside a <td>...</td> tag,
# so collecting the text of each <td> (via a list comprehension) gives a flat list.
first_table_cells = weather_tables[0].find_all("td")
t1_content = [cell.text for cell in first_table_cells]
# Drop the two trailing entries ("Observed at" and "Hong Kong Observatory"),
# then fold the flat list into rows of 10 columns so it can become a DataFrame.
t1_trimmed = t1_content[:-2]
t1_matrix = np.array(t1_trimmed).reshape((-1, 10))

In [24]:
# An alternative way to fold the 1-D list into a 2-D matrix, using the to_matrix() helper defined above:
# t1_content.pop()
# t1_content.pop()
# t1_matrix = to_matrix(t1_content, 10) # convert the 1-d list into a 2-d matrix

In [42]:
# Column labels for the first table, one per matrix column.
# NOTE(review): the spellings "Presure" and "Web Bulb" are kept verbatim because the
# saved outputs and the later cells use exactly these labels.
t1_columns = [
    "Date",
    "Mean Presure(hPa)",
    "Mean Maximum(deg.C)",
    "Mean(deg.C)",
    "Mean Minimum(deg.C)",
    "Web Bulb(deg.C)",
    "Dew Point(deg.C)",
    "Relative Humidity(%)",
    "Mean Daily Rainfall(mm)",
    "Amount of Cloud(%)",
]
# A 2-D matrix converts directly into a DataFrame.
t1_df = pd.DataFrame(t1_matrix, columns=t1_columns)

In [43]:
# Display the DataFrame built from the first table of the page.
t1_df

Unnamed: 0,Date,Mean Presure(hPa),Mean Maximum(deg.C),Mean(deg.C),Mean Minimum(deg.C),Web Bulb(deg.C),Dew Point(deg.C),Relative Humidity(%),Mean Daily Rainfall(mm),Amount of Cloud(%)
0,1 Oct,1012.0,29.1,26.8,25.1,23.7,22.3,77,6.0,64
1,2 Oct,1012.3,29.1,26.8,25.1,23.7,22.1,76,4.3,63
2,3 Oct,1012.4,29.1,26.8,25.0,23.6,21.9,76,5.8,62
3,4 Oct,1012.5,29.0,26.7,24.9,23.4,21.7,75,6.5,61
4,5 Oct,1012.5,28.9,26.6,24.8,23.2,21.4,74,5.8,60
5,6 Oct,1012.6,28.8,26.5,24.7,23.0,21.2,74,4.6,59
6,7 Oct,1012.8,28.7,26.4,24.5,22.8,20.9,73,4.4,59
7,8 Oct,1013.0,28.6,26.3,24.4,22.7,20.7,73,2.7,57
8,9 Oct,1013.3,28.5,26.3,24.5,22.7,20.8,73,2.3,57
9,10 Oct,1013.5,28.5,26.2,24.5,22.8,20.9,74,2.6,57


In [46]:
# The second table also stores every figure and label inside <td>...</td> tags.
second_table_cells = weather_tables[1].find_all("td")
t2_content = [cell.text for cell in second_table_cells]
# Drop the last four entries (presumably footer cells; confirm against the page),
# then fold the rest into rows of 6 columns.
t2_trimmed = t2_content[:-4]
t2_matrix = np.array(t2_trimmed).reshape((-1, 6))

In [37]:
# An alternative way to build the matrix, using the to_matrix() helper:
# t2_content = t2_content[:-4] # delete the last 4 items in the list at one time
# t2_matrix = to_matrix(t2_content, 6) # convert the 1-d list into a 2-d matrix

In [47]:
# Column labels for the second table, one per matrix column.
t2_columns = [
    "Date",
    "Bright Sunshine Duration(hours)",
    "Prevailing Direction(degrees)",
    "Mean Speed(km/h)",
    "AM(deg. C)",
    "PM(deg. C)",
]
t2_df = pd.DataFrame(t2_matrix, columns=t2_columns)

In [48]:
# Display the DataFrame built from the second table of the page.
t2_df

Unnamed: 0,Date,Bright Sunshine Duration(hours),Prevailing Direction(degrees),Mean Speed(km/h),AM(deg. C),PM(deg. C)
0,1 Oct,5.6,80,27,27.1,27.5
1,2 Oct,5.7,80,27,27.1,27.4
2,3 Oct,5.9,80,27,27.1,27.3
3,4 Oct,6.0,80,26,27.0,27.3
4,5 Oct,6.2,80,24,27.0,27.3
5,6 Oct,6.3,90,23,27.0,27.3
6,7 Oct,6.3,90,23,27.0,27.3
7,8 Oct,6.4,90,24,26.9,27.3
8,9 Oct,6.4,80,25,26.9,27.3
9,10 Oct,6.4,80,27,26.8,27.2


In [26]:
# Join the two tables on their shared "Date" column.
# Naming the key explicitly avoids relying on whichever columns happen to overlap,
# and validate="one_to_one" raises if a date is duplicated in either table.
# NOTE(review): this cell's execution count is out of order and its saved output shows
# Jan-Dec rows, so it was last run on full-year frames; re-run the notebook top to bottom.
tables = t1_df.merge(t2_df, on="Date", how="inner", validate="one_to_one")
tables

Unnamed: 0,Date,Mean Presure(hPa),Mean Maximum(deg.C),Mean(deg.C),Mean Minimum(deg.C),Web Bulb(deg.C),Dew Point(deg.C),Relative Humidity(%),Mean Daily Rainfall(mm),Amount of Cloud(%),Bright Sunshine Duration(hours),Prevailing Direction(degrees),Mean Speed(km/h),AM(deg. C),PM(deg. C)
0,1 Jan,1020.1,19.3,17.0,15.2,14.3,11.8,73,1.2,52,5.3,060,24,18.3,18.6
1,2 Jan,1020.3,19.3,17.1,15.3,14.3,11.8,72,0.7,51,5.5,070,25,18.2,18.5
2,3 Jan,1020.3,19.3,17.1,15.3,14.3,11.8,72,0.8,51,5.4,070,25,18.1,18.4
3,4 Jan,1020.3,19.2,17.0,15.3,14.3,11.8,73,1.0,53,5.3,070,26,18.0,18.3
4,5 Jan,1020.4,19.2,17.0,15.2,14.3,11.8,73,1.1,54,5.4,070,26,17.9,18.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,27 Dec,1020.5,19.3,17.0,15.0,14.1,11.3,71,1.0,54,5.1,070,26,18.6,18.9
362,28 Dec,1020.3,19.1,16.9,14.9,14.1,11.4,72,1.4,55,4.8,070,26,18.6,18.9
363,29 Dec,1020.2,19.1,16.8,14.8,14.0,11.4,72,1.5,54,4.9,070,25,18.4,18.8
364,30 Dec,1020.2,19.1,16.8,14.9,14.0,11.5,72,1.4,53,5.1,060,25,18.4,18.7


### Question 2
For (1), describe how to harvest all the monthly data from different months and different
years. (No code needed) [20%]

All the URLs of the corresponding pages follow the same pattern, i.e. "https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal" + number + ".htm", where number is the two-digit, zero-padded month (01 to 12).  
So the pages can be visited one by one in a loop, and the content of each page can then be extracted with the same method as in Question 1.

In [49]:
# Column labels shared by every monthly page (defined once instead of per iteration).
t1_columns = ["Date", "Mean Presure(hPa)", "Mean Maximum(deg.C)", "Mean(deg.C)", "Mean Minimum(deg.C)", "Web Bulb(deg.C)", "Dew Point(deg.C)", "Relative Humidity(%)", "Mean Daily Rainfall(mm)", "Amount of Cloud(%)"]
t2_columns = ["Date", "Bright Sunshine Duration(hours)", "Prevailing Direction(degrees)", "Mean Speed(km/h)", "AM(deg. C)", "PM(deg. C)"]

# Collect one DataFrame per page and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies all previous rows on every iteration.
t1_frames = []
t2_frames = []
for num in range(1, 13):
    # Page numbers are zero-padded to two digits: dnormal01.htm ... dnormal12.htm
    url = "https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal" + "{:02d}".format(num) + ".htm"
    print(url)
    # The timeout prevents an indefinite hang; raise_for_status stops on HTTP errors
    # instead of parsing an error page.
    req = requests.get(url, timeout=30)
    req.raise_for_status()
    req.encoding = "utf-8"
    soup = BeautifulSoup(req.content)
    weather_tables = soup.find_all("table", attrs={"class": "data_table border-1_table"})
    # First table: drop the 2 trailing cells, fold into rows of 10 columns.
    t1_content = [td_item.text for td_item in weather_tables[0].find_all("td")]
    t1_matrix = np.array(t1_content[:-2]).reshape((-1, 10))
    # Second table: drop the 4 trailing cells, fold into rows of 6 columns.
    t2_content = [td_item.text for td_item in weather_tables[1].find_all("td")]
    t2_matrix = np.array(t2_content[:-4]).reshape((-1, 6))
    t1_frames.append(pd.DataFrame(t1_matrix, columns=t1_columns))
    t2_frames.append(pd.DataFrame(t2_matrix, columns=t2_columns))

# Stack the monthly frames in page order with a fresh 0..n-1 index.
t1_df = pd.concat(t1_frames, ignore_index=True)
t2_df = pd.concat(t2_frames, ignore_index=True)

https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal01.htm
https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal02.htm
https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal03.htm
https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal04.htm
https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal05.htm
https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal06.htm
https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal07.htm
https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal08.htm
https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal09.htm
https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal10.htm
https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal11.htm
https://www.hko.gov.hk/en/cis/normal/1981_2010/dnormal12.htm


In [50]:
# t1_df now holds the first table of all twelve pages, stacked in page order.
t1_df

Unnamed: 0,Date,Mean Presure(hPa),Mean Maximum(deg.C),Mean(deg.C),Mean Minimum(deg.C),Web Bulb(deg.C),Dew Point(deg.C),Relative Humidity(%),Mean Daily Rainfall(mm),Amount of Cloud(%)
0,1 Jan,1020.1,19.3,17.0,15.2,14.3,11.8,73,1.2,52
1,2 Jan,1020.3,19.3,17.1,15.3,14.3,11.8,72,0.7,51
2,3 Jan,1020.3,19.3,17.1,15.3,14.3,11.8,72,0.8,51
3,4 Jan,1020.3,19.2,17.0,15.3,14.3,11.8,73,1.0,53
4,5 Jan,1020.4,19.2,17.0,15.2,14.3,11.8,73,1.1,54
...,...,...,...,...,...,...,...,...,...,...
361,27 Dec,1020.5,19.3,17.0,15.0,14.1,11.3,71,1.0,54
362,28 Dec,1020.3,19.1,16.9,14.9,14.1,11.4,72,1.4,55
363,29 Dec,1020.2,19.1,16.8,14.8,14.0,11.4,72,1.5,54
364,30 Dec,1020.2,19.1,16.8,14.9,14.0,11.5,72,1.4,53


In [51]:
# t2_df now holds the second table of all twelve pages, stacked in page order.
t2_df

Unnamed: 0,Date,Bright Sunshine Duration(hours),Prevailing Direction(degrees),Mean Speed(km/h),AM(deg. C),PM(deg. C)
0,1 Jan,5.3,060,24,18.3,18.6
1,2 Jan,5.5,070,25,18.2,18.5
2,3 Jan,5.4,070,25,18.1,18.4
3,4 Jan,5.3,070,26,18.0,18.3
4,5 Jan,5.4,070,26,17.9,18.2
...,...,...,...,...,...,...
361,27 Dec,5.1,070,26,18.6,18.9
362,28 Dec,4.8,070,26,18.6,18.9
363,29 Dec,4.9,070,25,18.4,18.8
364,30 Dec,5.1,060,25,18.4,18.7


In [52]:
# Join the two full-year tables on their shared "Date" column.
# Naming the key explicitly avoids relying on whichever columns happen to overlap,
# and validate="one_to_one" raises if a date is duplicated in either table.
tables = t1_df.merge(t2_df, on="Date", how="inner", validate="one_to_one")
tables

Unnamed: 0,Date,Mean Presure(hPa),Mean Maximum(deg.C),Mean(deg.C),Mean Minimum(deg.C),Web Bulb(deg.C),Dew Point(deg.C),Relative Humidity(%),Mean Daily Rainfall(mm),Amount of Cloud(%),Bright Sunshine Duration(hours),Prevailing Direction(degrees),Mean Speed(km/h),AM(deg. C),PM(deg. C)
0,1 Jan,1020.1,19.3,17.0,15.2,14.3,11.8,73,1.2,52,5.3,060,24,18.3,18.6
1,2 Jan,1020.3,19.3,17.1,15.3,14.3,11.8,72,0.7,51,5.5,070,25,18.2,18.5
2,3 Jan,1020.3,19.3,17.1,15.3,14.3,11.8,72,0.8,51,5.4,070,25,18.1,18.4
3,4 Jan,1020.3,19.2,17.0,15.3,14.3,11.8,73,1.0,53,5.3,070,26,18.0,18.3
4,5 Jan,1020.4,19.2,17.0,15.2,14.3,11.8,73,1.1,54,5.4,070,26,17.9,18.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,27 Dec,1020.5,19.3,17.0,15.0,14.1,11.3,71,1.0,54,5.1,070,26,18.6,18.9
362,28 Dec,1020.3,19.1,16.9,14.9,14.1,11.4,72,1.4,55,4.8,070,26,18.6,18.9
363,29 Dec,1020.2,19.1,16.8,14.8,14.0,11.4,72,1.5,54,4.9,070,25,18.4,18.8
364,30 Dec,1020.2,19.1,16.8,14.9,14.0,11.5,72,1.4,53,5.1,060,25,18.4,18.7


*There are also many other ways to solve this problem; you may have a look at each other's work.*