In [21]:
# 4 Major Python Libraries for Web Crawling
# (1) Pandas - Parsing HTML Tables
# (2) Request - Parsing HTML Codes
# (3) BeautifulSoup - Analyzing HTML Codes
# (4) Selenium - Automating Browser Activities

# All labs in these lessons are meant for demonstrating web crawling techniques only.
# Please Google and try to understand in details the ethics and best practice for web crawling.
# e.g. https://sunscrapers.com/blog/web-crawling-scraping-best-practices/

In [22]:
import pandas as pd
import numpy as np
from selenium import webdriver
from bs4 import BeautifulSoup
import re

In [23]:
driver = webdriver.Chrome('./chromedriver')
url = "http://www.etnet.com.hk/www/tc/stocks/industry_adu.php"
driver.get(url)
html = driver.page_source
driver.quit()
df = pd.read_html(html)

In [24]:
df[0]

Unnamed: 0,0,1,2,3,4,5,6
0,行業,平均升/跌幅,上升/下跌/不變/無成交股票,成交金額,佔大市%,上日成交額,佔大市%
1,石油及天然氣,-0.562%,1322411,2463995519,2.389,2822348084,2.974
2,13,22,4,11,,,
3,煤炭,-1.712%,61223,1013858374,0.983,1215029237,1.280
4,6,12,2,3,,,
...,...,...,...,...,...,...,...
62,1,11,2,2,,,
63,綜合企業,-1.390%,51336,590486404,0.572,808553487,0.852
64,5,13,3,6,,,
65,其他,-0.222%,15,3421250,0.003,0,0.000


<h2>Technique: Data Cleansing</h2>

In [28]:
# Drop irrelevant rows that were results of formatting
HK_industry = df[0].dropna(subset=[6])
HK_industry

Unnamed: 0,0,1,2,3,4,5,6
0,行業,平均升/跌幅,上升/下跌/不變/無成交股票,成交金額,佔大市%,上日成交額,佔大市%
1,石油及天然氣,-0.562%,1322411,2463995519,2.389,2822348084,2.974
3,煤炭,-1.712%,61223,1013858374,0.983,1215029237,1.280
5,黃金及貴金屬,+0.370%,4244,425822726,0.413,358771842,0.378
7,一般金屬及礦石,-0.427%,1922520,607063762,0.588,614162866,0.647
9,紙及林業產品,-1.219%,3536,62744422,0.061,90573733,0.095
11,化學製品,+0.026%,119210,66205171,0.064,64576580,0.068
13,工業,-0.470%,31442039,1256202583,1.218,1400471279,1.476
15,汽車,-0.517%,21311012,5190492155,5.032,4236241936,4.464
17,電器及消閒電子產品,+0.217%,1311612,785067702,0.761,803705299,0.847


In [29]:
# Rename columns
HK_industry.columns = ["行業", "平均升/跌幅", "上升/下跌/不變/無成交股票",
                       "成交金額", "成交金額佔大市%", "上日成交額", "上日成交額佔大市%"]
HK_industry

Unnamed: 0,行業,平均升/跌幅,上升/下跌/不變/無成交股票,成交金額,成交金額佔大市%,上日成交額,上日成交額佔大市%
0,行業,平均升/跌幅,上升/下跌/不變/無成交股票,成交金額,佔大市%,上日成交額,佔大市%
1,石油及天然氣,-0.562%,1322411,2463995519,2.389,2822348084,2.974
3,煤炭,-1.712%,61223,1013858374,0.983,1215029237,1.280
5,黃金及貴金屬,+0.370%,4244,425822726,0.413,358771842,0.378
7,一般金屬及礦石,-0.427%,1922520,607063762,0.588,614162866,0.647
9,紙及林業產品,-1.219%,3536,62744422,0.061,90573733,0.095
11,化學製品,+0.026%,119210,66205171,0.064,64576580,0.068
13,工業,-0.470%,31442039,1256202583,1.218,1400471279,1.476
15,汽車,-0.517%,21311012,5190492155,5.032,4236241936,4.464
17,電器及消閒電子產品,+0.217%,1311612,785067702,0.761,803705299,0.847


In [30]:
# Drop irrelevant header row
HK_industry = HK_industry.drop(0)
HK_industry

Unnamed: 0,行業,平均升/跌幅,上升/下跌/不變/無成交股票,成交金額,成交金額佔大市%,上日成交額,上日成交額佔大市%
1,石油及天然氣,-0.562%,1322411,2463995519,2.389,2822348084,2.974
3,煤炭,-1.712%,61223,1013858374,0.983,1215029237,1.28
5,黃金及貴金屬,+0.370%,4244,425822726,0.413,358771842,0.378
7,一般金屬及礦石,-0.427%,1922520,607063762,0.588,614162866,0.647
9,紙及林業產品,-1.219%,3536,62744422,0.061,90573733,0.095
11,化學製品,+0.026%,119210,66205171,0.064,64576580,0.068
13,工業,-0.470%,31442039,1256202583,1.218,1400471279,1.476
15,汽車,-0.517%,21311012,5190492155,5.032,4236241936,4.464
17,電器及消閒電子產品,+0.217%,1311612,785067702,0.761,803705299,0.847
19,紡織、服裝及配飾,+0.529%,39301540,2099564287,2.035,2506992725,2.642


In [31]:
# Drop irrelevant columns
HK_industry = HK_industry.drop(["上升/下跌/不變/無成交股票"], axis=1)
HK_industry

Unnamed: 0,行業,平均升/跌幅,成交金額,成交金額佔大市%,上日成交額,上日成交額佔大市%
1,石油及天然氣,-0.562%,2463995519,2.389,2822348084,2.974
3,煤炭,-1.712%,1013858374,0.983,1215029237,1.28
5,黃金及貴金屬,+0.370%,425822726,0.413,358771842,0.378
7,一般金屬及礦石,-0.427%,607063762,0.588,614162866,0.647
9,紙及林業產品,-1.219%,62744422,0.061,90573733,0.095
11,化學製品,+0.026%,66205171,0.064,64576580,0.068
13,工業,-0.470%,1256202583,1.218,1400471279,1.476
15,汽車,-0.517%,5190492155,5.032,4236241936,4.464
17,電器及消閒電子產品,+0.217%,785067702,0.761,803705299,0.847
19,紡織、服裝及配飾,+0.529%,2099564287,2.035,2506992725,2.642


In [32]:
# Reset index
HK_industry.reset_index(inplace=True, drop=True)
HK_industry

Unnamed: 0,行業,平均升/跌幅,成交金額,成交金額佔大市%,上日成交額,上日成交額佔大市%
0,石油及天然氣,-0.562%,2463995519,2.389,2822348084,2.974
1,煤炭,-1.712%,1013858374,0.983,1215029237,1.28
2,黃金及貴金屬,+0.370%,425822726,0.413,358771842,0.378
3,一般金屬及礦石,-0.427%,607063762,0.588,614162866,0.647
4,紙及林業產品,-1.219%,62744422,0.061,90573733,0.095
5,化學製品,+0.026%,66205171,0.064,64576580,0.068
6,工業,-0.470%,1256202583,1.218,1400471279,1.476
7,汽車,-0.517%,5190492155,5.032,4236241936,4.464
8,電器及消閒電子產品,+0.217%,785067702,0.761,803705299,0.847
9,紡織、服裝及配飾,+0.529%,2099564287,2.035,2506992725,2.642


In [33]:
# Convert variables from string to float for further processing (if necessary)
HK_industry["成交金額"] = pd.to_numeric(HK_industry["成交金額"], errors="coerce")
HK_industry["成交金額佔大市%"] = pd.to_numeric(HK_industry["成交金額佔大市%"], errors="coerce")
HK_industry["上日成交額"] = pd.to_numeric(HK_industry["上日成交額"], errors="coerce")
HK_industry["上日成交額佔大市%"] = pd.to_numeric(HK_industry["上日成交額佔大市%"], errors="coerce")
HK_industry

Unnamed: 0,行業,平均升/跌幅,成交金額,成交金額佔大市%,上日成交額,上日成交額佔大市%
0,石油及天然氣,-0.562%,2463995519,2.389,2822348084,2.974
1,煤炭,-1.712%,1013858374,0.983,1215029237,1.28
2,黃金及貴金屬,+0.370%,425822726,0.413,358771842,0.378
3,一般金屬及礦石,-0.427%,607063762,0.588,614162866,0.647
4,紙及林業產品,-1.219%,62744422,0.061,90573733,0.095
5,化學製品,+0.026%,66205171,0.064,64576580,0.068
6,工業,-0.470%,1256202583,1.218,1400471279,1.476
7,汽車,-0.517%,5190492155,5.032,4236241936,4.464
8,電器及消閒電子產品,+0.217%,785067702,0.761,803705299,0.847
9,紡織、服裝及配飾,+0.529%,2099564287,2.035,2506992725,2.642


In [34]:
soup = BeautifulSoup(html)
anchor_list = soup.find_all("a", href=re.compile("industry_detail"))
anchor_list

[<a href="industry_detail.php?nature=ONG&amp;subtype=all">石油及天然氣</a>,
 <a href="industry_detail.php?nature=COA&amp;subtype=all">煤炭</a>,
 <a href="industry_detail.php?nature=MIN&amp;subtype=all">黃金及貴金屬</a>,
 <a href="industry_detail.php?nature=MET&amp;subtype=all">一般金屬及礦石</a>,
 <a href="industry_detail.php?nature=BAM&amp;subtype=all">紙及林業產品</a>,
 <a href="industry_detail.php?nature=CHE&amp;subtype=all">化學製品</a>,
 <a href="industry_detail.php?nature=IDG&amp;subtype=all">工業</a>,
 <a href="industry_detail.php?nature=AUT&amp;subtype=all">汽車</a>,
 <a href="industry_detail.php?nature=HGE&amp;subtype=all">電器及消閒電子產品</a>,
 <a href="industry_detail.php?nature=TNC&amp;subtype=all">紡織、服裝及配飾</a>,
 <a href="industry_detail.php?nature=FNB&amp;subtype=all">食物及飲料</a>,
 <a href="industry_detail.php?nature=HNP&amp;subtype=all">醫療保健</a>,
 <a href="industry_detail.php?nature=AGP&amp;subtype=all">農業產品</a>,
 <a href="industry_detail.php?nature=JNW&amp;subtype=all">鐘表珠寶</a>,
 <a href="industry_detail.php?natur

In [35]:
type(anchor_list[0])

bs4.element.Tag

In [36]:
# Getting the attribute from a BeautifulSoup tag
anchor_list[0]["href"]

'industry_detail.php?nature=ONG&subtype=all'

In [37]:
# Getting the text from a BeautifulSoup tag
anchor_list[0].text

'石油及天然氣'

In [38]:
url_front = "http://www.etnet.com.hk/www/tc/stocks/"
link_list = []
industry_name_list = []

for i in range(0, len(anchor_list)):
    
    url_end = anchor_list[i]["href"]
    url = url_front + url_end
    link_list.append(url)
    
    industry_name = anchor_list[i].text
    industry_name_list.append(industry_name)
    
dg = pd.DataFrame()
dg["行業"] = industry_name_list
dg["Hyperlink"] = link_list

dg

Unnamed: 0,行業,Hyperlink
0,石油及天然氣,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=ONG&subtype=all
1,煤炭,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=COA&subtype=all
2,黃金及貴金屬,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=MIN&subtype=all
3,一般金屬及礦石,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=MET&subtype=all
4,紙及林業產品,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=BAM&subtype=all
5,化學製品,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=CHE&subtype=all
6,工業,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=IDG&subtype=all
7,汽車,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=AUT&subtype=all
8,電器及消閒電子產品,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=HGE&subtype=all
9,紡織、服裝及配飾,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=TNC&subtype=all


In [39]:
merged = pd.merge(HK_industry, dg)
merged.sort_values(by=["平均升/跌幅"], ascending=False, inplace=True, ignore_index=True)

In [40]:
merged

Unnamed: 0,行業,平均升/跌幅,成交金額,成交金額佔大市%,上日成交額,上日成交額佔大市%,Hyperlink
0,半導體,-3.052%,741595365,0.719,731755342,0.771,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=SEM&subtype=all
1,醫療保健,-1.788%,11482910474,11.132,5855573101,6.17,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=HNP&subtype=all
2,鐘表珠寶,-1.772%,54910352,0.053,65167622,0.069,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=JNW&subtype=all
3,零售,-1.737%,122387261,0.119,151061036,0.159,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=RET&subtype=all
4,煤炭,-1.712%,1013858374,0.983,1215029237,1.28,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=COA&subtype=all
5,軟件服務,-1.504%,18672332822,18.101,15411998010,16.239,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=SNS&subtype=all
6,綜合企業,-1.390%,590486404,0.572,808553487,0.852,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=CGM&subtype=all
7,紙及林業產品,-1.219%,62744422,0.061,90573733,0.095,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=BAM&subtype=all
8,保險,-1.158%,4088419936,3.963,3487743543,3.675,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=INS&subtype=all
9,其他金融,-0.966%,2676072361,2.594,2074862361,2.186,http://www.etnet.com.hk/www/tc/stocks/industry_detail.php?nature=OTF&subtype=all
