In [4]:
# 4 Major Python Libraries for Web Crawling
# (1) Pandas - Parsing HTML Tables
# (2) Requests - Fetching HTML Code
# (3) BeautifulSoup - Analyzing HTML Code
# (4) Selenium - Automating Browser Activities

# All labs in these lessons are meant for demonstrating web crawling techniques only.
# Please research and understand in detail the ethics and best practices for web crawling.
# e.g. https://sunscrapers.com/blog/web-crawling-scraping-best-practices/

In [15]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import random
import time

In [5]:
# Fetch the first listing page in a Selenium-driven Chrome session and
# collect every <h4 class="hp-listing__title"> element found on it.
# NOTE(review): newer Selenium releases expect the driver path to be passed
# via a Service object rather than positionally - confirm installed version.
url = "https://hkbeautyhub.com/page/1/?post_type=hp_listing&_category&s"
driver = webdriver.Chrome('./chromedriver')
try:
    driver.get(url)
    # Wait a random 2-5 s BEFORE reading the page so the delay both spaces
    # out requests and gives the page time to finish rendering.
    t = random.uniform(2, 5)
    time.sleep(t)
    html = driver.page_source
finally:
    # Always close the browser, even if loading the page failed.
    driver.quit()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's guessed-parser warning).
soup = BeautifulSoup(html, "html.parser")
title_table = soup.find_all("h4", {"class": "hp-listing__title"})
title_table

[<h4 class="hp-listing__title"><a href="https://hkbeautyhub.com/listing-shop/two-beauty/">Two Beauty</a>
 <i class="hp-listing__verified-badge hp-listing__verified hp-icon fas fa-check-circle" title="Verified"></i>
 </h4>,
 <h4 class="hp-listing__title"><a href="https://hkbeautyhub.com/listing-shop/glamour-beauty-pro/">Glamour Beauty Pro</a>
 <i class="hp-listing__verified-badge hp-listing__verified hp-icon fas fa-check-circle" title="Verified"></i>
 </h4>,
 <h4 class="hp-listing__title"><a href="https://hkbeautyhub.com/listing-shop/eternal-beaute/">Eternal Beaute</a>
 <i class="hp-listing__verified-badge hp-listing__verified hp-icon fas fa-check-circle" title="Verified"></i>
 </h4>,
 <h4 class="hp-listing__title"><a href="https://hkbeautyhub.com/listing-shop/mm-nail-eyelashes-beauty/">MM nail eyelashes beauty</a>
 <i class="hp-listing__verified-badge hp-listing__verified hp-icon fas fa-check-circle" title="Verified"></i>
 </h4>,
 <h4 class="hp-listing__title"><a href="https://hkbeauty

In [6]:
# Crawl listing pages 1-9 with one shared browser session and accumulate
# every <h4 class="hp-listing__title"> element into full_array (a plain list).
# NOTE(review): newer Selenium releases expect the driver path to be passed
# via a Service object rather than positionally - confirm installed version.
full_array = []
driver = webdriver.Chrome('./chromedriver')
try:
    for i in range(1, 10):
        print(i)  # progress indicator: page number currently being fetched
        url = "https://hkbeautyhub.com/page/" + str(i) + "/?post_type=hp_listing&_category&s"
        driver.get(url)
        # Random 2-5 s pause before reading the page: throttles the crawl and
        # lets the page finish rendering before the HTML is captured.
        t = random.uniform(2, 5)
        time.sleep(t)
        html = driver.page_source
        # Explicit parser keeps results identical across environments.
        soup = BeautifulSoup(html, "html.parser")
        title_table = soup.find_all("h4", {"class": "hp-listing__title"})
        # extend() appends in place instead of rebuilding the list each page.
        full_array.extend(title_table)
finally:
    # Always close the browser, even if a page load raised part-way through.
    driver.quit()

1
2
3
4
5
6
7
8
9


In [7]:
# Check which container type the accumulated results ended up in.
type(full_array)

list

In [8]:
# Inspect the first collected title element to see its HTML structure.
full_array[0]

<h4 class="hp-listing__title"><a href="https://hkbeautyhub.com/listing-shop/two-beauty/">Two Beauty</a>
<i class="hp-listing__verified-badge hp-listing__verified hp-icon fas fa-check-circle" title="Verified"></i>
</h4>

In [9]:
# Check the type of a single collected element.
type(full_array[0])

bs4.element.Tag

In [10]:
# List the direct child nodes of the first title element.
full_array[0].contents

[<a href="https://hkbeautyhub.com/listing-shop/two-beauty/">Two Beauty</a>,
 '\n',
 <i class="hp-listing__verified-badge hp-listing__verified hp-icon fas fa-check-circle" title="Verified"></i>,
 '\n']

In [11]:
# Take the first child node of the first title element.
full_array[0].contents[0]

<a href="https://hkbeautyhub.com/listing-shop/two-beauty/">Two Beauty</a>

In [12]:
# Read the "href" attribute of that first child node to get the listing URL.
full_array[0].contents[0]["href"]

'https://hkbeautyhub.com/listing-shop/two-beauty/'

In [13]:
# Extract the listing URL from every collected title element.
# Look the <a> tag up by name instead of assuming it is always the first
# child node, and skip any title that carries no link, so a stray whitespace
# node or a link-less heading cannot break (or corrupt) the extraction.
anchors = (title.find("a", href=True) for title in full_array)
hyperlink = [anchor["href"] for anchor in anchors if anchor is not None]
hyperlink

['https://hkbeautyhub.com/listing-shop/two-beauty/',
 'https://hkbeautyhub.com/listing-shop/glamour-beauty-pro/',
 'https://hkbeautyhub.com/listing-shop/eternal-beaute/',
 'https://hkbeautyhub.com/listing-shop/mm-nail-eyelashes-beauty/',
 'https://hkbeautyhub.com/listing-shop/giho/',
 'https://hkbeautyhub.com/listing-shop/2/',
 'https://hkbeautyhub.com/listing-shop/%e9%8f%a2%e9%9d%b6%e5%bc%8f%e6%b2%bb%e7%99%82%e5%b0%88%e5%ae%b6/',
 'https://hkbeautyhub.com/listing-shop/my-perfect-skin/',
 'https://hkbeautyhub.com/listing-shop/%e6%82%85%e9%ba%97%e9%96%a3%e7%be%8e%e5%ae%b9%e4%b8%ad%e5%bf%83/',
 'https://hkbeautyhub.com/listing-shop/maryland-beauty/',
 'https://hkbeautyhub.com/listing-shop/starnail-%e6%98%9f%e7%b4%9a%e7%be%8e%e7%94%b2/',
 'https://hkbeautyhub.com/listing-shop/rainbow-tattoo-beauty/',
 'https://hkbeautyhub.com/listing-shop/%e8%8f%b2%e8%8f%b2%e7%be%8e%e7%94%b2%e5%ba%97/',
 'https://hkbeautyhub.com/listing-shop/%e8%8e%8e%e8%8e%89/',
 'https://hkbeautyhub.com/listing-shop/%e5

In [16]:
# Build a one-column table of listing URLs, drop repeated rows, and
# renumber the remaining rows from 0.
df = (
    pd.DataFrame(hyperlink, columns=["Hyperlink"])
    .drop_duplicates()
    .reset_index(drop=True)
)
df

Unnamed: 0,Hyperlink
0,https://hkbeautyhub.com/listing-shop/two-beauty/
1,https://hkbeautyhub.com/listing-shop/glamour-b...
2,https://hkbeautyhub.com/listing-shop/eternal-b...
3,https://hkbeautyhub.com/listing-shop/mm-nail-e...
4,https://hkbeautyhub.com/listing-shop/giho/
...,...
193,https://hkbeautyhub.com/listing-shop/388-nails/
194,https://hkbeautyhub.com/listing-shop/slim-beau...
195,https://hkbeautyhub.com/listing-shop/lets-beau...
196,https://hkbeautyhub.com/listing-shop/tip-ntoe-hk/


In [139]:
# Write df to an Excel workbook in the working directory; index=False
# leaves the row index out of the file.
df.to_excel("Beauty_Hyperlinks.xlsx", index=False)