# Simple Web Crawler Implementation

The simple web crawler designed here is composed of four main modules:
* <b>Scheduler</b>: maintain a queue of URLs to visit
* <b>Downloader</b>: download web pages
* <b>Analyzer</b>: analyze content and links
* <b>Storage</b>: store content and metadata

## 1) Basic Downloader
Every web crawler should declare a <i>name</i> and identify its <i>owner</i> (i.e., the '`User-Agent`' and '`From`' fields of the request headers, respectively). Sometimes a request fails and you get an error message, caused for instance by a connection timeout or a page that was not found. You can print '`response.status_code`' to track down the problem.

In [None]:
import requests
from requests.exceptions import HTTPError

# Request headers passed to requests.get() by get_page(): 'User-Agent' names
# the crawler and 'From' gives a contact address for its owner.
headers = {
    'User-Agent': '6210506348',
    'From': 'natthakit.n@ku.th'
}
# URL downloaded by the example call at the end of this cell.
seed_url = 'https://www.ku.ac.th/th/'

def get_page(url):
    """Download `url` and return its body as lowercased text.

    The request is sent with the module-level `headers` and a 2-second
    timeout. Any failure (HTTP error status or any other exception) is
    reported with print() and an empty string is returned instead.
    """
    try:
        response = requests.get(url, headers=headers, timeout=2)
        # Turns 4xx/5xx status codes into an HTTPError
        response.raise_for_status()
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')  # Python 3.6
        return ''
    except Exception as err:
        print(f'Other error occurred: {err}')  # Python 3.6
        return ''

    print('Success!')
    return response.text.lower()

# Download the seed page and print its (lowercased) HTML text.
raw_html = get_page(seed_url)
print(raw_html)

## 2) Basic Analyzer
### 2.1 Link Parser
The following code is an example of a simple link parser. The program extracts all links by considering the <i>anchor</i> tag only, and stores them in a `urls` list.

In [None]:
def link_parser(raw_html):
    """Return the distinct link targets of the anchor tags in `raw_html`.

    Only anchors written exactly as ``<a href="...">`` are recognised.
    Empty targets are ignored and every link is reported once, in order of
    first appearance.

    raw_html -- HTML text to scan
    returns  -- list of link strings (possibly empty)
    """
    pattern_start = '<a href="'
    pattern_end = '"'
    urls = []
    seen = set()  # O(1) duplicate check; `urls` keeps the order
    index = 0
    while True:
        start = raw_html.find(pattern_start, index)
        # find() signals "no match" with -1; index 0 is a valid match, so an
        # anchor at the very beginning of the text must not be skipped.
        if start < 0:
            break
        start += len(pattern_start)
        end = raw_html.find(pattern_end, start)
        if end < 0:
            # Unterminated attribute: there is no complete link left to read
            break
        link = raw_html[start:end]
        if link and link not in seen:
            seen.add(link)
            urls.append(link)
        index = end
    return urls

# Try the parser on a small hand-written document containing two anchors.
raw_html = '<html><body><a href="http://test1.com">test1</a><br><a href="http://test2.com">test2</a></body></html>'
print(link_parser(raw_html))

### 2.2 URL Normalization
The following code is an example of using the `urljoin()` function to transform a relative URL into an absolute one.

In [None]:
from urllib.parse import urljoin

# Demonstrates urljoin() on one absolute and one relative link.

# Define an absolute (base) URL of a web page
base_url = 'https://mike.cpe.ku.ac.th'

# An example of the extracted absolute link
link_1 = 'http://www.ku.ac.th'
# An example of the extracted relative link
link_2 = 'download/homework.html'

# Resolve links: the absolute link comes back unchanged, the relative one
# is attached to the base URL
abs_link_1 = urljoin(base_url, link_1)
abs_link_2 = urljoin(base_url, link_2)

print(abs_link_1)  # -> http://www.ku.ac.th
print(abs_link_2)  # -> https://mike.cpe.ku.ac.th/download/homework.html

## 3) Basic Scheduler
The following code is an example of using a FIFO queue to handle the extracted URLs that are to be downloaded next. In particular, the main crawling process simply invokes the two previously defined functions, i.e., `get_page()` and `link_parser()`, to download a web page and then extract its outgoing links, respectively. Then, all extracted links are stored in a queue. We define two queues here: `frontier_q` and `visited_q`. The former is used as the FIFO queue that keeps the URLs to download next, while the latter is used to remember which web pages have already been downloaded.

In [None]:
# The crawl starts with the seed URL as the only entry of the frontier.
seed_url = 'https://www.ku.ac.th/th/'
frontier_q = [seed_url]  # URLs waiting to be downloaded (read from the front)
visited_q = []  # URLs that have already been taken from the frontier

# param 'links' is a list of extracted links to be stored in the queue
def enqueue(links):
    """Append the links that are not yet known to the end of `frontier_q`.

    Every link is first resolved against `seed_url`; the resolved URL is
    what is compared with `frontier_q` and `visited_q`, so a relative link
    can neither be queued twice nor re-queued after it has been visited.

    links -- list of extracted links (absolute or relative)
    """
    for link in links:
        absolute = urljoin(seed_url, link)
        if absolute not in frontier_q and absolute not in visited_q:
            frontier_q.append(absolute)

# FIFO queue
def dequeue():
    """Remove and return the oldest URL of `frontier_q`.

    The global queue is rebound to a new list without its first element.
    Raises IndexError when the queue is empty.
    """
    global frontier_q
    head, frontier_q = frontier_q[0], frontier_q[1:]
    return head

#--- main process ---#
# One crawl step: take the next URL from the frontier, mark it as visited,
# download it, and queue the links found on the page.
current_url = dequeue()
visited_q.append(current_url)
raw_html = get_page(current_url)
extracted_links = link_parser(raw_html)
enqueue(extracted_links)
print(frontier_q)

## 4) Storing Text into a File
In the following, we first use the `os.makedirs()` function to create (sub)directories. Notice that the `exist_ok=True` parameter is set to prevent an exception if the target directory already exists. Then, we use the `open()`, `write()`, and `close()` functions to open a file, write some text into that file, and afterwards close it. In addition, we import the `codecs` module and use the '`utf-8`' encoding for non-English content.

In [None]:
import os, codecs

# Create (sub)directories with the 0o755 permission
# param 'exist_ok' is True for no exception if the target directory already exists
path = 'html/subdir1/subdir2'
os.makedirs(path, 0o755, exist_ok=True)

# Write content into a file
# NOTE: the sample document assigned first is overwritten right away by the
# page downloaded on the next line; only the downloaded text is written.
raw_html = '<html><body><a href="http://test1.com">test1</a><br><a href="http://test2.com">test2</a></body></html>'
raw_html = get_page('http://sis.ku.ac.th/')
abs_file = path + '/index2' + '.html'
# Open through codecs so the text is encoded as UTF-8 on disk
f = codecs.open(abs_file, 'w', 'utf-8')
f.write(raw_html)
f.close()

In [None]:
from urllib.parse import urlparse

# Split a URL into its components and derive a directory and a file name.
# NOTE(review): this URL has no scheme ('https://'), so urlparse() is expected
# to leave netloc empty and put the host name into path -- confirm the output.
url = 'www.ku.ac.th/th/scholarships?category=120#kuyraisas'
result = urlparse(url)

print(result)
print(result.path)

# Directory: 'html/' + host + the path up to (not including) its last '/'
filepath = 'html/' + result.netloc + result.path[:result.path.rfind('/')]
print(filepath)

# File name: the last path segment, followed by the query string and the
# fragment when they are present
filename = result.path[result.path.rfind('/')+1:] 
if result.query != '':
  filename = filename + '?' + result.query
if result.fragment != '':
  filename = filename + '#' + result.fragment
# Placeholder name when nothing is left to name the file
if filename == '':
  filename = 'dummy'

        
print(filename)

# <font color="blue">Your Turn ...</font>
Write a web crawler to collect 10,000 web pages (including only '`.htm`' and '`.html`' files) within the '`ku.ac.th`' domain.

In [None]:
# Base URL that enqueue() resolves every extracted link against.
seed_url = 'https://cooking.kapook.com/'

In [None]:
# Number of pages saved so far by the crawl loop
i=0
# Start URLs; the crawl loop takes URLs from the front of this list
frontier_q = ['https://nlovecooking.com','https://krua.co','https://cooking.kapook.com','https://cookpad.com/th']
# URLs that have already been taken from the frontier and downloaded
visited_q = []
# URLs whose pages were written to disk
downloaded = []
# Thai food-related keywords; the crawl loop saves a page only when at least
# three of them occur in its text
KEY_WORD = ('วัตถุดิบ','แคลอรี่','อาหาร','เมนู','ของกิน','กับข้าว','รสชาติ','อร่อย','เครื่องเคียง','ของว่าง','เครื่องดื่ม','ขนม')

In [None]:
from urllib.parse import urlparse

# Request headers read by get_page(): 'User-Agent' names the crawler and
# 'From' gives a contact address for its owner.
headers = {
    'User-Agent': '6210506348',
    'From': 'natthakit.n@ku.th'
}


# seed_url = 'https://www.ku.ac.th/th/'
# seed_url = 'www.ku.ac.th/?q=th/node/add/pre-register.html'

def link_parser(raw_html):
    """Return the distinct link targets of the anchor tags in `raw_html`.

    Only anchors written exactly as ``<a href="...">`` are recognised.
    Empty targets are ignored and every link is reported once, in order of
    first appearance.

    raw_html -- HTML text to scan
    returns  -- list of link strings (possibly empty)
    """
    pattern_start = '<a href="'
    pattern_end = '"'
    urls = []
    seen = set()  # O(1) duplicate check; `urls` keeps the order
    index = 0
    while True:
        start = raw_html.find(pattern_start, index)
        # find() signals "no match" with -1; index 0 is a valid match, so an
        # anchor at the very beginning of the text must not be skipped.
        if start < 0:
            break
        start += len(pattern_start)
        end = raw_html.find(pattern_end, start)
        if end < 0:
            # Unterminated attribute: there is no complete link left to read
            break
        link = raw_html[start:end]
        if link and link not in seen:
            seen.add(link)
            urls.append(link)
        index = end
    return urls

def enqueue(links, base_url=None):
    """Append the links that are not yet known to the end of `frontier_q`.

    Each link is made absolute with urljoin() before it is compared with
    `frontier_q` and `visited_q`.

    links    -- list of extracted links (absolute or relative)
    base_url -- URL of the page the links were found on; relative links are
                resolved against it. When omitted, the global `seed_url` is
                used, which is only correct for links taken from that page.
    """
    base = seed_url if base_url is None else base_url
    for link in links:
        absolute = urljoin(base, link)
        if absolute not in frontier_q and absolute not in visited_q:
            frontier_q.append(absolute)

def dequeue():
    """Remove and return the oldest URL of `frontier_q`.

    The global queue is rebound to a new list without its first element.
    Raises IndexError when the queue is empty.
    """
    global frontier_q
    head, frontier_q = frontier_q[0], frontier_q[1:]
    return head

# Number of saved pages after which the crawl stops.
MAX_PAGES = 10000
# Pages on these sites are still fetched for their links but are never saved.
SKIPPED_SITES = ('facebook', 'youtube', 'google', 'instagram', 'twitter')

# Loop on the frontier itself so that an exhausted queue ends the crawl
# instead of raising IndexError inside dequeue().
while frontier_q:
    current_url = dequeue()
    if 'download' in current_url or '.pdf' in current_url:
        continue

    # Map the URL to 'html/<host>/<directories>' plus a file name
    result = urlparse(current_url)
    last_slash = result.path.rfind('/')
    filepath = 'html/' + result.netloc + result.path[:last_slash]
    filename = result.path[last_slash + 1:]

    # URLs carrying a query string or a fragment are not crawled
    if result.query != '' or result.fragment != '':
        continue

    if filename == '':
        filename = 'dummy'

    if len(filename) > 50:
        continue

    # Names with an extension are accepted only for '.html' and '.htm'
    has_html_suffix = filename.endswith(('.html', '.htm'))
    if '.' in filename and not has_html_suffix:
        continue

    print('#', i+1)
    visited_q.append(current_url)
    raw_html = get_page(current_url)
    extracted_links = link_parser(raw_html)

    # Resolve relative links against the page they were found on before
    # queueing them; absolute URLs pass through enqueue() unchanged.
    enqueue([urljoin(current_url, link) for link in extracted_links])

    # Decide whether to keep the page BEFORE touching the disk, so rejected
    # pages leave neither empty files nor open file handles behind.
    if sum(1 for x in KEY_WORD if x in raw_html) < 3:
        continue
    if any(site in current_url for site in SKIPPED_SITES):
        continue

    try:
        os.makedirs(filepath, 0o755, exist_ok=True)
    except (OSError, ValueError):
        # The path cannot be created on this file system; skip the page
        continue

    abs_file = filepath + '/' + filename
    if not has_html_suffix:
        abs_file = abs_file + '.html'

    try:
        with codecs.open(abs_file, 'w', 'utf-8') as f:
            f.write(raw_html)
    except (OSError, ValueError):
        continue

    print('current_url =',current_url)
    print('filepath =',filepath)
    print('filename =',filename)
    print('abs_file =',abs_file)
    downloaded.append(current_url)

    i += 1
    if i == MAX_PAGES:
        break

  
    
    # print(frontier_q)

In [None]:
# Display the URLs that are still waiting in the frontier
frontier_q

In [None]:
# Save the URLs of all stored pages, one per line.
with open('./downloaded.txt', 'w') as writefile:
    for d in downloaded:
        print(d, file=writefile)

In [None]:
# Reload the saved URL list: one entry per line, newline characters removed.
# The file is closed by the `with` block, and the comprehension avoids an
# index loop that would overwrite the crawl counter `i`.
with open("./downloaded.txt", "r") as my_file:
    content = [line.replace('\n', '') for line in my_file]
# Only the last expression of a cell is displayed, so print the count
print(len(content))
content

In [None]:
# Collect the distinct 'scheme://host' prefixes of the downloaded URLs,
# in order of first appearance.
hostname = []
for url in content:
    result = urlparse(url)
    h = '{}://{}'.format(result.scheme, result.netloc)
    if h in hostname:
        continue
    hostname.append(h)

# Save the host list, one entry per line.
with open('./hostname.txt', 'w') as writefile:
    for d in hostname:
        print(d, file=writefile)

In [None]:
# Display the collected host prefixes
hostname

In [None]:
# Fetch '/robots.txt' from every host and classify the result.
# get_page() returns lowercased text, so the checks are case-insensitive,
# and it returns '' when the download failed.
list_robot = []          # hosts whose robots.txt contains 'user-agent'
list_sitemap = []        # hosts whose robots.txt contains 'sitemap'
list_success_robot = []  # hosts whose robots.txt download returned content
for h in hostname:
    raw_html = get_page(h + '/robots.txt')

    if 'user-agent' in raw_html:
        list_robot.append(h)

    # An empty string is a substring of every string, so the download has to
    # be tested for non-emptiness rather than with "'' not in raw_html".
    if raw_html:
        list_success_robot.append(h)

    if 'sitemap' in raw_html:
        list_sitemap.append(h)

In [None]:
# Display the hosts whose robots.txt contains 'user-agent'
list_robot

In [None]:
# Download robots.txt again for every host in list_robot and store it as
# 'html/<host>/robots.txt'.
for lr in list_robot:
    robots_url = lr + '/robots.txt'
    raw_html = get_page(robots_url)
    result = urlparse(robots_url)
    last_slash = result.path.rfind('/')
    filepath = 'html/' + result.netloc + result.path[:last_slash]
    filename = result.path[last_slash + 1:]

    abs_file = filepath + '/' + filename
    print(abs_file)
    os.makedirs(filepath, 0o755, exist_ok=True)
    # The context manager closes the file even when write() raises
    with codecs.open(abs_file, 'w', 'utf-8') as f:
        f.write(raw_html)

In [None]:
# Save the hosts that have a robots.txt, one per line.
with open('./list_robots.txt', 'w') as writefile:
    for d in list_robot:
        print(d, file=writefile)

# Save the hosts whose robots.txt mentions a sitemap, one per line.
with open('./list_sitemap.txt', 'w') as writefile:
    for d in list_sitemap:
        print(d, file=writefile)

In [None]:
# Clean-up (shell command): delete the 'html' output directory and everything in it
!rm -rf html