In [36]:
# https://resources.oreilly.com/examples/9780596529321/tree/master

## Scraping a Page

We can scrape HTML pages using urllib3. 

In [37]:
import urllib3

http = urllib3.PoolManager()
url = 'https://en.wikipedia.org/wiki/Programming_language'
# ! The example link is deprecated.
# 'http://kiwitobes.com/wiki/Programming_language.html'
r = http.request('GET', url)
r.status, r.data[:500]



(200,
 b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Programming language - Wikipedia</title>\n<script>document.documentElement.className=document.documentElement.className.replace(/(^|\\s)client-nojs(\\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Programming_language","wgTitle":"Programming language","wgCurRevisionId":901931155,"wgRevisionId":901931155,"wgArticleId":23015')

## Parsing HTML

Once we obtained the HTML page, use BeautifulSoup to parse the page and get the text as well as the links.

In [38]:
from bs4 import BeautifulSoup

# Parse the HTML content of the page.
soup = BeautifulSoup(r.data, 'html.parser')
body = soup.body
body.get_text()[:50]

'\n\n\n\n\n\n\n\n\nProgramming language\n\nFrom Wikipedia, the'

## Parsing Text

Lowercase all text and split them by white spaces.

In [39]:
import re

def separate_words(text):
    """Separate the words by non-whitespace character."""
    splitter = re.compile('\W+')
    return [s.lower()                     # Lowercase the words
            for s in splitter.split(text) # for each splitted text
            if s != '' and len(s) > 3]   

In [40]:
separate_words(body.get_text())[:20]

['programming',
 'language',
 'from',
 'wikipedia',
 'free',
 'encyclopedia',
 'this',
 'latest',
 'accepted',
 'revision',
 'reviewed',
 'june',
 '2019',
 'jump',
 'navigation',
 'jump',
 'search',
 'language',
 'designed',
 'communicate']

## Get links

To scrape the website recursively, we get all the links and repeat the process above.

In [41]:
links = body.find_all('a')
links[:10]

[<a id="top"></a>,
 <a href="/wiki/Wikipedia:Protection_policy#pending" title="All edits by unregistered and new users are subject to review prior to becoming visible to unregistered users"><img alt="Page protected with pending changes" data-file-height="512" data-file-width="512" decoding="async" height="20" src="//upload.wikimedia.org/wikipedia/en/thumb/b/b7/Pending-protection-shackle.svg/20px-Pending-protection-shackle.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/b/b7/Pending-protection-shackle.svg/30px-Pending-protection-shackle.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/b/b7/Pending-protection-shackle.svg/40px-Pending-protection-shackle.svg.png 2x" width="20"/></a>,
 <a href="/wiki/Wikipedia:Pending_changes" title="Wikipedia:Pending changes">latest accepted revision</a>,
 <a class="external text" href="//en.wikipedia.org/w/index.php?title=Special:Log&amp;type=review&amp;page=Programming_language">reviewed</a>,
 <a class="mw-jump-link" href="#mw-head">Jum

## Rebuilding the links

Note that the links are relative - we need to build back the full url path

In [42]:
from urllib.parse import urljoin

fullurls = set()
for link in links:
    fullurl = urljoin(url, link.get('href'))
    fullurl = fullurl.split('#')[0] # Remove location portion from the URL.
    fullurl = fullurl.split('?')[0] # Remove querystring portion from the URL.
    fullurls.add(fullurl)
    
list(fullurls)[:10]

['https://uz.wikipedia.org/wiki/Programmalash_tili',
 'https://en.wikipedia.org/wiki/Library_(computing)',
 'https://hsb.wikipedia.org/wiki/Program%C4%9Browanske_r%C4%9B%C4%8De',
 'https://en.wikipedia.org/wiki/Undecidable_problem',
 'https://en.wikipedia.org/wiki/Michael_Sipser',
 'http://www.computerweekly.com/Articles/2007/09/11/226631/sslcomputer-weekly-it-salary-survey-finance-boom-drives-it-job.htm',
 'https://en.wikipedia.org/wiki/Troff',
 'http://www.apl.jhu.edu/~hall/Lisp-Notes/Macros.html',
 'https://www.ibm.com/developerworks/library/os-erlang1/index.html',
 'https://en.wikipedia.org/wiki/Formal_language']

In [43]:
import nltk

# Prepare the english stopwords.
stopwords = nltk.corpus.stopwords.words('english')
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [44]:
import re
import urllib3
from bs4 import BeautifulSoup # pip3 install beautifulsoup4
from urllib.parse import urljoin
import sqlite3

http = urllib3.PoolManager()
url = 'https://en.wikipedia.org/wiki/Programming_language'

class Crawler:
    def __init__(self, db):
        """Initialize the crawler with the name of database."""
        self.conn = sqlite3.connect(db)
    
    def __del__(self):
        """Terminates the database connection."""
        self.conn.close()
    
    def get_entry_id(self, table, field, value, create_new = True):
        """Get an entry_id from a table, or create it if it does not exist."""
        c = self.conn.cursor()
        
        # ! We can't use dynamic table name for sqlite3.
        res = c.execute(f'select rowid from {table} where {field} = ?', (value,)).fetchone()
        
        # The entry exists, returns the rowid.
        if res != None:
            # Execute returns a single sequence, or None.
            return res[0]

        # Else, insert the entry.
        r = c.execute(f'insert into {table} ({field}) values (?)', (value,))
        self.conn.commit()
        
        # Get the inserted rowid.
        return r.lastrowid

    def add_to_index(self, url, soup):
        """Index an individual page."""
        # Skip if the page has been indexed.
        if self.is_indexed(url): return
        
        print(f'indexing {url}')
        
        # Get the individual words.
        words = self.separate_words(soup.get_text())
        
        # Get the URL id which matches the current url in the database.
        url_id = self.get_entry_id('urllist', 'url', url)
        
        # Link each word to this url.
        for i, word in enumerate(words):
            word_id = self.get_entry_id('wordlist', 'word', word)
            stmt = 'insert into wordlocation(url_id, word_id, location) values (?, ?, ?)'
            self.conn.execute(stmt, (url_id, 
                                     word_id, 
                                     i))

    def separate_words(self, text):
        """Separate the words by non-whitespace character."""
        splitter = re.compile('\W+')
        return [s.lower()                     # Lowercase the words
                for s in splitter.split(text) # for each splitted text
                if len(s) > 3                 # at least 3 characters
                and s not in stopwords]    # and not in the stopwords list.
    
    # Return true if this url is already indexed.
    def is_indexed(self, url):
        c = self.conn.cursor()
        res = c.execute('select url from urllist where url = ?', (url,)).fetchone()
        if res != None:
            url = res[0]
            # Check if it has actually been crawled.
            res = c.execute('select * from wordlocation where url_id = ?', (url,)).fetchone()
            if res != None: return True
        return False
    
    # Add a link between pages.
    def add_link_ref(self, url_from, url_to, link_text):
        words = self.separate_words(link_text)
        from_id = self.get_entry_id('urllist', 'url', url_from)
        to_id = self.get_entry_id('urllist', 'url', url_to)
        if from_id == to_id: return
        
        c = self.conn.cursor()
        res = c.execute('insert into link(from_id,to_id) values (?, ?)', (from_id, to_id))
        link_id = res.lastrowid
        for word in words:
            if word in stopwords: continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            c.execute('insert into linkwords(link_id, word_id) values (?, ?)', (link_id, word_id))
        self.conn.commit()

    # Starting with a list of pages, do a breadth first search to the given depth,
    # indexing pages as we go.
    def crawl(self, pages, depth = 2):
        # count = 0
        # TODO: Implement max count so that the page doesn't crawl indefinitely.
        # max_count = 1000
        visited = set()
        for i in range(depth):
            new_pages = set()
            for page in pages:
                
                if page in visited: continue # Skip visited page.
                visited.add(page)            # Set page as visited.

                try:
                    r = http.request('GET', page)
                    # r.status, r.data 
                    print(f'loaded page {page} {r.status}')
                except:
                    print(f'could not open page {page}')
                    continue
                
                # Parse the HTML content of the page.
                soup = BeautifulSoup(r.data, 'html.parser')
                
                # Index the HTML page. Take only the body.
                self.add_to_index(page, soup.body)
                
                # Find all the href links in the page.
                links = soup.body.find_all('a')
                
                # For each link, rebuild the full URL based on the base URL.
                for link in links:
                    url = urljoin(page, link.get('href'))
                    url = url.split('#')[0] # Remove location portion from the URL.
                    url = url.split('?')[0] # Remove querystring portion from the URL.
                    if url in visited: continue # Skip if visited.

                    # If the link starts with http (presumably a valid link) and it's not yet indexed...
                    if url[0:4] == 'http' and not self.is_indexed(url):
                        new_pages.add(url) # Add to the list of URLs to scrape.
                    link_text = link.get_text() # Get the link text without the tags.
                    
                    # Add the reference from the link to the text.
                    self.add_link_ref(page, url, link_text)
                self.conn.commit()
            pages = new_pages
                
    # Create the database tables.
    def create_index_tables(self):
        self.conn.execute('create table if not exists urllist(url)')
        self.conn.execute('create table if not exists wordlist(word)')
        self.conn.execute('create table if not exists wordlocation(url_id, word_id, location)')
        self.conn.execute('create table if not exists link(from_id integer, to_id integer)')
        self.conn.execute('create table if not exists linkwords(word_id, link_id)')
        self.conn.execute('create index if not exists wordidx on wordlist(word)')
        self.conn.execute('create index if not exists urlidx on urllist(url)')
        self.conn.execute('create index if not exists wordurlidx on wordlocation(word_id)')
        self.conn.execute('create index if not exists urltoidx on link(to_id)')
        self.conn.execute('create index if not exists urlfromidx on link(from_id)')
        self.conn.commit()

In [45]:
crawler = Crawler('searchindex.db')
crawler.create_index_tables()

In [46]:
%skip
crawler.crawl([
    'https://en.wikipedia.org/wiki/Programming_language',
    'https://en.wikipedia.org/wiki/Categorical_list_of_programming_languages.html',
    'https://en.wikipedia.org/wiki/Functional_programming'
])

UsageError: Line magic function `%skip` not found.


In [47]:
# [row for row in crawler.conn.execute('select rowid from wordlocation where word_id = 1')][:10]
crawler.conn.execute('select count(rowid) from wordlocation where word_id = 1').fetchone()

(1798,)

In [48]:
class Searcher:
    def __init__(self, db):
        self.conn = sqlite3.connect(db)
    
    def __del__(self):
        self.conn.close()

    def get_match_rows(self, q):
        # Strings to build the query.
        fieldlist = 'w0.url_id'
        tablelist = ''
        clauselist = ''
        wordids = []
        
        # Split the words by spaces.
        words = q.split(' ')
        tablenumber = 0
        
        for word in words:
            # Get the word id.
            c = self.conn.cursor()
            wordrow = c.execute('select rowid from wordlist where word=?', (word,)).fetchone()
            if wordrow != None:
                wordid = wordrow[0]
                wordids.append(wordid)
                if tablenumber > 0:
                    tablelist += ','
                    clauselist += ' and '
                    clauselist += f'w{tablenumber-1}.url_id=w{tablenumber}.url_id and '
                fieldlist += f',w{tablenumber}.location'
                tablelist += f'wordlocation w{tablenumber}'
                clauselist += f'w{tablenumber}.word_id = {wordid}'
                tablenumber += 1
        # Create the query from the separate parts.
        fullquery = f'select {fieldlist} from {tablelist} where {clauselist}'
        print(fullquery)
        c = self.conn.cursor()
        res = c.execute(fullquery)
        rows = [row for row in res]
        return rows, wordids
    
    def get_scored_list(self, rows, word_ids):
        total_scores = dict([(row[0], 0) for row in rows])
        
        # Put the scoring function here.
        # weights = []
        weights = [(1.0, self.frequency_score(rows)),
                   (1.5, self.location_score(rows)),
                   (1.0, self.page_rank_score(rows)),
                   (1.0, self.link_text_score(rows, word_ids))]
        
        for (weight, scores) in weights:
            for url in total_scores:
                total_scores[url] += weight * scores[url]
        
        return total_scores

    def get_url_name(self, id):
        return self.conn.cursor().execute('select url from urllist where rowid = ?', (id,)).fetchone()[0]
    
    def query(self, q, n=10):
        rows, word_ids = self.get_match_rows(q)
        scores = self.get_scored_list(rows, word_ids)
        ranked_scores = sorted([(score, url) 
                                for (url, score) 
                                in scores.items()], 
                               reverse=True)

        for (score, url_id) in ranked_scores[0:n]:
            print(f'{score}\t{self.get_url_name(url_id)}')
    
    def normalize_scores(self, scores, small_is_better=False):
        """
        Sometimes a smaller score is better, and vice versa. The normalization
        function will take a dictionary of IDs and scores and return a new dictionary with the same IDs,
        but with score between 0 and 1. Each score is scaled according to how close it is to the best 
        result, which will always have a score of 1.
        """
        vsmall = 0.00001 # Avoid division by zero errors.
        if small_is_better:
            min_score = min(scores.values())
            return dict([(u, float(min_score)/max(vsmall,1))
                        for (u, l) in scores.items()])
        else:
            max_score = max(scores.values())
            if max_score == 0: max_score = vsmall
            return dict([(u, float(c) / max_score) for (u, c) in scores.items()])
    
    def frequency_score(self, rows):
        """
        The word frequency scores a page based on how many times the words in the query appear on that page.
        """
        counts = dict([(row[0], 0) for row in rows])
        
        # Create a dictionary with an entry for each unique url id, and count how many times the item appears.
        for row in rows: counts[row[0]] += 1
            
        # Normalize the scores, in this case, bigger is better (occur more frequently).
        return self.normalize_scores(counts)

    def location_score(self, rows):
        """
        Score the page based on the search term location in the page. 
        If a page is relevant to the search term, it will appear closer to the top.
        The search engine can score results higher if the query term appears early
        in the document."""
        locations = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]: locations[row[0]] = loc
        # The lowest location score (closes to the start) will get a score of 1.
        return self.normalize_scores(locations, small_is_better=True)

    def distance_score(self, rows):
        """
        When a query contains multiple words, 
        it is often useful to seek results in which the words in 
        the query are close to each other in the page.
        """
        # If there are only one word, everyone wins!
        if len(rows[0]) <= 2: return dict([(row[0], 1.0) for row in rows])
        
        # Initialize the dictionary with large values.
        min_distance = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            dist = sum([abs(row[i] - row[i-1])
                       for i in range(2, len(row))])
            if dist < min_distance[row[0]]: min_distance[row[0]] = dist
        # The smaller the distance, means the more similar the results are.
        return self.normalize_scores(min_distance, True)

    def inbound_link_score(self, rows):
        """
        Count the inbound links on the page and use the total number of links as a metric for the page.
        """
        unique_urls = set([row[0] for row in rows])
        inbound_count = dict([(u, self.conn.cursor().execute('select count(*) from link where to_id = ?', (u,)).fetchone()[0]) for u in unique_urls])
        return self.normalize_scores(inbound_count)
    
    def calculate_page_rank(self, iterations=20):
        # Clear out the current page rank tables.
        c = self.conn.cursor()
        c.execute('drop table if exists pagerank')
        c.execute('create table pagerank(url_id primary key, score)')
        
        # Initialize every query with a PageRank of 1.
        c.execute('insert into pagerank select rowid, 1.0 from urllist')
        self.conn.commit()
        
        for i in range(iterations):
            print(f'iteration {i}')
            
            for (url_id,) in c.execute('select rowid from urllist'):
                pr = 0.15
                
                # Loop through all the pages that link to this one.
                for (linker,) in c.execute(
                'select distinct from_id from link where to_id = ?', (url_id,)):
                    # Get the page rank of the linker.
                    linkingpr = c.execute(
                    'select score from pagerank where url_id = ?', (linker,)).fetchone()[0]
                    
                    # Get the total number of links from the linker.
                    linking_count = c.execute(
                    'select count(*) from link where from_id = ?', (linker,)).fetchone()[0]
                    
                    pr + 0.85 * (linkingpr / linking_count)
                
                c.execute(
                'update pagerank set score = ? where url_id = ?', (pr, url_id))
            self.conn.commit()
    
    def page_rank_score(self, rows):
        c = self.conn.cursor()
        page_ranks = dict([(row[0], c.execute('select score from pagerank where url_id = ?', (row[0],)).fetchone()[0]) for row in rows])
        max_rank = max(page_ranks.values())
        normalized_scores = dict([(u, float(1)/max_rank) for (u,l) in page_ranks.items()])
        return normalized_scores
    
    def link_text_score(self, rows, word_ids):
        """
        Score the page based on the text of the links to a page to decide how relevant the page is.
        """
        link_scores = dict([(row[0],0) for row in rows])
        c = self.conn.cursor()
        for word_id in word_ids:
            cur = c.execute('select link.from_id, link.to_id from linkwords, link where word_id = ? and linkwords.link_id = link.rowid', (word_id,))
            
            for (from_id, to_id) in cur:
                if to_id in link_scores:
                    pr = c.execute("""
                    select score 
                    from pagerank 
                    where url_id = ?""", (from_id,)).fetchone()[0]
                    link_scores[to_id] += pr
        max_score = max(link_scores.values())
        vsmall = 0.00001 # Avoid division by zero errors.
        if max_score == 0: max_score = vsmall
        normalized_scores = dict([(u, float(l)/max_score) for (u,l) in link_scores.items()])
        return normalized_scores

    # import nn
    # net = nn.searchnet("nn.db")
#     def nn_score(self, rows, wordids):
#         # Get unique url ids as an ordered list.
#         urlids = [urlid for urlif in set([row[0] for row in rows])]
#         nnres = net.getresult(wordids, urlids)
#         scores = dict([(urlids[i], nnres[i]) for i in range(len(urlids))])
#         return self.normalize_scores(scores)

In [49]:
engine = Searcher('searchindex.db')
# engine.get_match_rows('functional programming')
engine.query('functional programming')

select w0.url_id,w0.location,w1.location from wordlocation w0,wordlocation w1 where w0.word_id = 432 and w0.url_id=w1.url_id and w1.word_id = 1
query.scores {1: 3.1166107382550337, 505: 2.505838926174497, 105: 4.5}
4.5	https://en.wikipedia.org/wiki/Functional_programming
3.1166107382550337	https://en.wikipedia.org/wiki/Programming_language
2.505838926174497	https://en.wikipedia.org/wiki/Objective-C


In [50]:
engine.calculate_page_rank()

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19


In [52]:
engine.query('dynamic programming')

select w0.url_id,w0.location,w1.location from wordlocation w0,wordlocation w1 where w0.word_id = 124 and w0.url_id=w1.url_id and w1.word_id = 1
query.scores {1: 69.5, 505: 68.69885714285714, 105: 69.52176870748299}
69.52176870748299	https://en.wikipedia.org/wiki/Functional_programming
69.5	https://en.wikipedia.org/wiki/Programming_language
68.69885714285714	https://en.wikipedia.org/wiki/Objective-C


In [None]:
[row for row in engine.conn.cursor().execute('select * from wordlist limit 10')]

## Content-based Ranking

- word frequency: the number of times the words in the query appear in the document can help determine how relevant the document is
- document location: the main subject of a document will probably appear near the beginning of the document
- word distance: if there are multiple words in the query, they should appear close together in the document
