In [1]:
# https://resources.oreilly.com/examples/9780596529321/tree/master

class crawler:
    # Initialize the crawler with the name of database.
    def __init__(self, db):
        pass
    
    def __del__(self):
        pass

    def commit(self):
        pass
    
    # Auxialliary function for getting an entry id and adding it if it is not present.
    def get_entry_id(self, table, field, value, create_new = True):
        return None
    
    # Index an individual page.
    def add_to_index(self, url, soup):
        print(f'indexing {url}')
    
    # Extract the text from the HTML page (no tags).
    def get_text_only(self, soup):
        return None
    
    # Separate the words by non-whitespace character.
    def separate_words(self, text):
        return None
    
    # Return true if this url is already indexed.
    def is_indexed(self, url):
        return False
    
    # Add a link between pages.
    def add_link_ref(self, url_from, url_to, link_text):
        pass

    # Starting with a list of pages, do a breadth first search to the given depth,
    # indexing pages as we go.
    def crawl(self, pages, depth = 2):
        pass
    
    # Create the database tables.
    def create_index_tables(self):
        pass

In [2]:
import urllib3
http = urllib3.PoolManager()
url = 'https://en.wikipedia.org/wiki/Programming_language'
# 'http://kiwitobes.com/wiki/Programming_language.html'
r = http.request('GET', url)
r.status, r.data[:500]



(200,
 b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Programming language - Wikipedia</title>\n<script>document.documentElement.className=document.documentElement.className.replace(/(^|\\s)client-nojs(\\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Programming_language","wgTitle":"Programming language","wgCurRevisionId":900725449,"wgRevisionId":900725449,"wgArticleId":23015')

In [3]:
import nltk
stopwords = nltk.corpus.stopwords.words('english')
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [4]:
import re
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import sqlite3

http = urllib3.PoolManager()
url = 'https://en.wikipedia.org/wiki/Programming_language'
# pip3 install beautifulsoup4

class Crawler:
    # Initialize the crawler with the name of database.
    def __init__(self, db):
        self.conn = sqlite3.connect(db)
    
    def __del__(self):
        self.conn.close()

    def commit(self):
        self.conn.commit()
    
    # Auxialliary function for getting an entry id and adding it if it is not present.
    def get_entry_id(self, table, field, value, create_new = True):
        c = self.conn.cursor()
        res = c.execute(f'select rowid from {table} where {field} = ?', (value,)).fetchone()
        if res == None:
            stmt = f'insert into {table} ({field}) values (?)'
            r = self.conn.execute(stmt, (value,))
            return r.lastrowid
        else:
            return res[0]
    
    # Index an individual page.
    def add_to_index(self, url, soup):
        if self.is_indexed(url): return
        print(f'indexing {url}')
        
        # Get the individual words.
        text = self.get_text_only(soup)
        words = self.separate_words(text)
        
        # Get the URL id.
        url_id = self.get_entry_id('urllist', 'url', url)
        
        # Link each word to this url.
        for i in range(len(words)):
            word = words[i]
            if word in stopwords: continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.conn.execute('insert into wordlocation(url_id, word_id, location) values (?, ?, ?)', (url_id, word_id, i))
    
    # Extract the text from the HTML page (no tags).
    def get_text_only(self, soup):
        return soup.get_text()
#         v = soup.string
#         if v == None:
#             c = soup.contents
#             result_text = ''
#             for t in c:
#                 subtext = self.get_text_only(t)
#                 result_text += subtext + '\n'
#             return result_text
#         else:
#             return v.strip()
    
    # Separate the words by non-whitespace character.
    def separate_words(self, text):
        # splitter = re.compile('\\W*')
        splitter=re.compile('\W+')
        return [s.lower() for s in splitter.split(text) if s != '']
    
    # Return true if this url is already indexed.
    def is_indexed(self, url):
        cur = self.conn.cursor()
        res = cur.execute('select url from urllist where url = ?', (url,)).fetchone()
        if res != None:
            # Check if it has actually been crawled.
            cur = self.conn.cursor()
            c = cur.execute('select * from wordlocation where url_id = ?', (res[0],))
            if c != None:
                res = c.fetchone()
                if res != None: return True
        return False
    
    # Add a link between pages.
    def add_link_ref(self, url_from, url_to, link_text):
        words = self.separate_words(link_text)
        from_id = self.get_entry_id('urllist', 'url', url_from)
        to_id = self.get_entry_id('urllist', 'url', url_to)
        if from_id == to_id: return
        c = self.conn.cursor()
        res = c.execute('insert into link(from_id,to_id) values (?, ?)', (from_id, to_id))
        link_id = res.lastrowid
        for word in words:
            if word in stopwords: continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.conn.execute('insert into linkwords(link_id, word_id) values (?, ?)', (link_id, word_id))
    # Starting with a list of pages, do a breadth first search to the given depth,
    # indexing pages as we go.
    def crawl(self, pages, depth = 2):
        for i in range(depth):
            new_pages = set()
            for page in pages:
                try:
                    r = http.request('GET', page)
                    # r.status, r.data 
                    print(f'loaded page {page} {r.status}')
                except:
                    print(f'could not open page {page}')
                    continue
                soup = BeautifulSoup(r.data, 'html.parser')
                self.add_to_index(page, soup)
                
                links = soup.find_all('a')
                for link in links:
                    url = urljoin(page, link.get('href'))
                    # if url.find("'") != -1: continue
                    url = url.split('#')[0] # Remove location portion.
                    if url[0:5] == 'https' and not self.is_indexed(url):
                        print(f'indexing {url}')
                        new_pages.add(url)
                    link_text = self.get_text_only(link)
                    self.add_link_ref(page, url, link_text)
                self.commit()
            pages = new_pages
                
    # Create the database tables.
    def create_index_tables(self):
        self.conn.execute('create table if not exists urllist(url)')
        self.conn.execute('create table if not exists wordlist(word)')
        self.conn.execute('create table if not exists wordlocation(url_id, word_id, location)')
        self.conn.execute('create table if not exists link(from_id integer, to_id integer)')
        self.conn.execute('create table if not exists linkwords(word_id, link_id)')
        self.conn.execute('create index if not exists wordidx on wordlist(word)')
        self.conn.execute('create index if not exists urlidx on urllist(url)')
        self.conn.execute('create index if not exists wordurlidx on wordlocation(word_id)')
        self.conn.execute('create index if not exists urltoidx on link(to_id)')
        self.conn.execute('create index if not exists urlfromidx on link(from_id)')
        self.commit()

In [5]:
crawler = Crawler('searchindex.db')
crawler.create_index_tables()

In [6]:
%skip
crawler.crawl([
    'https://en.wikipedia.org/wiki/Programming_language',
    'https://en.wikipedia.org/wiki/Categorical_list_of_programming_languages.html',
    'https://en.wikipedia.org/wiki/Functional_programming'
])

UsageError: Line magic function `%skip` not found.


In [7]:
[row for row in crawler.conn.execute('select rowid from wordlocation where word_id = 1')]

[(1,),
 (25,),
 (118,),
 (121,),
 (461,),
 (489,),
 (497,),
 (508,),
 (512,),
 (515,),
 (528,),
 (580,),
 (587,),
 (606,),
 (622,),
 (745,),
 (757,),
 (770,),
 (776,),
 (811,),
 (823,),
 (835,),
 (840,),
 (847,),
 (857,),
 (866,),
 (880,),
 (926,),
 (941,),
 (946,),
 (988,),
 (1003,),
 (1032,),
 (1037,),
 (1041,),
 (1063,),
 (1078,),
 (1087,),
 (1101,),
 (1175,),
 (1184,),
 (1213,),
 (1217,),
 (1225,),
 (1293,),
 (1317,),
 (1399,),
 (1425,),
 (1456,),
 (1498,),
 (1501,),
 (1542,),
 (1545,),
 (1551,),
 (1574,),
 (1584,),
 (1599,),
 (1612,),
 (1626,),
 (1632,),
 (1646,),
 (1648,),
 (1667,),
 (1676,),
 (1681,),
 (1695,),
 (1702,),
 (1722,),
 (1748,),
 (1774,),
 (1803,),
 (1836,),
 (1838,),
 (1862,),
 (1866,),
 (1880,),
 (1883,),
 (1899,),
 (1930,),
 (1950,),
 (1956,),
 (1973,),
 (2012,),
 (2233,),
 (2240,),
 (2396,),
 (2417,),
 (2478,),
 (2492,),
 (2512,),
 (2624,),
 (2854,),
 (3030,),
 (3115,),
 (3246,),
 (3281,),
 (3315,),
 (3318,),
 (3337,),
 (3349,),
 (3415,),
 (3426,),
 (3434,),
 (34

In [31]:
class Searcher:
    def __init__(self, db):
        self.conn = sqlite3.connect(db)
    
    def __del__(self):
        self.conn.close()
    
    def commit(self):
        self.conn.commit()
    
    def get_match_rows(self, q):
        # Strings to build the query.
        fieldlist = 'w0.url_id'
        tablelist = ''
        clauselist = ''
        wordids = []
        
        # Split the words by spaces.
        words = q.split(' ')
        tablenumber = 0
        
        for word in words:
            # Get the word id.
            c = self.conn.cursor()
            wordrow = c.execute('select rowid from wordlist where word=?', (word,)).fetchone()
            if wordrow != None:
                wordid = wordrow[0]
                wordids.append(wordid)
                if tablenumber > 0:
                    tablelist += ','
                    clauselist += ' and '
                    clauselist += f'w{tablenumber-1}.url_id=w{tablenumber}.url_id and '
                fieldlist += f',w{tablenumber}.location'
                tablelist += f'wordlocation w{tablenumber}'
                clauselist += f'w{tablenumber}.word_id = {wordid}'
                tablenumber += 1
        # Create the query from the separate parts.
        fullquery = f'select {fieldlist} from {tablelist} where {clauselist}'
        print(fullquery)
        c = self.conn.cursor()
        res = c.execute(fullquery)
        rows = [row for row in res]
        return rows, wordids
    
    def get_scored_list(self, rows, word_ids):
        total_scores = dict([(row[0], 0) for row in rows])
        
        # Put the scoring function here.
        # weights = []
        weights = [(1.0, self.frequency_score(rows)),
                   (1.5, self.location_score(rows)),
                   (1.0, self.page_rank_score(rows)),
                   (1.0, self.link_text_score(rows, word_ids))]
        
        for (weight, scores) in weights:
            for url in total_scores:
                total_scores[url] += weight * scores[url]
        
        return total_scores

    def get_url_name(self, id):
        return self.conn.cursor().execute('select url from urllist where rowid = ?', (id,)).fetchone()[0]
    
    def query(self, q):
        rows, word_ids = self.get_match_rows(q)
        scores = self.get_scored_list(rows, word_ids)
        ranked_scores = sorted([(score, url) for (url, score) in scores.items()], reverse=True)
        for (score, url_id) in ranked_scores[0:10]:
            print(f'{score}\t{self.get_url_name(url_id)}')
    
    def normalize_scores(self, scores, small_is_better=False):
        """
        Sometimes a smaller score is better, and vice versa. The normalization
        function will take a dictionary of IDs and scores and return a new dictionary with the same IDs,
        but with score between 0 and 1. Each score is scaled according to how close it is to the best 
        result, which will always have a score of 1.
        """
        vsmall = 0.00001 # Avoid division by zero errors.
        if small_is_better:
            min_score = min(scores.values())
            return dict([(u, float(min_score)/max(vsmall,1))
                        for (u, l) in scores.items()])
        else:
            max_score = max(scores.values())
            if max_score == 0: max_score = vsmall
            return dict([(u, float(c) / max_score) for (u, c) in scores.items()])
    
    def frequency_score(self, rows):
        """
        The word frequency scores a page based on how many times the words in the query appear on that page.
        """
        counts = dict([(row[0], 0) for row in rows])
        
        # Create a dictionary with an entry for each unique url id, and count how many times the item appears.
        for row in rows: counts[row[0]] += 1
            
        # Normalize the scores, in this case, bigger is better (occur more frequently).
        return self.normalize_scores(counts)

    def location_score(self, rows):
        """
        Score the page based on the search term location in the page. 
        If a page is relevant to the search term, it will appear closer to the top.
        The search engine can score results higher if the query term appears early
        in the document."""
        locations = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            loc = sum(row[1:])
            if loc < locations[row[0]]: locations[row[0]] = loc
        # The lowest location score (closes to the start) will get a score of 1.
        return self.normalize_scores(locations, small_is_better=True)

    def distance_score(self, rows):
        """
        When a query contains multiple words, 
        it is often useful to seek results in which the words in 
        the query are close to each other in the page.
        """
        # If there are only one word, everyone wins!
        if len(rows[0]) <= 2: return dict([(row[0], 1.0) for row in rows])
        
        # Initialize the dictionary with large values.
        min_distance = dict([(row[0], 1000000) for row in rows])
        for row in rows:
            dist = sum([abs(row[i] - row[i-1])
                       for i in range(2, len(row))])
            if dist < min_distance[row[0]]: min_distance[row[0]] = dist
        # The smaller the distance, means the more similar the results are.
        return self.normalize_scores(min_distance, True)

    def inbound_link_score(self, rows):
        """
        Count the inbound links on the page and use the total number of links as a metric for the page.
        """
        unique_urls = set([row[0] for row in rows])
        inbound_count = dict([(u, self.conn.cursor().execute('select count(*) from link where to_id = ?', (u,)).fetchone()[0]) for u in unique_urls])
        return self.normalize_scores(inbound_count)
    
    def calculate_page_rank(self, iterations=20):
        # Clear out the current page rank tables.
        c = self.conn.cursor()
        c.execute('drop table if exists pagerank')
        c.execute('create table pagerank(url_id primary key, score)')
        
        # Initialize every query with a PageRank of 1.
        c.execute('insert into pagerank select rowid, 1.0 from urllist')
        self.commit()
        
        for i in range(iterations):
            print(f'iteration {i}')
            
            for (url_id,) in c.execute('select rowid from urllist'):
                pr = 0.15
                
                # Loop through all the pages that link to this one.
                for (linker,) in c.execute(
                'select distinct from_id from link where to_id = ?', (url_id,)):
                    # Get the page rank of the linker.
                    linkingpr = c.execute(
                    'select score from pagerank where url_id = ?', (linker,)).fetchone()[0]
                    
                    # Get the total number of links from the linker.
                    linking_count = c.execute(
                    'select count(*) from link where from_id = ?', (linker,)).fetchone()[0]
                    
                    pr + 0.85 * (linkingpr / linking_count)
                
                c.execute(
                'update pagerank set score = ? where url_id = ?', (pr, url_id))
            self.commit()
    
    def page_rank_score(self, rows):
        c = self.conn.cursor()
        page_ranks = dict([(row[0], c.execute('select score from pagerank where url_id = ?', (row[0],)).fetchone()[0]) for row in rows])
        max_rank = max(page_ranks.values())
        normalized_scores = dict([(u, float(1)/max_rank) for (u,l) in page_ranks.items()])
        return normalized_scores
    
    def link_text_score(self, rows, word_ids):
        """
        Score the page based on the text of the links to a page to decide how relevant the page is.
        """
        link_scores = dict([(row[0],0) for row in rows])
        c = self.conn.cursor()
        for word_id in word_ids:
            cur = c.execute('select link.from_id, link.to_id from linkwords, link where word_id = ? and linkwords.link_id = link.rowid', (word_id,))
            
            for (from_id, to_id) in cur:
                if to_id in link_scores:
                    pr = c.execute("""
                    select score 
                    from pagerank 
                    where url_id = ?""", (from_id,)).fetchone()[0]
                    link_scores[to_id] += pr
        max_score = max(link_scores.values())
        normalized_scores = dict([(u, float(l)/max_score) for (u,l) in link_scores.items()])
        return normalized_scores

    # import nn
    # net = nn.searchnet("nn.db")
#     def nn_score(self, rows, wordids):
#         # Get unique url ids as an ordered list.
#         urlids = [urlid for urlif in set([row[0] for row in rows])]
#         nnres = net.getresult(wordids, urlids)
#         scores = dict([(urlids[i], nnres[i]) for i in range(len(urlids))])
#         return self.normalize_scores(scores)

In [32]:
engine = Searcher('searchindex.db')
# engine.get_match_rows('functional programming')
engine.query('functional programming')

select w0.url_id,w0.location,w1.location from wordlocation w0,wordlocation w1 where w0.word_id = 706 and w0.url_id=w1.url_id and w1.word_id = 1
4.5	https://en.wikipedia.org/wiki/Functional_programming
2.882946001367054	https://en.wikipedia.org/wiki/Programming_language
2.508971291866029	https://en.wikipedia.org/w/index.php?title=Functional_programming&action=edit&section=33
2.5019224196855774	https://web.archive.org/web/20100715042920/http://www.math.grin.edu/~rebelsky/Courses/CS302/99S/Outlines/outline.02.html


In [33]:
engine.calculate_page_rank()

OperationalError: database is locked

In [None]:
[row for row in engine.conn.cursor().execute('select * from wordlist limit 10')]

## Content-based Ranking

- word frequency: the number of times the words in the query appear in the document can help determine how relevant the document is
- document location: the main subject of a document will probably appear near the beginning of the document
- word distance: if there are multiple words in the query, they should appear close together in the document
