In [4]:
# https://resources.oreilly.com/examples/9780596529321/tree/master

class crawler:
    # Initialize the crawler with the name of database.
    def __init__(self, db):
        pass
    
    def __del__(self):
        pass

    def commit(self):
        pass
    
    # Auxialliary function for getting an entry id and adding it if it is not present.
    def get_entry_id(self, table, field, value, create_new = True):
        return None
    
    # Index an individual page.
    def add_to_index(self, url, soup):
        print(f'indexing {url}')
    
    # Extract the text from the HTML page (no tags).
    def get_text_only(self, soup):
        return None
    
    # Separate the words by non-whitespace character.
    def separate_words(self, text):
        return None
    
    # Return true if this url is already indexed.
    def is_indexed(self, url):
        return False
    
    # Add a link between pages.
    def add_link_ref(self, url_from, url_to, link_text):
        pass

    # Starting with a list of pages, do a breadth first search to the given depth,
    # indexing pages as we go.
    def crawl(self, pages, depth = 2):
        pass
    
    # Create the database tables.
    def create_index_tables(self):
        pass

In [5]:
import urllib3
http = urllib3.PoolManager()
url = 'https://en.wikipedia.org/wiki/Programming_language'
# 'http://kiwitobes.com/wiki/Programming_language.html'
r = http.request('GET', url)
r.status, r.data[:500]



(200,
 b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Programming language - Wikipedia</title>\n<script>document.documentElement.className=document.documentElement.className.replace(/(^|\\s)client-nojs(\\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Programming_language","wgTitle":"Programming language","wgCurRevisionId":900725449,"wgRevisionId":900725449,"wgArticleId":23015')

In [6]:
import nltk
stopwords = nltk.corpus.stopwords.words('english')
stopwords[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [7]:
import re
import urllib3
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import sqlite3

http = urllib3.PoolManager()
url = 'https://en.wikipedia.org/wiki/Programming_language'
# pip3 install beautifulsoup4

class Crawler:
    # Initialize the crawler with the name of database.
    def __init__(self, db):
        self.conn = sqlite3.connect(db)
    
    def __del__(self):
        self.conn.close()

    def commit(self):
        self.conn.commit()
    
    # Auxialliary function for getting an entry id and adding it if it is not present.
    def get_entry_id(self, table, field, value, create_new = True):
        c = self.conn.cursor()
        res = c.execute(f'select rowid from {table} where {field} = ?', (value,)).fetchone()
        if res == None:
            stmt = f'insert into {table} ({field}) values (?)'
            r = self.conn.execute(stmt, (value,))
            return r.lastrowid
        else:
            return res[0]
    
    # Index an individual page.
    def add_to_index(self, url, soup):
        if self.is_indexed(url): return
        print(f'indexing {url}')
        
        # Get the individual words.
        text = self.get_text_only(soup)
        words = self.separate_words(text)
        
        # Get the URL id.
        url_id = self.get_entry_id('urllist', 'url', url)
        
        # Link each word to this url.
        for i in range(len(words)):
            word = words[i]
            if word in stopwords: continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.conn.execute('insert into wordlocation(url_id, word_id, location) values (?, ?, ?)', (url_id, word_id, i))
    
    # Extract the text from the HTML page (no tags).
    def get_text_only(self, soup):
        return soup.get_text()
#         v = soup.string
#         if v == None:
#             c = soup.contents
#             result_text = ''
#             for t in c:
#                 subtext = self.get_text_only(t)
#                 result_text += subtext + '\n'
#             return result_text
#         else:
#             return v.strip()
    
    # Separate the words by non-whitespace character.
    def separate_words(self, text):
        # splitter = re.compile('\\W*')
        splitter=re.compile('\W+')
        return [s.lower() for s in splitter.split(text) if s != '']
    
    # Return true if this url is already indexed.
    def is_indexed(self, url):
        cur = self.conn.cursor()
        res = cur.execute('select url from urllist where url = ?', (url,)).fetchone()
        if res != None:
            # Check if it has actually been crawled.
            cur = self.conn.cursor()
            c = cur.execute('select * from wordlocation where url_id = ?', (res[0],))
            if c != None:
                res = c.fetchone()
                if res != None: return True
        return False
    
    # Add a link between pages.
    def add_link_ref(self, url_from, url_to, link_text):
        words = self.separate_words(link_text)
        from_id = self.get_entry_id('urllist', 'url', url_from)
        to_id = self.get_entry_id('urllist', 'url', url_to)
        if from_id == to_id: return
        c = self.conn.cursor()
        res = c.execute('insert into link(from_id,to_id) values (?, ?)', (from_id, to_id))
        link_id = res.lastrowid
        for word in words:
            if word in stopwords: continue
            word_id = self.get_entry_id('wordlist', 'word', word)
            self.conn.execute('insert into linkwords(link_id, word_id) values (?, ?)', (link_id, word_id))
    # Starting with a list of pages, do a breadth first search to the given depth,
    # indexing pages as we go.
    def crawl(self, pages, depth = 2):
        for i in range(depth):
            new_pages = set()
            for page in pages:
                try:
                    r = http.request('GET', page)
                    # r.status, r.data 
                    print(f'loaded page {page} {r.status}')
                except:
                    print(f'could not open page {page}')
                    continue
                soup = BeautifulSoup(r.data, 'html.parser')
                self.add_to_index(page, soup)
                
                links = soup.find_all('a')
                for link in links:
                    url = urljoin(page, link.get('href'))
                    # if url.find("'") != -1: continue
                    url = url.split('#')[0] # Remove location portion.
                    if url[0:5] == 'https' and not self.is_indexed(url):
                        print(f'indexing {url}')
                        new_pages.add(url)
                    link_text = self.get_text_only(link)
                    self.add_link_ref(page, url, link_text)
                self.commit()
            pages = new_pages
                
    # Create the database tables.
    def create_index_tables(self):
        self.conn.execute('create table if not exists urllist(url)')
        self.conn.execute('create table if not exists wordlist(word)')
        self.conn.execute('create table if not exists wordlocation(url_id, word_id, location)')
        self.conn.execute('create table if not exists link(from_id integer, to_id integer)')
        self.conn.execute('create table if not exists linkwords(word_id, link_id)')
        self.conn.execute('create index if not exists wordidx on wordlist(word)')
        self.conn.execute('create index if not exists urlidx on urllist(url)')
        self.conn.execute('create index if not exists wordurlidx on wordlocation(word_id)')
        self.conn.execute('create index if not exists urltoidx on link(to_id)')
        self.conn.execute('create index if not exists urlfromidx on link(from_id)')
        self.commit()

In [8]:
crawler = Crawler('searchindex.db')
crawler.create_index_tables()
crawler.crawl([
    'https://en.wikipedia.org/wiki/Programming_language',
    'https://en.wikipedia.org/wiki/Categorical_list_of_programming_languages.html',
    'https://en.wikipedia.org/wiki/Functional_programming'
])



loaded page https://en.wikipedia.org/wiki/Programming_language 200
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Wikipedia:Protection_policy
indexing https://en.wikipedia.org/wiki/Wikipedia:Pending_changes
indexing https://en.wikipedia.org/w/index.php?title=Special:Log&type=review&page=Programming_language
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/File:C_Hello_World_Program.png
indexing https://en.wikipedia.org/wiki/File:C_Hello_World_Program.png
indexing https://en.wikipedia.org/wiki/Source_code
indexing https://en.wikipedia.org/wiki/C_(programming_language)
indexing https://en.wikipedia.org/wiki/Compiled
indexing https://en.wikipedia.org/wiki/%22Hello,_World!%22_program
indexing https://en.wikipedia.org/wiki/Formal_language
indexing https://en.wikipedia.org/

indexing https://en.wikipedia.org/wiki/C_(programming_language)
indexing https://en.wikipedia.org/wiki/Unix
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Prolog
indexing https://en.wikipedia.org/wiki/Logic_programming
indexing https://en.wikipedia.org/wiki/ML_(programming_language)
indexing https://en.wikipedia.org/wiki/Lisp_(programming_language)
indexing https://en.wikipedia.org/wiki/Type_system
indexing https://en.wikipedia.org/wiki/Functional_programming
indexing https://en.wikipedia.org/wiki/Structured_programming
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Edsger_W._Dijkstra
indexing https://en.wikipedia.org/wiki/Communications_of_the_ACM
indexing https://en.wikipedia.org/wiki/Goto
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/w/index.php?title=Programming_language&action=edit&section=5
indexing https://en.wikipedia.org/wiki/F

indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Constructed_languages
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Programmer
indexing https://en.wikipedia.org/wiki/Microcontroller
indexing https://en.wikipedia.org/wiki/Supercomputer
indexing https://en.wikipedia.org/wiki/Abstraction_(computer_science)
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Natural_language_programming
indexing https://en.wikipedia.org/wiki/Edsger_W._Dijkstra
indexing https://en.wikipedia.org/wiki/Natural_language_programming
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Alan_Perlis
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Structured_English
indexing https://en.wikipedia.org/wiki/SQL
index

indexing https://en.wikipedia.org/wiki/W3C
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/International_Standard_Book_Number
indexing https://en.wikipedia.org/wiki/Special:BookSources/978-0-07-222942-4
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/International_Standard_Book_Number
indexing https://en.wikipedia.org/wiki/Special:BookSources/978-0-7645-8845-7
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://web.archive.org/web/20110511192712/http://www.ibm.com/developerworks/library/x-xslt/
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://web.archive.org/web/20110203015119/http://msdn.microsoft.com/en-us/library/ms767587(VS.85).aspx
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Morgan_Kaufmann
indexing https://en.wikipedia.org/wiki/International_Standard_Book_Number
indexing ht

indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://web.archive.org/web/20060107162045/http://www.levenez.com/lang/
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://web.archive.org/web/20090903084542/http://www.cs.cornell.edu/info/Projects/Nuprl/cs611/fall94notes/cn2/subsection3_1_3.html
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://web.archive.org/web/20120426010947/http://www.encyclopediaofmath.org/index.php?title=PL%2FI&oldid=19175
indexing https://en.wikipedia.org/wiki/Ada_(programming_language)
indexing https://en.wikipedia.org/wiki/UNCOL
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://web.archive.org/web/20080120201526/http://www.cs.utexas.edu/users/EWD/transcriptions/EWD06xx/EWD667.html
indexing https://en.wikipedia.org/wiki/Wayback_Machine
indexing https://en.wikipedia.org/wiki/Programming_language
indexing 

indexing https://en.wikipedia.org/wiki/Programming_language
indexing https://en.wikipedia.org/wiki/Query_language
indexing https://en.wikipedia.org/wiki/Shading_language
indexing https://en.wikipedia.org/wiki/Specification_language
indexing https://en.wikipedia.org/wiki/Style_sheet_language
indexing https://en.wikipedia.org/wiki/Transformation_language
indexing https://en.wikipedia.org/wiki/Category:Lists_of_computer_languages
indexing https://en.wikipedia.org/wiki/Category:Lists_of_programming_languages
indexing https://en.wikipedia.org/wiki/List_of_programming_languages_by_type
indexing https://en.wikipedia.org/wiki/Help:Authority_control
indexing https://www.wikidata.org/wiki/Q9143
indexing https://en.wikipedia.org/wiki/Biblioth%C3%A8que_nationale_de_France
indexing https://catalogue.bnf.fr/ark:/12148/cb13318353n
indexing https://data.bnf.fr/ark:/12148/cb13318353n
indexing https://en.wikipedia.org/wiki/Integrated_Authority_File
indexing https://d-nb.info/gnd/4047409-4
indexing https

indexing https://io.wikipedia.org/wiki/Programifo-lingui
indexing https://ilo.wikipedia.org/wiki/Lengguahe_ti_panangprograma
indexing https://id.wikipedia.org/wiki/Bahasa_pemrograman
indexing https://ia.wikipedia.org/wiki/Linguage_de_programmation
indexing https://ie.wikipedia.org/wiki/Lingua_de_programmation
indexing https://is.wikipedia.org/wiki/Forritunarm%C3%A1l
indexing https://it.wikipedia.org/wiki/Linguaggio_di_programmazione
indexing https://he.wikipedia.org/wiki/%D7%A9%D7%A4%D7%AA_%D7%AA%D7%9B%D7%A0%D7%95%D7%AA
indexing https://jv.wikipedia.org/wiki/Basa_pamrograman
indexing https://ka.wikipedia.org/wiki/%E1%83%9E%E1%83%A0%E1%83%9D%E1%83%92%E1%83%A0%E1%83%90%E1%83%9B%E1%83%98%E1%83%A0%E1%83%94%E1%83%91%E1%83%98%E1%83%A1_%E1%83%94%E1%83%9C%E1%83%90
indexing https://kk.wikipedia.org/wiki/%D0%91%D0%B0%D2%93%D0%B4%D0%B0%D1%80%D0%BB%D0%B0%D0%BC%D0%B0%D0%BB%D0%B0%D1%83_%D1%82%D1%96%D0%BB%D1%96
indexing https://ky.wikipedia.org/wiki/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0



loaded page https://en.wikipedia.org/wiki/Categorical_list_of_programming_languages.html 404
indexing https://en.wikipedia.org/wiki/Categorical_list_of_programming_languages.html
indexing https://en.wikipedia.org/wiki/Categorical_list_of_programming_languages.html
indexing https://en.wikipedia.org/wiki/Categorical_list_of_programming_languages.html
indexing https://en.wikipedia.org/wiki/Categorical_list_of_programming_languages.html
indexing https://en.wikipedia.org/wiki/Special:SiteMatrix
indexing https://en.wikipedia.org/wiki/File:Wiktionary-logo-v2.svg
indexing https://en.wiktionary.org/wiki/Special:Search/Categorical_list_of_programming_languages.html
indexing https://en.wikipedia.org/wiki/File:Wikibooks-logo.svg
indexing https://en.wikibooks.org/wiki/Special:Search/Categorical_list_of_programming_languages.html
indexing https://en.wikipedia.org/wiki/File:Wikiquote-logo.svg
indexing https://en.wikiquote.org/wiki/Special:Search/Categorical_list_of_programming_languages.html
indexing



indexing https://en.wikipedia.org/wiki/Functional_programming


KeyboardInterrupt: 

In [9]:
[row for row in crawler.conn.execute('select rowid from wordlocation where word_id = 1')]

[(1,),
 (25,),
 (118,),
 (121,),
 (461,),
 (489,),
 (497,),
 (508,),
 (512,),
 (515,),
 (528,),
 (580,),
 (587,),
 (606,),
 (622,),
 (745,),
 (757,),
 (770,),
 (776,),
 (811,),
 (823,),
 (835,),
 (840,),
 (847,),
 (857,),
 (866,),
 (880,),
 (926,),
 (941,),
 (946,),
 (988,),
 (1003,),
 (1032,),
 (1037,),
 (1041,),
 (1063,),
 (1078,),
 (1087,),
 (1101,),
 (1175,),
 (1184,),
 (1213,),
 (1217,),
 (1225,),
 (1293,),
 (1317,),
 (1399,),
 (1425,),
 (1456,),
 (1498,),
 (1501,),
 (1542,),
 (1545,),
 (1551,),
 (1574,),
 (1584,),
 (1599,),
 (1612,),
 (1626,),
 (1632,),
 (1646,),
 (1648,),
 (1667,),
 (1676,),
 (1681,),
 (1695,),
 (1702,),
 (1722,),
 (1748,),
 (1774,),
 (1803,),
 (1836,),
 (1838,),
 (1862,),
 (1866,),
 (1880,),
 (1883,),
 (1899,),
 (1930,),
 (1950,),
 (1956,),
 (1973,),
 (2012,),
 (2233,),
 (2240,),
 (2396,),
 (2417,),
 (2478,),
 (2492,),
 (2512,),
 (2624,),
 (2854,),
 (3030,),
 (3115,),
 (3246,),
 (3281,),
 (3315,),
 (3318,),
 (3337,),
 (3349,),
 (3415,),
 (3426,),
 (3434,),
 (34

In [12]:
class Searcher:
    def __init__(self, db):
        self.conn = sqlite3.connect(db)
    
    def __del__(self):
        self.conn.close()
    
    def get_match_rows(self, q):
        # Strings to build the query.
        fieldlist = 'w0.url_id'
        tablelist = ''
        clauselist = ''
        wordids = []
        
        # Split the words by spaces.
        words = q.split(' ')
        tablenumber = 0
        
        for word in words:
            # Get the word id.
            c = self.conn.cursor()
            wordrow = c.execute('select rowid from wordlist where word=?', (word,)).fetchone()
            if wordrow != None:
                wordid = wordrow[0]
                wordids.append(wordid)
                if tablenumber > 0:
                    tablelist += ','
                    clauselist += ' and '
                    clauselist += f'w{tablenumber-1}.url_id=w{tablenumber}.url_id and '
                fieldlist += f',w{tablenumber}.location'
                tablelist += f'wordlocation w{tablenumber}'
                clauselist += f'w{tablenumber}.word_id = {wordid}'
                tablenumber += 1
        # Create the query from the separate parts.
        fullquery = f'select {fieldlist} from {tablelist} where {clauselist}'
        print(fullquery)
        c = self.conn.cursor()
        res = c.execute(fullquery)
        rows = [row for row in res]
        return rows, wordids
    
    def get_scored_list(self, rows, word_ids):
        total_scores = dict([(row[0], 0) for row in rows])
        
        # Put the scoring function here.
        weights = []
        
        for (weight, scores) in weights:
            for url in total_scores:
                total_scores[url] += weight * scores[url]
        
        return total_scores

    def get_url_name(self, id):
        return self.conn.cursor().execute('select url from urllist where rowid = ?', (id,)).fetchone()[0]
    
    def query(self, q):
        rows, word_ids = self.get_match_rows(q)
        scores = self.get_scored_list(rows, word_ids)
        ranked_scores = sorted([(score, url) for (url, score) in scores.items()], reverse=True)
        for (score, url_id) in ranked_scores[0:10]:
            print(f'{score}\t{self.get_url_name(url_id)}')

In [13]:
engine = Searcher('searchindex.db')
# engine.get_match_rows('functional programming')
engine.query('functional programming')

select w0.url_id,w0.location,w1.location from wordlocation w0,wordlocation w1 where w0.word_id = 706 and w0.url_id=w1.url_id and w1.word_id = 1
0	https://en.wikipedia.org/w/index.php?title=Functional_programming&action=edit&section=33
0	https://web.archive.org/web/20100715042920/http://www.math.grin.edu/~rebelsky/Courses/CS302/99S/Outlines/outline.02.html
0	https://en.wikipedia.org/wiki/Functional_programming
0	https://en.wikipedia.org/wiki/Programming_language


In [None]:
[row for row in engine.conn.cursor().execute('select * from wordlist limit 10')]

## Content-based Ranking

- word frequency: the number of times the words in the query appear in the document can help determine how relevant the document is
- document location: the main subject of a document will probably appear near the beginning of the document
- word distance: if there are multiple words in the query, they should appear close together in the document
