In [1]:
#search engine stuff
#this is defined at the very start of the chapter and then added to
#during the chapter

import urllib
import urllib.request
import sqlite3 as sql
import os
import re

class Crawler:
    #initialise with name of the db
    def __init__(self, db_name):
        self.con = sql.connect(db_name)
    
    def __del__(self):
        self.con.close()
    
    def db_commit(self):
        self.con.commit()
    
    def get_entry_id(self, table, field, value, create_new=True):
        #function for getting entry id and adding if not present
        cur = self.con.execute("select rowid from %s where %s='%s'" % (table,field,value))
        res = cur.fetchone()
        if res == None:
            cur = self.con.execute("insert into %s (%s) values ('%s')" % (table,field,value))
            return cur.lastrowid
        else:
            return res[0]
    
    def add_to_index(self, url, soup):
        #index an individual page
        if self.is_indexed(url):
            return
        print("Indexing %s" % url)
        
        #get individual words
        text = self.get_text_only(soup)
        words = self.separate_words(text)
        
        #get url id
        url_id = self.get_entry_id('url_list', 'url', url)
        
        #link each word to this url
        for i, w in enumerate(words):
            word = w
            if word in ignore_words:
                continue
            word_id = self.get_entry_id('word_list', 'word', word)
            self.con.execute('insert into word_location(url_id,word_id,location) values (%d,%d,%d)' % (url_id,word_id,i))
        
    def get_text_only(self, soup):
        #extract text from a HTML page (no tags)
        v = soup.string
        if v == None:
            c = soup.contents
            result_text = ''
            for t in c:
                sub_text = self.get_text_only(t)
                result_text += sub_text + '\n'
            return result_text
        else:
            return v.strip()
    
    def separate_words(self, text):
        #separate words by any non-whitespace character
        splitter = re.compile('\\W*')
        return [s.lower() for s in splitter.split(text) if s != '']
        return None
    
    def is_indexed(self, url):
        #return true if this url is already indexed
        u = self.con.execute("select rowid from url_list where url='%s'" % url).fetchone()
        if u != None:
            #check if it has actually been crawled
            v = self.con.execute("select * from word_location where url_id=%d" % u[0]).fetchone()
            if v != None:
                return True
        return False
    
    def add_link_ref(self, url_from, url_to, link_text):
        #add link between two pages
        pass
    
    def crawl(self, pages, depth=2):
        #starting with list of pages do BFS to depth given
        #indexing pages as we go
        for i in range(depth):
            new_pages = set()
            for page in pages:
                try:
                    c = urllib.request.urlopen(page)
                except:
                    print("Could not open %s" % page)
                    continue
                soup = bs4.BeautifulSoup(c.read())
                self.add_to_index(page, soup)
                
                links = soup('a')
                for link in links:
                    if ('href' in dict(link.attrs)):
                        url = urllib.parse.urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0] #remove location portion
                        if url[0:4] == 'http' and not self.is_indexed(url):
                            new_pages.add(url)
                        link_text = self.get_text_only(link)
                        self.add_link_ref(page, url, link_text)
                        
                self.db_commit()
                
            pages = new_pages
        
    def create_index_tables(self):
        #create the databse tables
        self.con.execute('create table url_list(url)')
        self.con.execute('create table word_list(word)')
        self.con.execute('create table word_location(url_id,word_id,location)')
        self.con.execute('create table link(from_id integer,to_id integer)')
        self.con.execute('create table link_words(word_id,link_id)')
        self.con.execute('create index word_idx on word_list(word)')
        self.con.execute('create index url_idx on url_list(url)')
        self.con.execute('create index word_url_idx on word_location(word_id)')
        self.con.execute('create index url_to_idx on link(to_id)')
        self.con.execute('create index url_from_idx on link(from_id)')
        self.db_commit( )

In [2]:
#following links from one page to linked pages is called crawling or spidering
#need to download pages, pass to indexer (will build next)
#parse pages to find links to pages to be crawled next

import urllib.request

c = urllib.request.urlopen('http://en.wikipedia.org/wiki/Programming_language')
contents = c.read().decode('utf-8')
print(contents[:250])

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>Programming language - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(


In [3]:
#using beautiful soup you can build a crawler that likes a list of urls
#to index and crawl their links to find other pages to index

import bs4

#create a list of words to ignore
ignore_words = set(['the','of','to','and','a','in','is','it'])

In [4]:
#update the Crawler.crawl method

In [5]:
page_list = ['http://en.wikipedia.org/wiki/Programming_language']
crawler = Crawler('')
#crawler.crawl(page_list) #this runs forever

#loops through list of pages calling add_to_index on each one
#(which currently does nothing except print the URL)
#then uses BS to get all links on that page and adds their url
#to new_pages, at the end of the loop, new_pages becomes pages
#and the process begins again

#this is recursive so each links calls the function again
#BFS allows for easier modification of the code later, either to keep
#crawling forever or to save a list of unindexed pages for later crawling

In [6]:
#next step to build the database for the full-text index
#index is a list of all the different words along with the documents
#in which they appear and their locations in documents
#we will only be looking at text on page and ignoring non-text elements
#indexing words with all punctuation removed

#use SQLite to create a database
#instead of a server, stores in a single file

In [7]:
#change init, del and db_commit methods to open and close the db

In [8]:
#need to prepare the db
#need 5 tables
#url_list -> list of urls that have been indexed
#word_list -> list of words
#word_location -> list of locations of words in documents
#link and link_words -> links between documents
#link -> two URL IDs indicating a link from one table to another
#link_words -> uses word_id and link_id columns to store which words are
#  actually used in that link

#all tables in SQLiute have a field called rowid by default
#so no need to explicitly specify an ID for those tables

In [9]:
#update the create_index_tables method
#this will create a schema for all tables you will be using
#along with indices to speed up searching
#indices are important as dataset can get very large

In [10]:
#crawler = Crawler('search_index.db')
#crawler.create_index_tables()

In [11]:
#the files currently being downloaded are HTML so have a lot of tags
#and other information that doesn't belong in the index
#need to extract all parts of the page that aren't text

#do this by searching the soup for text nodes and collecting all of 
#their content

In [12]:
#update get_text_only method

#now returns a long string containing all the text on the page
#recursively traverses down the HTML document object model
#looking for text nodes
#text in separate sections is separated into different paragraphs
#need to preserve order

In [13]:
#update separate_words function

#splits a string into a list of separate words so they can be added to
#to the index
#it is very basic and considers everything non-alphanumberic to be a
#separator

In [14]:
#update add_to_index method

#now calls two functions defined previously to get list of words on the page
#then adds the page and all the words to the index
#and creates links between them with their locations in the document
#here, the location will be the index within the list of words

In [15]:
#update get_entry_id

#all this does is return ID of an entry if it exists, if not an ID
#is created and then returned

In [16]:
#update is_indexed

#finally need to fill in the code which determines whether the page
#is already in the database, and if so whether are are any words
#associated with it

In [17]:
crawler = Crawler('search_index.db')
pages = ['http://en.wikipedia.org/wiki/Programming_language']
crawler.crawl(pages)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


Indexing http://en.wikipedia.org/wiki/Programming_language




Indexing https://sr.wikipedia.org/wiki/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%81%D0%BA%D0%B8_%D1%98%D0%B5%D0%B7%D0%B8%D0%BA
Indexing http://www.ibm.com/developerworks/library/x-xslt/
Indexing http://en.wikipedia.org/wiki/Declaration_(computer_science)
Indexing http://en.wikipedia.org/w/index.php?title=Template:Programming_languages&action=edit
Indexing http://en.wikipedia.org/wiki/Machine_instruction
Indexing https://rue.wikipedia.org/wiki/%D0%AF%D0%B7%D1%8B%D0%BA_%D0%BF%D1%80%D0%BE%D2%91%D1%80%D0%B0%D0%BC%D0%BE%D0%B2%D0%B0%D0%BD%D1%8F
Indexing http://en.wikipedia.org/wiki/Java_(programming_language)
Indexing http://en.wikipedia.org/wiki/Michael_Sipser
Indexing https://de.wikipedia.org/wiki/Programmiersprache
Could not open http://doi.org/10.1511%2F2006.60.299
Indexing https://id.wikipedia.org/wiki/Bahasa_pemrograman
Indexing http://en.wikipedia.org/wiki/XSLT
Indexing http://en.wikipedia.org/wiki/File:Python_add5_parse.png
Indexing http://en.wikipedia.org/wiki/Computer_science
C

Indexing http://en.wikipedia.org/wiki/Compiled
Indexing http://en.wikipedia.org/wiki/Standard_Generalized_Markup_Language
Indexing http://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License
Indexing http://en.wikipedia.org/wiki/Turing_completeness
Indexing http://en.wikipedia.org/wiki/Swift_(programming_language)
Indexing https://lv.wikipedia.org/wiki/Programm%C4%93%C5%A1anas_valoda
Indexing http://en.wikipedia.org/wiki/Atlas_Computer_(Manchester)
Indexing http://en.wikipedia.org/w/index.php?title=Programming_language&action=edit&section=11
Indexing https://sco.wikipedia.org/wiki/Programmin_leid
Indexing http://en.wikipedia.org/wiki/English_language
Indexing http://en.wikipedia.org/wiki/File:Python_add5_syntax.svg
Indexing http://en.wikipedia.org/w/index.php?title=Programming_language&action=edit&section=20
Indexing http://en.wikipedia.org/wiki/Microcontroller
Indexing http://en.wikipedia.org/wiki/Context-free_grammar
Indexing http://en.

Indexing https://an.wikipedia.org/wiki/Luengache_de_programaci%C3%B3n
Indexing http://en.wikipedia.org/w/index.php?title=Programming_language&action=edit&section=4
Indexing http://en.wikipedia.org/wiki/File:C_Hello_World_Program.png
Indexing https://als.wikipedia.org/wiki/Programmiersprache
Indexing http://en.wikipedia.org/wiki/Visual_Basic_.NET
Indexing https://ja.wikipedia.org/wiki/%E3%83%97%E3%83%AD%E3%82%B0%E3%83%A9%E3%83%9F%E3%83%B3%E3%82%B0%E8%A8%80%E8%AA%9E
Indexing http://en.wikipedia.org/wiki/Internet
Indexing http://www.mactech.com/articles/mactech/Vol.15/15.09/ScriptingLanguages/index.html
Indexing https://th.wikipedia.org/wiki/%E0%B8%A0%E0%B8%B2%E0%B8%A9%E0%B8%B2%E0%B9%82%E0%B8%9B%E0%B8%A3%E0%B9%81%E0%B8%81%E0%B8%A3%E0%B8%A1
Indexing http://en.wikipedia.org/wiki/Printer_(computing)
Indexing http://en.wikipedia.org/wiki/Exception_handling
Indexing http://en.wikipedia.org/w/index.php?title=Programming_language&action=edit&section=21
Indexing http://en.wikipedia.org/wiki/Compa

Indexing http://en.wikipedia.org/wiki/Concepts,_Techniques,_and_Models_of_Computer_Programming
Indexing https://mn.wikipedia.org/wiki/%D0%9F%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D1%87%D0%BB%D0%B0%D0%BB%D1%8B%D0%BD_%D1%85%D1%8D%D0%BB
Indexing http://en.wikipedia.org/wiki/Type_system
Indexing http://en.wikipedia.org/wiki/Special:BookSources/9780199693795
Indexing http://en.wikipedia.org/wiki/History_of_programming_languages
Indexing http://www.computerworld.com.au/article/319269/cobol_turns_50/
Indexing http://en.wikipedia.org/wiki/Modula-2
Indexing http://en.wikipedia.org/w/index.php?title=Programming_language&action=edit&section=1
Indexing http://catalogue.bnf.fr/ark:/12148/cb13318353n
Indexing http://www.cs.brown.edu/~sk/Publications/Books/ProgLangs/
Indexing http://en.wikipedia.org/wiki/Portal:Computer_programming
Indexing http://en.wikipedia.org/wiki/Scripting_language
Indexing http://en.wikipedia.org/wiki/Anonymous_function
Indexing http://mitpress.mit.edu/sicp/full-text/book/book-Z

Indexing https://hsb.wikipedia.org/wiki/Program%C4%9Browanske_r%C4%9B%C4%8De
Indexing https://xmf.wikipedia.org/wiki/%E1%83%9E%E1%83%A0%E1%83%9D%E1%83%92%E1%83%A0%E1%83%90%E1%83%9B%E1%83%98%E1%83%A0%E1%83%90%E1%83%A4%E1%83%90%E1%83%A8_%E1%83%9C%E1%83%98%E1%83%9C%E1%83%90
Indexing http://en.wikipedia.org/wiki/Actor_model
Indexing http://en.wikipedia.org/wiki/Category:Notation
Indexing https://ko.wikipedia.org/wiki/%ED%94%84%EB%A1%9C%EA%B7%B8%EB%9E%98%EB%B0%8D_%EC%96%B8%EC%96%B4
Indexing https://am.wikipedia.org/wiki/%E1%8B%A8%E1%8D%95%E1%88%AE%E1%8C%8D%E1%88%AB%E1%88%9D_%E1%89%8B%E1%8A%95%E1%89%8B
Indexing http://en.wikipedia.org/wiki/High-level_language
Indexing http://en.wikipedia.org/wiki/Invariant_based_programming
Indexing http://en.wikipedia.org/wiki/Introduction_to_the_Theory_of_Computation
Indexing http://en.wikipedia.org/w/index.php?title=Programming_language&action=edit&section=7
Indexing http://en.wikipedia.org/wiki/Subroutine
Indexing http://en.wikipedia.org/wiki/Identifier


Indexing http://en.wikipedia.org/wiki/Low-level_programming_language
Indexing http://en.wikipedia.org/wiki/Special:BookSources/0-534-94728-X
Indexing http://en.wikipedia.org/wiki/Haskell_(programming_language)
Could not open http://radar.oreilly.com/archives/2006/08/programming_language_trends_1.html
Indexing http://en.wikipedia.org/wiki/Oracle_Corporation
Indexing http://en.wikipedia.org/w/index.php?title=Special:UserLogin&returnto=Programming+language
Indexing http://en.wikipedia.org/wiki/Formal_semantics_of_programming_languages
Indexing http://en.wikipedia.org/wiki/Class-based_programming
Indexing http://www.developer.com/lang/other/7-myths-of-cobol-debunked.html
Indexing http://en.wikipedia.org/wiki/Website
Indexing http://en.wikipedia.org/wiki/Theory_of_computation
Indexing http://en.wikipedia.org/wiki/BCPL
Could not open http://en.wikipedia.org/w/index.php?title=Muffy_Thomas&action=edit&redlink=1
Indexing https://it.wikipedia.org/wiki/Linguaggio_di_programmazione
Indexing http:/

In [18]:
#can check the entries for a word by querying the database
[row for row in crawler.con.execute('select rowid from word_location where word_id=1')]
#if using his data, should print 
#[(1,), (46,), (330,), (232,), (406,), (271,), (192,),...

[(1,),
 (1050,),
 (6003,),
 (6016,),
 (6611,),
 (7119,),
 (8506,),
 (11188,),
 (11982,),
 (12087,),
 (12122,),
 (12347,),
 (12541,),
 (12544,),
 (12556,),
 (12933,),
 (13551,),
 (13575,),
 (13658,),
 (13708,),
 (13718,),
 (13730,),
 (13832,),
 (13996,),
 (14371,),
 (14566,),
 (14942,),
 (15258,),
 (17074,),
 (18642,),
 (21432,),
 (22417,),
 (26125,),
 (26129,),
 (26142,),
 (26143,),
 (26162,),
 (26167,),
 (26171,),
 (26200,),
 (26300,),
 (26351,),
 (26470,),
 (26530,),
 (26541,),
 (30570,),
 (32245,),
 (36240,),
 (37826,),
 (37850,),
 (37871,),
 (38325,),
 (39030,),
 (39523,),
 (41245,),
 (41273,),
 (41322,),
 (41352,),
 (42442,),
 (42550,),
 (42578,),
 (43229,),
 (44439,),
 (52527,),
 (61834,),
 (63616,),
 (65568,),
 (66112,),
 (67249,),
 (69736,),
 (71425,),
 (72424,),
 (78454,),
 (78913,),
 (80761,),
 (83373,),
 (86355,),
 (92628,),
 (93260,),
 (94233,),
 (95872,),
 (99938,),
 (100586,),
 (101893,),
 (102242,),
 (102877,),
 (104832,),
 (105625,),
 (108898,),
 (114928,),
 (115387,),


In [None]:
#now have a working crawler with large collection of documents indexed
#now can do the actual searching

class Searcher:
    def __init__(self, db_name):
        self.con=sqlite.connect(db_name)
        
    def __del__(self):
        self.con.close()
        
#PDF PAGE 86, REAL PAGE 63