### Creating a spider that starts from one page, collects the links on that page, and then crawls each linked page in turn

In [6]:
import sqlite3
import urllib.error
import ssl
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Build a TLS context that accepts any certificate, so pages served with
# broken or self-signed certificates can still be fetched.
ctx = ssl.create_default_context()
# Targets are assigned left to right: hostname checking is switched off
# first, which is required before verification can be set to CERT_NONE.
ctx.check_hostname, ctx.verify_mode = False, ssl.CERT_NONE

# Open (or create) the crawl database and make sure its tables exist.
conn = sqlite3.connect('spider-new-url.sqlite')
cur = conn.cursor()

# Pages: one row per known URL (html stays NULL until the page is fetched).
# Links: directed edges between Pages rows, stored once per (from, to) pair.
# Webs:  URL prefixes that the crawl is allowed to stay inside.
for ddl in (
    '''CREATE TABLE IF NOT EXISTS Pages
    (id INTEGER PRIMARY KEY, url TEXT UNIQUE, html TEXT,
     error INTEGER, old_rank REAL, new_rank REAL)''',
    '''CREATE TABLE IF NOT EXISTS Links
    (from_id INTEGER, to_id INTEGER, UNIQUE(from_id, to_id))''',
    '''CREATE TABLE IF NOT EXISTS Webs (url TEXT UNIQUE)''',
):
    cur.execute(ddl)

# Decide whether to resume an earlier crawl or seed a new one.
# A page with neither html nor an error code is still waiting to be fetched;
# if one exists, the crawl picks up where it left off.
cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')

row = cur.fetchone()
if row is not None:
    # The file name here must match the database opened by sqlite3.connect.
    print("Restarting existing crawl.  Remove spider-new-url.sqlite to start a fresh crawl.")
else:
    # Stray whitespace around the typed URL would otherwise end up in the
    # stored URL and in the prefix used to filter links.
    starturl = input('Enter web url or enter: ').strip()
    if len(starturl) < 1:
        starturl = 'https://news.google.com/'
    if starturl.endswith('/'):
        starturl = starturl[:-1]

    # The "web" is the prefix that followed links must start with. For a URL
    # naming a specific .htm/.html file, use its containing directory.
    web = starturl
    if starturl.endswith(('.htm', '.html')):
        pos = starturl.rfind('/')
        web = starturl[:pos]

    if len(web) > 1:
        cur.execute('INSERT OR IGNORE INTO Webs (url) VALUES ( ? )', (web,))
        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', (starturl,))
        conn.commit()

# Load every allowed site prefix from the Webs table and show the list.
cur.execute('''SELECT url FROM Webs''')
webs = [str(record[0]) for record in cur]

print(webs)

# Main crawl loop.
#
# Repeatedly asks how many pages to fetch, then for each one: picks a random
# not-yet-fetched page, downloads it, stores its HTML, and records every link
# that stays inside one of the allowed sites (`webs`). All database changes
# for a page are committed together once the page has been fully processed.

# str.startswith accepts a tuple, so all allowed prefixes are tested in one
# call; an empty tuple simply matches nothing.
allowed_prefixes = tuple(webs)

many = 0
while True:
    if many < 1:
        sval = input('How many pages:')
        if len(sval) < 1:
            break
        try:
            many = int(sval)
        except ValueError:
            print('Please enter a whole number, or just press enter to stop')
            many = 0
            continue
        if many < 1:
            # Zero or a negative count means "nothing to do"; ask again.
            continue
    many = many - 1

    cur.execute('SELECT id,url FROM Pages WHERE html is NULL and error is NULL ORDER BY RANDOM() LIMIT 1')
    row = cur.fetchone()
    if row is None:
        print('No unretrieved HTML pages found')
        many = 0
        break
    fromid, url = row

    print(fromid, url, end=' ')

    # If we are retrieving this page, there should be no links from it
    cur.execute('DELETE from Links WHERE from_id=?', (fromid,))
    try:
        # The with-block closes the HTTP response even if reading fails.
        with urlopen(url, context=ctx) as document:
            html = document.read()
            status = document.getcode()
            content_type = document.info().get_content_type()

        if status != 200:
            print("Error on page: ", status)
            cur.execute('UPDATE Pages SET error=? WHERE url=?', (status, url))

        if content_type != 'text/html':
            print("Ignore non text/html page")
            # Drop the page and any links pointing at it, so no Links row is
            # left referring to an id that no longer exists.
            cur.execute('DELETE FROM Links WHERE to_id=?', (fromid,))
            cur.execute('DELETE FROM Pages WHERE url=?', (url,))
            conn.commit()
            continue

        print('('+str(len(html))+')', end=' ')

        soup = BeautifulSoup(html, "html.parser")
    except KeyboardInterrupt:
        print('')
        print('Program interrupted by user...')
        # Discard the half-finished work for this page.
        conn.rollback()
        break
    except Exception:
        # Any fetch/parse failure marks the page so it is not retried.
        print("Unable to retrieve or parse page")
        cur.execute('UPDATE Pages SET error=-1 WHERE url=?', (url,))
        conn.commit()
        continue

    cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', (url,))
    cur.execute('UPDATE Pages SET html=? WHERE url=?', (memoryview(html), url))

    # Retrieve all of the anchor tags
    count = 0
    for tag in soup('a'):
        href = tag.get('href', None)
        if href is None:
            continue
        # Resolve relative references like href="/contact"
        try:
            if len(urlparse(href).scheme) < 1:
                href = urljoin(url, href)
        except ValueError:
            # Malformed href (e.g. a broken bracketed host); skip this link
            # rather than aborting the whole crawl.
            continue
        # Drop the fragment so "page#a" and "page#b" count as one page.
        ipos = href.find('#')
        if ipos > 1:
            href = href[:ipos]
        if href.endswith(('.png', '.jpg', '.gif')):
            continue
        if href.endswith('/'):
            href = href[:-1]
        if len(href) < 1:
            continue

        # Check if the URL is in any of the webs
        if not href.startswith(allowed_prefixes):
            continue

        cur.execute('INSERT OR IGNORE INTO Pages (url, html, new_rank) VALUES ( ?, NULL, 1.0 )', (href,))
        count = count + 1

        cur.execute('SELECT id FROM Pages WHERE url=? LIMIT 1', (href,))
        row = cur.fetchone()
        if row is None:
            print('Could not retrieve id')
            continue
        toid = row[0]
        cur.execute('INSERT OR IGNORE INTO Links (from_id, to_id) VALUES ( ?, ? )', (fromid, toid))

    # One commit per page: the stored HTML and all of its links become
    # visible together, and the final link of the page is not left pending.
    conn.commit()
    print(count)

# Safety net so nothing written above is lost when the cursor is closed.
conn.commit()
cur.close()




Enter web url or enter: 
['https://news.google.com']
How many pages:100
1 https://news.google.com (1346561) 129
2 https://news.google.com/?hl=en-IN&gl=IN&ceid=IN%3Aen (1338445) 129
3 https://news.google.com/?tab=nn (1338543) 129
71 https://news.google.com/topics/CAAqIQgKIhtDQkFTRGdvSUwyMHZNRE53T0RVU0FtVnVLQUFQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen (1723479) 293
143 https://news.google.com/topics/articles/CAIiEFwEHbghgzmDp64ximpWJtAqGQgEKhAIACoHCAowsLXdCjCm3dEBMNS-swY?hl=en-IN&gl=IN&ceid=IN%3Aen (1338472) 129
194 https://news.google.com/topics/articles/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pKVGlnQVAB?hl=en-IN&gl=IN&ceid=IN%3Aen (1346532) 129
18 https://news.google.com/articles/CAIiENVz4o18nfi9UNy8o1vSlBEqGQgEKhAIACoHCAowj8n_CjDIrfkCMNWczwY?hl=en-IN&gl=IN&ceid=IN%3Aen (833106) 14
146 https://news.google.com/topics/articles/CAIiENLiDXjf9RUsmpk-tj0Ixx4qGQgEKhAIACoHCAowzrL9CjDC7vQCMJmD1wU?hl=en-IN&gl=IN&ceid=IN%3Aen (1346511) 129
237 https://news.google.com/topics/articles/articles/C

10 https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pKVGlnQVAB?hl=en-IN&gl=IN&ceid=IN%3Aen (3505788) 784
584 https://news.google.com/topics/topics/stories/articles/CAIiEB7ha0OobjT3hoeX1vQSmwMqGQgEKhAIACoHCAowzrL9CjDC7vQCMJmD1wU?hl=en-IN&gl=IN&ceid=IN%3Aen (1333826) 130
742 https://news.google.com/topics/articles/articles/topics/articles/CAIiEGSkLSkkLtjjC9k25Bza41oqGQgEKhAIACoHCAowj8n_CjDIrfkCMJWZ2AY?hl=en-IN&gl=IN&ceid=IN%3Aen (1341931) 130
525 https://news.google.com/topics/topics/articles/CAIiEPGWktwDn17tc6m9jUfv23kqGQgEKhAIACoHCAow55veCjDzvdUBMIPh5gU?hl=en-IN&gl=IN&ceid=IN%3Aen (1341971) 130
773 https://news.google.com/topics/articles/articles/topics/publications/CAAqBwgKMOaqjQsw-KufAw?hl=en-IN&gl=IN&ceid=IN%3Aen (1341929) 130
2128 https://news.google.com/topics/articles/topics/articles/topics/topics/articles/CBMihQFodHRwczovL3d3dy5pbmRpYXRvZGF5LmluL2ZhY3QtY2hlY2svc3RvcnkvZmFjdC1jaGVjay1wbS1tb2RpLW5ldmVyLXNhaWQtdmFqcGF5ZWUtc3RhcnRlZC10aGUtZmlyc3QtbWV0cm8t

3037 https://news.google.com/topics/articles/stories/CAAqOQgKIjNDQklTSURvSmMzUnZjbmt0TXpZd1NoTUtFUWl4aGVUS2tZQU1FWmRvb1R4cGg3VXJLQUFQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen (1333608) 131
553 https://news.google.com/topics/topics/articles/CAIiEAx6XBamETd3L4rB_9mtwnwqGQgEKhAIACoHCAow5qqNCzD4q58DMMPepwY?hl=en-IN&gl=IN&ceid=IN%3Aen (1334289) 131
1614 https://news.google.com/topics/articles/topics/articles/articles/topics/CAAqIQgKIhtDQkFTRGdvSUwyMHZNRE55YXpBU0FtVnVLQUFQAQ?hl=en-IN&gl=IN&ceid=IN%3Aen (1334316) 131
1189 https://news.google.com/topics/articles/articles/topics/articles/articles/CAIiEGSkLSkkLtjjC9k25Bza41oqGQgEKhAIACoHCAowj8n_CjDIrfkCMJWZ2AY?hl=en-IN&gl=IN&ceid=IN%3Aen (1334312) 131
2076 https://news.google.com/topics/articles/topics/articles/topics/topics/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRFp1ZEdvU0FtVnVHZ0pKVGlnQVAB?hl=en-IN&gl=IN&ceid=IN%3Aen (1334302) 131
2384 https://news.google.com/topics/publications/CAAiEPLaWn1TGx_bRcUvsOZFLA4qFAgKIhDy2lp9Uxsf20XFL7DmRSwO?hl=en-IN&gl=IN&ceid=IN%