In [1]:
#Inspired by the work of https://github.com/hardikvasa/wikipedia-crawler

In [2]:
import time     #For Delay
import urllib.request    #Extracting web pages
import re
import string

In [3]:
#Defining pages
starting_page = "http://gossipgirl.wikia.com/wiki/Serena_van_der_Woodsen"
seed_page = "http://gossipgirl.wikia.com/"  #Crawling the English Wikipedia

In [4]:
#Downloading entire Web Document (Raw Page Content)
def download_page(url):
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib.request.Request(url, headers = headers)
        resp = urllib.request.urlopen(req)
        respData = str(resp.read())
        return respData
    except Exception as e:
        print(str(e))

In [5]:
#Extract the title tag
def extract_info(page, start_tag, end_tag):
    start = page.find(start_tag)
    end = page.find(end_tag, start + 1)
    n = len(start_tag)
    info = page[start + n : end]
    return info, end

In [6]:
#Name
page = download_page(starting_page)
start_tag = r'<h1 class="page-header__title">'
end_tag = "</h1>"
extract_info(page, start_tag, end_tag)[0]

'Serena van der Woodsen'

In [7]:
page

'b\'<!doctype html>\\n<html lang="en" dir="ltr" class="">\\n<head>\\n\\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\\n\\t<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\\n<meta name="generator" content="MediaWiki 1.19.24" />\\n<meta name="keywords" content="Gossip Girl Wiki,gossipgirl,Serena van der Woodsen,Dan Humphrey,William van der Woodsen,Lily van der Woodsen,Eric van der Woodsen,Jenny Humphrey,Milo Humphrey,Scott Rosson,Lola Rhodes,Chuck Bass,CeCe Rhodes" />\\n<meta name="description" content="Serena Celia Humphrey (n\\xc3\\xa9e van der Woodsen) is a main character in the Gossip Girl series of novels and its television adaption, in which she is portrayed by Blake Lively. Throughout the series, Serena has an on-again, off-again relationship with Dan Humphrey and the two have the longest running..." />\\n<meta name="twitter:card" content="summary" />\\n<meta name="twitter:site" content="@getfandom" />\\n<meta name="t

In [8]:
#Extract all the links
#Finding 'Next Link' on a given web page
def get_next_link(s):
    start_link = s.find("<a href")
    if start_link == -1:    #If no links are found then give an error!
        end_quote = 0
        link = "no_links"
        return link, end_quote
    else:
        start_quote = s.find('"', start_link)
        end_quote = s.find('"',start_quote+1)
        link = str(s[start_quote+1:end_quote])
        return link, end_quote

#Getting all links with the help of 'get_next_links'
def get_all_links(page):
    links = []
    while True:
        link, end_link = get_next_link(page)
        if link == "no_links":
            break
        else:
            links.append(link)      #Append all the links in the list named 'Links'
            #time.sleep(0.1)
            page = page[end_link:]
    return links 

In [9]:
#Season Appearance
def extract_season(page):
    season_path = seed_page + get_next_link(page[page.find('First appearance'):])[0]
    season = extract_info(download_page(season_path), 'Season', ',')[0]
    return re.sub("[^0-9]", "", season)

In [10]:
extract_season(page)

'1'

In [11]:
#Extract just the Introduction part of the page
def extract_introduction(page):
    start_introduction = page.find("<p>")
    stop_introduction = page.find('<div id="toctitle">', start_introduction + 1)
    #If the page onl has introduction
    if '<div id="toctitle">' not in page:
        stop_introduction = page.find('</p>', start_introduction + 1)
    else:
        pass    
    raw_introduction = page[start_introduction : stop_introduction]
    return raw_introduction

In [12]:
#Short Description (color of line)
extract_introduction(page)

'<p><br />\\n<b>Serena Celia Humphrey </b>(n\\xc3\\xa9e<b> van der Woodsen</b>) is a main character in the <a href="/wiki/Gossip_Girl_(novel_series)" title="Gossip Girl (novel series)"><i>Gossip Girl</i> series of novels</a> and its <a href="/wiki/Gossip_Girl_(TV_series)" title="Gossip Girl (TV series)">television adaption</a>, in which she is portrayed by <a href="/wiki/Blake_Lively" title="Blake Lively">Blake Lively</a>.\\n</p><p>Throughout the series, Serena has an <a href="/wiki/Dan-Serena_Relationship" title="Dan-Serena Relationship" class="mw-redirect">on-again, off-again relationship</a> with <a href="/wiki/Dan_Humphrey" title="Dan Humphrey">Dan Humphrey</a> and the two have the longest running relationship on the show. Dan and Serena finally get married in the <a href="/wiki/New_York,_I_Love_You_XOXO" title="New York, I Love You XOXO">series finale</a>.\\n</p><p>Serena is regarded as the "it girl" of the <a href="/wiki/Upper_East_Side" title="Upper East Side">Upper East Side</a

In [13]:
#Remove all the HTML tags from the introduction to get the pure text
#Eliminate all the text inside '<' & '>'
def extract_pure_text(page):
    pure_text = (re.sub(r'<.+?>', '', page))       #From '<' to the next '>'
    return pure_text

In [14]:
extract_pure_text(extract_introduction(page))

'\\nSerena Celia Humphrey (n\\xc3\\xa9e van der Woodsen) is a main character in the Gossip Girl series of novels and its television adaption, in which she is portrayed by Blake Lively.\\nThroughout the series, Serena has an on-again, off-again relationship with Dan Humphrey and the two have the longest running relationship on the show. Dan and Serena finally get married in the series finale.\\nSerena is regarded as the "it girl" of the Upper East Side. She is best friends with Blair Waldorf and also has a close friendship with Nate Archibald and Chuck Bass. \\nShe, Dan and Blair are the only characters that appear in every episode of the TV series.\\n\\n'

In [15]:
#Crawl Initiation
#Check for file type in URL so crawler does not crawl images and text files
def extension_scan(url):
    a = ['.png','.jpg','.jpeg','.gif','.tif','.txt']
    j = 0
    while j < (len(a)):
        if a[j] in url:
            #print("There!")
            flag2 = 1
            break
        else:
            #print("Not There!")
            flag2 = 0
            j = j+1
    #print(flag2)
    return flag2

In [49]:
# Extract for family
def find_relations(page, start_tag, end_tag):
    result = []
    start = page.find(start_tag)
    end = page[start:].find(end_tag)
    i = start
    while(i < start+end):
        link, j = get_next_link(page[i:])
        name = extract_info(page[i+j:], ">", "<")[0]
        relation, k = extract_info(page[i+j:], "(", ")")
        i += k+j
        res = [link, name, extract_pure_text(relation)]
        result+=[res]
    return result

In [51]:
find_relations(page, "Family</b></h3>", "Romances</b></h3>")

[['/wiki/Dan_Humphrey', 'Dan Humphrey', 'husband'],
 ['/wiki/William_van_der_Woodsen', 'William van der Woodsen', 'father'],
 ['/wiki/Lily_van_der_Woodsen', 'Lily van der Woodsen', 'mother'],
 ['/wiki/Eric_van_der_Woodsen', 'Eric van der Woodsen', 'brother'],
 ['/wiki/Jenny_Humphrey', 'Jenny Humphrey', 'sister-in-law, via Dan'],
 ['/wiki/Milo_Humphrey', 'Milo Humphrey', 'legal step-son, via Dan'],
 ['/wiki/Scott_Rosson',
  'Scott Rosson',
  'maternal half-brother/brother-in-law'],
 ['/wiki/Lola_Rhodes',
  'Charlotte "Lola" Rhodes',
  'paternal half-sister/cousin'],
 ['/wiki/Chuck_Bass', 'Chuck Bass', 'adoptive brother'],
 ['/wiki/CeCe_Rhodes', 'CeCe', 'maternal grandparents'],
 ['/wiki/Carol_Rhodes', 'Carol Rhodes', 'maternal aunt'],
 ['/wiki/Blair_Waldorf', 'Blair Waldorf', 'adoptive sister-in-law, via Chuck'],
 ['/wiki/Henry_Bass', 'Henry Bass', 'nephew, via Chuck and Blair'],
 ['/wiki/Alison_Humphrey', 'Alison Humphrey', 'mother-in-law, via Dan'],
 ['/wiki/Rufus_Humphrey', 'Rufus Hu

In [50]:
# Extract for Romances
find_relations(page, "Romances</b></h3>", "Friends</b></h3>")

[['/wiki/Dan_Humphrey', 'Dan Humphrey', 'husband'],
 ['/wiki/Nate_Archibald', 'Nate Archibald', 'ex-boyfriend'],
 ['/wiki/Aaron_Rose', 'Aaron Rose', 'ex-boyfriend'],
 ['/wiki/Gabriel_Edwards', 'Gabriel Edwards', 'ex-boyfriend'],
 ['/wiki/Carter_Baizen', 'Carter Baizen', 'ex-boyfriend'],
 ['/wiki/Ben_Donovan', 'Ben Donovan', 'ex-boyfriend'],
 ['/wiki/Steven_Spence', 'Steven Spence', 'ex-boyfriend'],
 ['/wiki/Tripp_van_der_Bilt', 'Tripp van der Bilt', 'affair'],
 ['/wiki/Colin_Forrester', 'Colin Forrester', 'fling'],
 ['/wiki/Max_Harding', 'Max Harding', 'kissed'],
 ['/wiki/Blair_Waldorf', 'Blair Waldorf', 'best friend']]

In [None]:
def find_people(page, start_tag, end_tag, relation):
    result = []
    start = page.find(start_tag)
    end = page[start:].find(end_tag)
    i = start
    while(i < start+end):
        link, j = get_next_link(page[i:])
        name, k = extract_info(page[i+j:], ">", "<")
        i += k+j
        res = [link, name, relation]
        result+=[res]
    return result

In [52]:
# Extract for friends
find_relations(page, "Friends</b></h3>", "Enemies</b></h3>", '')

[['/wiki/Blair_Waldorf', 'Blair Waldorf', 'best friend'],
 ['/wiki/Chuck_Bass', 'Chuck Bass', 'frenemy']]

In [45]:
# Extract for Enemies
find_relations(page, "Enemies</b></h3>", "</section>")

[['/wiki/Georgina_Sparks', 'Georgina Sparks', 'frenemy'],
 ['/wiki/Ivy_Dickens', 'Ivy Dickens', 'frenemy'],
 ['/wiki/Juliet_Sharp', 'Juliet Sharp', 'n\\xc3\\xa9e<b> van der Woodsen</b>']]

In [4]:
#URL parsing for incomplete or duplicate URLs
def url_parse(url):
    try:
        from urllib.parse import urlparse
    except ImportError:
        from urlparse import urlparse
    url = url  #.lower()    #Make it lower case
    s = urlparse(url)       #parse the given url
    seed_page_n = seed_page #.lower()       #Make it lower case
    #t = urlparse(seed_page_n)     #parse the seed page (reference page)
    i = 0
    flag = 0
    while i<=9:
        if url == "/":
            url = seed_page_n
            flag = 0  
        elif not s.scheme:
            url = "http://" + url
            flag = 0
        elif "#" in url:
            url = url[:url.find("#")]
            flag = 0
        elif "?" in url:
            url = url[:url.find("?")]
            flag = 0
        elif s.netloc == "":
            url = seed_page + s.path
            flag = 0
        #elif "www" not in url:
        #    url = "www."[:7] + url[7:]
        #    flag = 0
            
        elif url[len(url)-1] == "/":
            url = url[:-1]
            flag = 0
        #elif s.netloc != t.netloc:
        #    url = url
        #    flag = 1
        #    break        
        else:
            url = url
            flag = 0
            break
        
        i = i+1
        s = urlparse(url)   #Parse after every loop to update the values of url parameters
    return(url, flag)

In [5]:
t0 = time.time()
database = {}   #Create a dictionary

In [6]:
#Main Crawl function that calls all the above function and crawls the entire site sequentially
def web_crawl():  
    to_crawl = [starting_page]      #Define list name 'Seed Page'
    #print(to_crawl)
    crawled=[]      #Define list name 'Seed Page'
    #database = {}   #Create a dictionary
    #k = 0;
    for k in range(0, 3):
        i=0        #Initiate Variable to count No. of Iterations
        while i<3:     #Continue Looping till the 'to_crawl' list is not empty
            urll = to_crawl.pop(0)      #If there are elements in to_crawl then pop out the first element
            urll,flag = url_parse(urll)
            #print(urll)
            flag2 = extension_scan(urll)
            time.sleep(3)
            
            #If flag = 1, then the URL is outside the seed domain URL
            if flag == 1 or flag2 == 1:
                pass        #Do Nothing
                
            else:       
                if urll in crawled:     #Else check if the URL is already crawled
                    pass        #Do Nothing
                else:       #If the URL is not already crawled, then crawl i and extract all the links from it
                    print("Link = " + urll)
                    
                    raw_html = download_page(urll)
                    #print(raw_html)
                    
                    title_upper = str(extract_title(raw_html))
                    title = title_upper.lower()     #Lower title to match user queries
                    print("Title = " + title)
                     
                    raw_introduction = extract_introduction(raw_html)
                    #print("Raw Introduction = " + raw_introduction)
                    
                    to_crawl = to_crawl + get_all_links(raw_introduction)
                    crawled.append(urll)
                    
                    pure_introduction = extract_pure_text(raw_introduction)
                    print("Introduction = " + pure_introduction.replace('   ',' '))
                    
                    database [title] = pure_introduction        #Add title and its introduction to the dict
                    
                    #Writing the output data into a text file
                    file = open('database.txt', 'a')        #Open the text file called database.txt
                    file.write(title + ": " + "\n")         #Write the title of the page
                    file.write(pure_introduction + "\n\n")      #write the introduction of that page
                    file.close()                            #Close the file
                    
    
                    #Remove duplicated from to_crawl
                    n = 1
                    j = 0
                    #k = 0
                    while j < (len(to_crawl)-n):
                        if to_crawl[j] in to_crawl[j+1:(len(to_crawl)-1)]:
                            to_crawl.pop(j)
                            n = n+1
                        else:
                            pass     #Do Nothing
                        j = j+1
                i=i+1
                print(i)
                print(k)
                #print(to_crawl)
                #print("Iteration No. = " + str(i))
                #print("To Crawl = " + str(len(to_crawl)))
                #print("Crawled = " + str(len(crawled)))
    return ""

In [7]:
%%time
print (web_crawl())

Link = http://gossipgirl.wikia.com/wiki/Serena_van_der_Woodsen
Title = \n<html lang="en" dir="ltr" class="">\n<head>\n\n<meta http-equiv="content-type" content="text/html; charset=utf-8">\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n<meta name="generator" content="mediawiki 1.19.24" />\n<meta name="keywords" content="gossip girl wiki,gossipgirl,serena van der woodsen,dan humphrey,william van der woodsen,lily van der woodsen,eric van der woodsen,jenny humphrey,milo humphrey,scott rosson,lola rhodes,chuck bass,cece rhodes" />\n<meta name="description" content="serena celia humphrey (n\xc3\xa9e van der woodsen) is a main character in the gossip girl series of novels and its television adaption, in which she is portrayed by blake lively. throughout the series, serena has an on-again, off-again relationship with dan humphrey and the two have the longest running..." />\n<meta name="twitter:card" content="summary" />\n<meta name="twitter:site" 

1
0
Link = http://gossipgirl.wikia.com//wiki/Gossip_Girl_(novel_series)
Title = \n<html lang="en" dir="ltr" class="">\n<head>\n\n<meta http-equiv="content-type" content="text/html; charset=utf-8">\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n<meta name="generator" content="mediawiki 1.19.24" />\n<meta name="keywords" content="gossip girl wiki,gossipgirl,gossip girl (novel series),would i lie to you,blake lively,chace crawford,ed westwick,all i want is everything,because i\'m worth it,cecily von ziegesar,don\'t you forget about me,gossip girl,gossip girl: the carlyles" />\n<meta name="description" content="the gossip girl novel series was created by cecily von ziegesar and written by herself as well as by an unknown ghost writer. the name of the first novel in the series, gossip girl, is also the nom de plume of the narrator. it has also been adapted into a tv series airing on the cw and ctv. it..." />\n<meta name="twitter:card" content="

2
0
Link = http://gossipgirl.wikia.com//wiki/Gossip_Girl_(TV_series)
Title = \n<html lang="en" dir="ltr" class="">\n<head>\n\n<meta http-equiv="content-type" content="text/html; charset=utf-8">\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n<meta name="generator" content="mediawiki 1.19.24" />\n<meta name="keywords" content="gossip girl wiki,gossipgirl,gossip girl (tv series),gossip girl (novel series),gossip girl (character),manhattan,serena van der woodsen,leighton meester,blair waldorf,penn badgley,dan humphrey,chace crawford,nate archibald" />\n<meta name="description" content="gossip girl is an american teen drama television series based on the book series of the same name written by cecily von ziegesar. the series, created by josh schwartz and stephanie savage, originally ran on the cw for six seasons from september 19, 2007, to december 17, 2012. narrated by the..." />\n<meta name="twitter:card" content="summary" />\n<meta name="twi

3
0
Link = http://gossipgirl.wikia.com//wiki/Blake_Lively
Title = \n<html lang="en" dir="ltr" class="">\n<head>\n\n<meta http-equiv="content-type" content="text/html; charset=utf-8">\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n<meta name="generator" content="mediawiki 1.19.24" />\n<meta name="keywords" content="gossip girl wiki,gossipgirl,blake lively,serena van der woodsen,gossip girl (tv series),gossip girl (books),cecily von ziegesar,gossip girl,penn badgley" />\n<meta name="description" content="blake ellender brown (lively) is an american actress, (born on august 25, 1987.) she is best known for her role as serena van der woodsen in gossip girl. lively has also starred in such films as the sisterhood of the traveling pants (2005), accepted (2006), the private lives of pippa lee (2009..." />\n<meta name="twitter:card" content="summary" />\n<meta name="twitter:site" content="@getfandom" />\n<meta name="twitter:url" content="http://go

1
1
Link = http://gossipgirl.wikia.com//wiki/Dan-Serena_Relationship
Title = \n<html lang="en" dir="ltr" class="">\n<head>\n\n<meta http-equiv="content-type" content="text/html; charset=utf-8">\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n<meta name="generator" content="mediawiki 1.19.24" />\n<meta name="keywords" content="gossip girl wiki,gossipgirl,dan\xe2\x80\x93serena relationship,pilot,summer kind of wonderful,in the realm of the basses,last tango, then paris,the kids are not alright,new york, i love you xoxo,carrnal knowledge,gaslit,much \'i do\' about nothing,the dark night" />\n<meta name="description" content="the relationship between dan and serena, also known as derena, is the friendship and romantic relationship between serena van der woodsen and dan humphrey. dan and serena met at a party thrown by blair waldorf that dan was accidentally invited to. they had a brief conversation that led to dan..." />\n<meta name="twitter:ca

2
1
Link = http://gossipgirl.wikia.com//wiki/New_York,_I_Love_You_XOXO
Title = \n<html lang="en" dir="ltr" class="">\n<head>\n\n<meta http-equiv="content-type" content="text/html; charset=utf-8">\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n<meta name="generator" content="mediawiki 1.19.24" />\n<meta name="keywords" content="gossip girl wiki,gossipgirl,new york, i love you xoxo,the revengers,riding in town cars with boys,lola rhodes,juliet sharp,agnes andrews,henry bass,olivia burke,blake lively,serena van der woodsen,blair waldorf" />\n<meta name="description" content="new york, i love you xoxo is the 10th episode of the sixth season, the 121st episode overall, and the series finale. the upper east side was like something from fitzgerald or thackeray. teenagers acting like adults, adults acting like teenagers: guarding secrets, spreading gossip, all with the..." />\n<meta name="twitter:card" content="summary" />\n<meta name="twitter:sit

3
1
Link = http://gossipgirl.wikia.com//wiki/Nate_Archibald
Title = \n<html lang="en" dir="ltr" class="">\n<head>\n\n<meta http-equiv="content-type" content="text/html; charset=utf-8">\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n<meta name="generator" content="mediawiki 1.19.24" />\n<meta name="keywords" content="gossip girl wiki,gossipgirl,nate archibald,howard archibald,anne archibald,william van der bilt,maureen van der bilt,tripp van der bilt,manhattan,new york city,st. jude\'s school for boys,the new york spectator,columbia university" />\n<meta name="description" content="nathaniel fitzwilliam &amp;quot;nate&amp;quot; archibald is a main character in the gossip girl series of novels and on its television adaption, in which he is portrayed by chace crawford. nate is best friends with chuck bass and is good friends with dan humphrey, serena van der woodsen and blair waldorf. nate..." />\n<meta name="twitter:card" content="summary" /

1
2
Link = http://gossipgirl.wikia.com//wiki/The_It_Girl
Title = \n<html lang="en" dir="ltr" class="">\n<head>\n\n<meta http-equiv="content-type" content="text/html; charset=utf-8">\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n<meta name="generator" content="mediawiki 1.19.24" />\n<meta name="keywords" content="gossip girl wiki,gossipgirl,the it girl (book),the it girl (novel series),notorious" />\n<meta name="description" content="the it girl is the spin-off of gossip girl, the main character being jenny humphrey. it was created by cecily von ziegesar. you may recall from the last gossip girl book that jenny humphrey has left constance billard in nyc to go to a boarding school- waverly academy in upstate new york. waverly..." />\n<meta name="twitter:card" content="summary" />\n<meta name="twitter:site" content="@getfandom" />\n<meta name="twitter:url" content="http://gossipgirl.wikia.com/wiki/the_it_girl_(book)" />\n<meta name="twitter:

2
2
Link = http://gossipgirl.wikia.com//wiki/Gossip_Girl:_The_Carlyles
Title = \n<html lang="en" dir="ltr" class="">\n<head>\n\n<meta http-equiv="content-type" content="text/html; charset=utf-8">\n\t<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">\n<meta name="generator" content="mediawiki 1.19.24" />\n<meta name="keywords" content="gossip girl wiki,gossipgirl,gossip girl: the carlyles,annabelle vestry,gossip girl: the carlyles (novel series),you just can\'t get enough,cecily von ziegesar" />\n<meta name="description" content="gossip girl: the carlyles is the first book in the the carlyles novel series. this book was created by cecily von ziegesar and written by annabelle vestry. get out your platinum montblanc pens, chloe satchels, and cashmere cardigans: it\'s a brand new year on the upper east side and the notorious..." />\n<meta name="twitter:card" content="summary" />\n<meta name="twitter:site" content="@getfandom" />\n<meta name="twitter:u

3
2

Wall time: 43.2 s
