In [1]:
import logging
logging.getLogger('scrapy').propagate = False

import json
from bs4 import BeautifulSoup

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import scrapy.crawler as crawler

import networkx as nx

import matplotlib.pyplot as plt
%matplotlib inline

# Path of the JSON feed the crawler writes (FEED_URI) and the graph-building cell reads back.
SCRAPY_RESULT_FILE = "graph.json"
# Value passed to the spider's CLOSESPIDER_PAGECOUNT setting.
SCRAPY_PAGES_COUNT = 10000
# Maximum number of extracted links kept per page (slice applied in the rule's process_links).
SCRAPY_LINKS_LIMIT = 100

In [2]:
class CustomSpider(CrawlSpider):
    """Crawl Wikipedia article pages and emit one record per page.

    Each record contains the page URL, its title, a short text snippet and
    the outgoing article links, capped at ``SCRAPY_LINKS_LIMIT`` per page by
    the rule's ``process_links`` hook.
    """

    name = "custom_spider"

    allowed_domains = ["wikipedia.org"]
    start_urls = [
        "https://en.wikipedia.org/wiki/Information_retrieval",
        "https://en.wikipedia.org/wiki/Mein_Kampf",
        "https://en.wikipedia.org/wiki/Soviet_Union",
        "https://en.wikipedia.org/wiki/Nineteen_Eighty-Four",
        "https://en.wikipedia.org/wiki/The_Hero_with_a_Thousand_Faces",
    ]
    rules = (
        Rule(
            LinkExtractor(
                # Raw strings so that ``\.`` reaches the regex engine as an
                # escaped dot instead of being an invalid string escape.
                # NOTE(review): the negative lookahead has no trailing ':' so it
                # also rejects articles whose title merely *starts* with one of
                # these words - confirm that this is intended.
                allow=(
                    r"https://.+\.wikipedia\.org/wiki/"
                    r"(?!(File|Talk|Category|Portal|Special|Wikipedia|Help|Draft|Main_Page)).+"
                ),
                # Only follow links found inside the article body.
                restrict_xpaths='//div[@id="mw-content-text"]',
                canonicalize=True,
                unique=True,
            ),
            process_links=lambda links: links[:SCRAPY_LINKS_LIMIT],
            callback="parse_item",
            follow=True,
        ),
    )

    custom_settings = {
        "CLOSESPIDER_PAGECOUNT": SCRAPY_PAGES_COUNT,
        "CLOSESPIDER_ERRORCOUNT": 0,
        "CONCURRENT_REQUESTS": 16
    }

    def parse_item(self, response):
        """Build the graph record for one crawled page.

        Args:
            response: the Scrapy response of the downloaded page.

        Returns:
            A dict with keys ``url``, ``title``, ``snippet`` and ``links``, or
            ``None`` (page skipped) when the page has no ``<p>`` element or
            when extraction fails unexpectedly.
        """
        try:
            # string() concatenates every descendant text node, so the title is
            # still captured when the heading wraps it in child markup; a plain
            # ``::text`` on the <h1> only sees text placed directly inside it.
            heading = response.xpath('string(//h1[@id="firstHeading"])').extract_first()
            title = (heading or "").strip() or None

            first_paragraph = response.css('p').extract_first()
            if first_paragraph is None:
                # No paragraph to build a snippet from: skip the page.
                return None
            snippet = BeautifulSoup(first_paragraph, "lxml").text[:255] + "..."

            # Re-run the crawl rules on this page so the stored edges are the
            # same (filtered and capped) links the spider follows.
            # `_rules` is an underscore-prefixed attribute of the base class.
            links = [lnk.url for rule in self._rules
                     for lnk in rule.process_links(rule.link_extractor.extract_links(response))]
            return {'url': response.url, 'title': title, 'snippet': snippet, 'links': links}
        except Exception:
            # Best effort: one broken page must not stop the crawl, but the
            # failure is logged instead of being silently discarded.
            self.logger.warning("Failed to parse %s", response.url, exc_info=True)
            return None

In [3]:
# Scrapy appends to an existing local feed file, so a leftover result file from
# a previous run would end up containing two concatenated JSON documents and
# could no longer be parsed. Start every crawl from an empty file.
with open(SCRAPY_RESULT_FILE, "w"):
    pass

# Export every item returned by the spider as JSON into SCRAPY_RESULT_FILE.
runner = crawler.CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': SCRAPY_RESULT_FILE
})
runner.crawl(CustomSpider)
runner.start()

In [4]:
# Read the crawl output. The encoding belongs on open(): json.load() does not
# decode files itself and rejects an `encoding` keyword on Python 3.9+.
# The context manager also closes the file handle deterministically.
with open(SCRAPY_RESULT_FILE, encoding='utf-8') as result_file:
    graph_json = json.load(result_file)
G = nx.DiGraph()

# URLs of all crawled pages; only links between these pages become edges.
nodes = {x["url"] for x in graph_json}
def get_edges():
    """Yield ``(source_url, target_url)`` for every link whose target was crawled."""
    for line in graph_json:
        source, targets = line["url"], line["links"]

        for target in targets:
            if target in nodes:
                yield source, target


G.add_edges_from(get_edges())

In [5]:
# Summary statistics of the link graph: each entry pairs the output template
# with a callable that computes the value when its line is printed.
graph_stats = (
    ("Number of nodes: %d", G.number_of_nodes),
    ("Number of edges: %d", G.number_of_edges),
    ("Max out degree: %d", lambda: max(degree for _node, degree in G.out_degree(nodes))),
    ("Max in degree: %d", lambda: max(degree for _node, degree in G.in_degree(nodes))),
)
for template, compute in graph_stats:
    print(template % compute())

Number of nodes: 9845
Number of edges: 40684
Max out degree: 21
Max in degree: 456


In [6]:
# PageRank score for every node of the link graph, keyed by URL.
pagerank = nx.pagerank(G)

pagerank_scores = pagerank.values()
highest_pagerank = max(pagerank_scores)
print("Max pagerank: %f" % highest_pagerank)
lowest_pagerank = min(pagerank_scores)
print("Min pagerank: %f" % lowest_pagerank)

Max pagerank: 0.033433
Min pagerank: 0.000017


In [7]:
# url -> (title, snippet) lookup used to render the result list.
docs = dict(
    (page["url"], (page["title"], page["snippet"]))
    for page in graph_json
)

# Ten best-ranked URLs, highest PageRank first (ties keep dict order).
top_urls = sorted(pagerank, key=pagerank.get, reverse=True)[:10]
search_result = [(top_url, pagerank[top_url]) for top_url in top_urls]

for url, rank in search_result:
    title, snippet = docs[url]
    # Bold magenta header line, followed by URL, snippet and a blank separator.
    print("\033[95m\033[1m", title, rank, "\033[0m")
    print(url, snippet, "", sep="\n")

[95m[1m Integrated Authority File 0.03343322123641936 [0m
https://en.wikipedia.org/wiki/Integrated_Authority_File
The Integrated Authority File (German: Gemeinsame Normdatei, also known as: Universal Authority File) or GND is an international authority file for the organisation of personal names, subject headings and corporate bodies from catalogues. It is used mainl...

[95m[1m IMDb 0.019211378948140727 [0m
https://en.wikipedia.org/wiki/IMDb
IMDb, also known as Internet Movie Database, is an online database of information related to world films, television programs, home videos and video games, and internet streams, including cast, production crew, personnel and fictional character biographie...

[95m[1m CBS 0.004939255254926932 [0m
https://en.wikipedia.org/wiki/CBS
CBS (an initialism of the network's former name, the Columbia Broadcasting System) is an American English language commercial broadcast television network that is a flagship property of CBS Corporation. The compan

In [8]:
# nx.hits returns a pair of score dicts; the statistics below use the second one.
hits = nx.hits(G)

second_scores = hits[1].values()
highest_hits = max(second_scores)
print("Max hits: %f" % highest_hits)
lowest_hits = min(second_scores)
print("Min hits: %f" % lowest_hits)

Max hits: 0.154157
Min hits: 0.000000


In [9]:
# url -> (title, snippet) lookup used to render the result list.
docs = dict(
    (page["url"], (page["title"], page["snippet"]))
    for page in graph_json
)

# Ten URLs with the highest score in the second HITS dict (ties keep dict order).
hits_scores = hits[1]
top_urls = sorted(hits_scores, key=hits_scores.get, reverse=True)[:10]
search_result = [(top_url, hits_scores[top_url]) for top_url in top_urls]

for url, rank in search_result:
    title, snippet = docs[url]
    # Bold magenta header line, followed by URL, snippet and a blank separator.
    print("\033[95m\033[1m", title, rank, "\033[0m")
    print(url, snippet, "", sep="\n")

[95m[1m Integrated Authority File 0.1541569303170916 [0m
https://en.wikipedia.org/wiki/Integrated_Authority_File
The Integrated Authority File (German: Gemeinsame Normdatei, also known as: Universal Authority File) or GND is an international authority file for the organisation of personal names, subject headings and corporate bodies from catalogues. It is used mainl...

[95m[1m IMDb 0.058465548616940394 [0m
https://en.wikipedia.org/wiki/IMDb
IMDb, also known as Internet Movie Database, is an online database of information related to world films, television programs, home videos and video games, and internet streams, including cast, production crew, personnel and fictional character biographie...

[95m[1m Bibsys 0.028533962142480322 [0m
https://en.wikipedia.org/wiki/BIBSYS
BIBSYS is an administrative agency set up and organized by the Ministry of Education and Research in Norway. They are a service provider, focusing on the exchange, storage and retrieval of data pertaining to 