In [17]:
import json
import logging
import os

import networkx as nx
import scrapy
import scrapy.crawler as crawler
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# Stop records emitted on the 'scrapy' logger from propagating to ancestor
# loggers, which keeps the crawl from flooding the notebook output.
logging.getLogger('scrapy').propagate = False

In [2]:
class CustomLinkExtractor(LinkExtractor):
    """Link extractor that pairs every extracted link with its source page.

    Instead of the bare link objects produced by the parent class,
    ``extract_links`` returns ``(link, source_url)`` tuples, where
    ``source_url`` is the URL of the response the link was found in.
    Consumers must therefore unpack the tuple before using the link.
    """

    def extract_links(self, response):
        """Return a list of ``(link, response.url)`` tuples for ``response``."""
        page_url = response.url
        pairs = []
        for found in super().extract_links(response):
            pairs.append((found, page_url))
        return pairs


class CustomSpider(CrawlSpider):
    """Crawl Wikipedia article pages and record which page linked to which.

    Starting from ``start_urls``, the spider follows links extracted by
    ``CustomLinkExtractor`` and emits one item per followed page of the form
    ``{'url': <page url>, 'parent_url': <url of the page that linked to it>}``.
    """

    name = "custom_spider"

    allowed_domains = ["wikipedia.org"]
    start_urls = [
        "https://en.wikipedia.org/wiki/Information_retrieval",
        "https://en.wikipedia.org/wiki/Mein_Kampf",
        "https://en.wikipedia.org/wiki/Soviet_Union",
        "https://en.wikipedia.org/wiki/Nineteen_Eighty-Four",
        "https://en.wikipedia.org/wiki/The_Hero_with_a_Thousand_Faces",
    ]
    rules = (
        Rule(
            CustomLinkExtractor(
                # Raw strings: the pattern contains ``\.``, which is an invalid
                # escape sequence in a normal string literal. The negative
                # lookahead skips namespaced pages (File:, Talk:, ...) and
                # Main_Page.
                allow=r"https://.+\.wikipedia\.org/wiki/"
                      r"(?!((File|Talk|Category|Portal|Special|Wikipedia|Help|Draft):|Main_Page)).+",
                # Only look for links inside the element with id "mw-content-text".
                restrict_xpaths='(//div[@id="mw-content-text"][position() < 100])',
            ),
            callback="parse_item",
            follow=True,
        ),
    )

    # Per-spider Scrapy settings: page-count limit for closing the spider,
    # error-count limit (0), and the number of concurrent requests.
    custom_settings = {
        "CLOSESPIDER_PAGECOUNT": 1000,
        "CLOSESPIDER_ERRORCOUNT": 0,
        "CONCURRENT_REQUESTS": 16
    }

    def _build_request(self, rule, link):
        """Build the follow-up request and remember the page it came from.

        ``link`` is a ``(link, parent_url)`` tuple as produced by
        ``CustomLinkExtractor.extract_links``; the first element is handed to
        the parent implementation and the second is stored in the request's
        ``meta`` under ``"parent_url"``.
        """
        link_obj, parent_url = link
        request = super()._build_request(rule, link_obj)
        request.meta["parent_url"] = parent_url
        return request

    def parse_item(self, response):
        """Return the ``{'url', 'parent_url'}`` item for a crawled page.

        ``parent_url`` falls back to an empty string when the request carries
        no ``"parent_url"`` entry in its ``meta``.
        """
        parent_url = response.request.meta.get("parent_url", "")
        return {'url': response.url, 'parent_url': parent_url}

In [3]:
# Scrapy's local-file feed storage appends to an existing file by default, so
# a second crawl would leave two concatenated JSON arrays in result.json and
# break the json.load() in the graph-building cell. Start from a clean file.
if os.path.exists('result.json'):
    os.remove('result.json')

# Run the spider in-process and export every scraped item to result.json.
runner = crawler.CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': 'result.json'
})
runner.crawl(CustomSpider)
# NOTE(review): start() runs the Twisted reactor, which presumably cannot be
# restarted in the same kernel -- restart the kernel before re-running this cell.
runner.start()

In [24]:
# Load the crawl output; the context manager closes the file handle promptly.
with open("result.json", encoding="utf-8") as result_file:
    graph_json = json.load(result_file)

G = nx.Graph()
# Maps each URL to the integer id used as its node in G.
nodes = {}


def get_edges():
    """Yield one ``(source_id, target_id)`` pair per record in ``graph_json``.

    Each record's ``"parent_url"`` is the source and its ``"url"`` the target.
    URLs are assigned consecutive integer ids in order of first appearance,
    and the assignment is stored in the module-level ``nodes`` dict.  The next
    id is derived from ``len(nodes)``, so calling the generator again cannot
    hand out an id that is already taken.
    """
    for edge in graph_json:
        source, target = edge["parent_url"], edge["url"]
        # setdefault only inserts when the URL is new; len(nodes) is the next
        # unused id because ids are dense and start at 0.
        nodes.setdefault(source, len(nodes))
        nodes.setdefault(target, len(nodes))

        yield nodes[source], nodes[target]


G.add_edges_from(get_edges())

In [None]:
%matplotlib inline
nx.draw(G, with_labels=True, font_weight='bold')