In [1]:
import json
import logging
import os

import networkx as nx
import scrapy
import scrapy.crawler as crawler
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

logging.getLogger('scrapy').propagate = False

In [2]:
class CustomSpider(CrawlSpider):
    """Crawl Wikipedia article pages and record each page's outgoing links.

    Starting from a handful of seed articles, the spider follows links found
    inside the ``mw-content-text`` container and, for every followed page,
    emits one item of the form ``{'url': <page url>, 'links': [<link urls>]}``.
    """

    name = "custom_spider"

    allowed_domains = ["wikipedia.org"]
    start_urls = [
        "https://en.wikipedia.org/wiki/Information_retrieval",
        "https://en.wikipedia.org/wiki/Mein_Kampf",
        "https://en.wikipedia.org/wiki/Soviet_Union",
        "https://en.wikipedia.org/wiki/Nineteen_Eighty-Four",
        "https://en.wikipedia.org/wiki/The_Hero_with_a_Thousand_Faces",
    ]
    rules = (
        Rule(
            LinkExtractor(
                # Raw strings keep the regex backslashes literal; a plain
                # string with "\." is an invalid escape sequence in Python.
                # The negative lookahead skips namespaced pages (File:, Talk:,
                # Category:, ...) and the Main_Page.
                allow=r"https://.+\.wikipedia\.org/wiki/"
                      r"(?!((File|Talk|Category|Portal|Special|Wikipedia|Help|Draft):|Main_Page)).+",
                # Only look for links inside the article content container.
                restrict_xpaths='(//div[@id="mw-content-text"][position() < 100])',
            ),
            callback="parse_item",
            follow=True,
        ),
    )

    custom_settings = {
        # Stop the crawl after this many responses have been downloaded.
        "CLOSESPIDER_PAGECOUNT": 50,
        # NOTE(review): 0 is documented by Scrapy as "do not close on errors".
        "CLOSESPIDER_ERRORCOUNT": 0,
        "CONCURRENT_REQUESTS": 16
    }

    def parse_item(self, response):
        """Build the exported item for one crawled page.

        Args:
            response: the downloaded page handed over by the crawl rule.

        Returns:
            dict: ``'url'`` holds the page URL and ``'links'`` the URLs
            extracted from the page by the link extractors of ``rules``.
        """
        # Use the public ``rules`` attribute rather than the private compiled
        # copies in ``_rules``; this reads the link extractors declared above.
        links = [
            link.url
            for rule in self.rules
            for link in rule.link_extractor.extract_links(response)
        ]
        return {'url': response.url, 'links': links}

In [3]:
# Scrapy's local-file feed storage appends to an existing file by default, so
# re-running this cell would write a second JSON array into result.json and
# make it unparseable. Remove any stale output before crawling.
RESULT_PATH = 'result.json'
if os.path.exists(RESULT_PATH):
    os.remove(RESULT_PATH)

# Run the spider in-process and export the scraped items as JSON.
runner = crawler.CrawlerProcess({
    'FEED_FORMAT': 'json',
    'FEED_URI': RESULT_PATH
})
runner.crawl(CustomSpider)
# NOTE(review): start() blocks until the crawl finishes; the Twisted reactor
# cannot be restarted, so a kernel restart is presumably needed to re-run.
runner.start()

In [5]:
# Load the crawl output: a list of {"url": ..., "links": [...]} records.
# The context manager closes the file handle as soon as parsing is done.
with open("result.json", encoding="utf-8") as result_file:
    graph_json = json.load(result_file)

# Undirected link graph; vertices are the integer ids stored in `nodes`.
G = nx.Graph()
# Maps every URL seen so far to its integer node id.
nodes = {}

def get_edges(records=None, node_ids=None):
    """Yield ``(source_id, target_id)`` pairs for every crawled link.

    URLs are mapped to consecutive integer ids in order of first appearance.
    The next id is always derived from the size of the mapping, so calling
    the function again with an already populated mapping never hands out an
    id that is in use.

    Args:
        records: iterable of ``{"url": str, "links": [str, ...]}`` dicts.
            Defaults to the module-level ``graph_json``.
        node_ids: dict mapping URL -> integer id, updated in place.
            Defaults to the module-level ``nodes``.

    Yields:
        tuple[int, int]: ids of the linking page and of the linked page.
    """
    if records is None:
        records = graph_json
    if node_ids is None:
        node_ids = nodes

    for record in records:
        source, targets = record["url"], record["links"]
        # Register the source even when it has no outgoing links.
        source_id = node_ids.setdefault(source, len(node_ids))

        for target in targets:
            # len(node_ids) is evaluated before insertion -> next free id.
            target_id = node_ids.setdefault(target, len(node_ids))
            yield source_id, target_id


# Insert every (source_id, target_id) pair from the crawl into the graph.
for source_id, target_id in get_edges():
    G.add_edge(source_id, target_id)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.figure(figsize=(20,10))
nx.draw(G, with_labels=True, pos=nx.kamada_kawai_layout(G))