In [1]:
# Scrapy implementation

import scrapy # Web Crawling and Web scraping modules
import logging # 
import pandas as pd # Dataframe processing
from scrapy.crawler import CrawlerProcess # A class to run multiple scrapy crawlers in a process simultaneously
import json # setting pipeline
import time

In [2]:
class JsonWriterPipeline(object):
    """
    To write extracted data to a JSON-line file.
    """

    def open_spider(self, spider):
        self.file = open('result.jl', 'w')

    def close_spider(self, spider):
        self.file.close()

    def process_item(self, item, spider):
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

In [3]:
class StudiosSpider(scrapy.Spider):
    name = "studios"
    allowed_domains = ["myanimelist.net"]
    start_urls = ["https://myanimelist.net/anime/producer"]
    custom_settings = {
        'LOG_LEVEL': logging.WARNING, #Logging level set to warning to avoid overload with DEBUG messages about the retrieved data
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1}, # Used for pipeline 1
        #'FEED_FORMAT':'json',                                 # Used for pipeline 2
        #'FEED_URI': 'result.json'                        # Used for pipeline 2
    }
    
    def parse(self, response):
        """
        Crawls the initial URL, that is where all producers are listed.
        """
        for href in response.xpath("//div[@class='genre-list al']/a/@href").extract():
            url = response.urljoin(href)
            #time.sleep(4) #only request pages every n seconds
            #req.meta['proxy'] = "http://yourproxy.com:80"
            time.sleep(4)
            req = scrapy.Request(url, callback=self.check_subpages)
            yield req
                
    def check_subpages(self, response):
        """
        Checks if there are subpages for producers with more than 100 anime productions
        to crawl through subpages.
        """
        subpages = response.xpath('//div[@class="pagination ac"]/a/@href').getall()
        if subpages:
            for subpage in subpages:
                next_page = response.urljoin(subpage)
                req = scrapy.Request(next_page, callback=self.parse_titles)
                yield req
        else:
            req = scrapy.Request(response.url, callback=self.parse_titles, dont_filter=True)
            yield req
            
           
    def parse_titles(self, response):
        """
        Get data of anime titles per studio, one title per record object.
        """
        for div in response.xpath('//div[@class="seasonal-anime js-seasonal-anime"]'):
            data = {}
            data['studio'] = response.xpath("//span[@class='di-ib mt4']/text()").get()
            data['title'] = div.xpath(".//p[@class='title-text']/a/text()").get()
            data['genre'] = div.xpath(".//div[@class='genres-inner js-genre-inner']/span/a/text()").getall()
            yield data 
        
                
                
    """     
    def parse_titles(self, response):
    """
    #Get data of anime titles per studio, all titles per record object.
    """
        for sel in response.css('html').getall():
        #for title in response.xpath("//p[@class='title-text']/a/text()").getall():
            data = {}
            data['studio'] = response.xpath("//span[@class='di-ib mt4']/text()").get()
            data['title'] = response.xpath("//p[@class='title-text']/a/text()").getall()
            #data['genre'] = response.xpath("//span[@class='genre']/a/text()").getall()
        yield data
    
    """

In [4]:
process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/78.0.1' #add user agents to avoid getting banned
})

process.crawl(StudiosSpider)
process.start()

2020-08-02 02:53:00 [scrapy.utils.log] INFO: Scrapy 2.2.0 started (bot: scrapybot)
2020-08-02 02:53:00 [scrapy.utils.log] INFO: Versions: lxml 4.5.1.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.8.1 (default, Jan  8 2020, 22:29:32) - [GCC 7.3.0], pyOpenSSL 19.1.0 (OpenSSL 1.1.1g  21 Apr 2020), cryptography 2.9.2, Platform Linux-5.4.0-42-generic-x86_64-with-glibc2.10
2020-08-02 02:53:00 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2020-08-02 02:53:00 [scrapy.crawler] INFO: Overridden settings:
{'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/78.0.1'}


For around two hours of execution with time.sleep(4)

In [5]:
!tail -n 2 result.jl #show two last records in the file

{"studio": "Viz Media ", "title": "K: Seven Stories Movie 1 - R:B - Blaze", "genre": ["Action", "Super Power", "Supernatural", "Drama"]}
{"studio": "Viz Media ", "title": "Ayashi no Ceres", "genre": ["Adventure", "Comedy", "Horror", "Psychological", "Supernatural", "Drama", "Romance", "Shoujo"]}


In [6]:
ll result.* #check that the file exists in the directory

-rwxrwxrwx 1 alejandro 2830739 ago  2 00:55 [0m[01;32mresult.jl[0m*
-rw-rw-r-- 1 alejandro       0 jul 30 22:51 result.json
