This notebook downloads piano MIDI files from the Mutopia Project site and records traceability information about each downloaded file in a CSV file (`midi_info.csv`).

In [26]:
import glob
import os
import re
from urllib.parse import urlparse

from crochet import setup, wait_for
from scrapy import Selector, Spider, Request
from scrapy.crawler import CrawlerRunner
from scrapy.pipelines.files import FilesPipeline

# Initialise crochet once, before any @wait_for-decorated function is called.
# NOTE(review): per crochet's documentation this starts the Twisted reactor in a
# background thread so the crawl can be driven from notebook cells -- confirm.
setup()

In [27]:
class MIDIDownloadPipeline(FilesPipeline):
    """Files pipeline that names each download after the item's ``name`` field.

    The stored path is ``<SANITISED_NAME><ext>``, where the name is reduced to
    ``A-Z``, ``0-9`` and ``_`` and the extension is taken from the request URL.

    Caveat: the path depends only on the sanitised name and the extension, so
    two items whose names sanitise to the same string map to the same path.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the path (relative to the files store) for ``request``.

        Args:
            request: the download request; its URL supplies the file extension.
            response: forwarded unchanged to the parent implementation on fallback.
            info: forwarded unchanged to the parent implementation on fallback.
            item: the scraped item; its ``'name'`` value supplies the file name.

        Returns:
            ``f'{sanitised_name}{ext}'`` when a usable name exists; otherwise
            the base name of the parent pipeline's default path.
        """
        # Use only the URL *path* so a query string or fragment cannot leak
        # into the extension (e.g. ".mid?download=1").
        media_ext = os.path.splitext(urlparse(request.url).path)[1]

        raw_name = item.get('name') if item is not None else None
        media_name = self.remove_non_ascii(raw_name)
        if not media_name:
            # No item, no name, or a name made up solely of stripped characters:
            # previously this raised or produced a bare-extension file name.
            # Fall back to the parent's default name, flattened with basename()
            # so the download directory keeps containing plain files only.
            default_path = super().file_path(
                request, response=response, info=info, item=item
            )
            return os.path.basename(default_path)
        return f'{media_name}{media_ext}'

    def remove_non_ascii(self, string):
        """Upper-case ``string``, turn spaces into ``_`` and drop everything else
        that is not ``A-Z``, ``0-9`` or ``_``.

        Despite its name this strips punctuation as well as non-ASCII characters.
        A ``None`` or empty input yields ``''``.
        """
        if not string:
            return ''
        normalised = string.upper().replace(" ", "_")
        return ''.join(
            ch for ch in normalised
            if '0' <= ch <= '9' or 'A' <= ch <= 'Z' or ch == '_'
        )

In [28]:
class MIDIDownloadAndTrack(Spider):
    """Spider that walks the Mutopia piano listing, yields one item per piece
    that has a ``.mid`` link, and follows the "Next 10" pagination link.

    Per ``custom_settings``, items go through ``MIDIDownloadPipeline`` (files
    stored under ``FILES_STORE``) and are exported to ``midi_info.csv``, which
    is overwritten on each run.
    """
    name = "MIDI Download and Track in CSV"
    start_urls = ["https://www.mutopiaproject.org/cgibin/make-table.cgi?Instrument=Piano"]
    custom_settings = {
        'ITEM_PIPELINES': {
            '__main__.MIDIDownloadPipeline': 1,
        },
        'FILES_STORE': '../midi/downloaded_midi',
        'FEEDS': {
            "midi_info.csv": {
                'format': "csv",
                'overwrite': True,
            },
        },
    }

    def parse(self, response):
        """Parse one listing page.

        Yields:
            A dict with ``name``, ``author``, ``instruments``, ``domain`` and
            ``file_urls`` for every result table that links to a ``.mid`` file,
            followed by a ``Request`` for the next page when one is linked.
        """
        # The page number is derived from the ``startat`` offset in the URL
        # (10 results per page); a URL without a numeric offset is page 1.
        offset = re.search('startat=(\\d+)', response.url)
        page_cnt = int(offset.group(1)) // 10 + 1 if offset else 1
        print(f"Parsing page {page_cnt}...")

        for midi in response.xpath("//table[contains(@class, 'result-table')]").getall():
            cells = Selector(text=midi).xpath("//td").getall()

            # First link found in the first cell mentioning ".mid". Entries with
            # no such cell are skipped instead of raising IndexError, which
            # previously aborted the rest of the page and the pagination request.
            midi_url = next(
                (Selector(text=cell).xpath("//@href").get() for cell in cells if ".mid" in cell),
                None,
            )
            if not midi_url or not re.search('\\.mid$', midi_url):
                continue

            midi_info = [Selector(text=cell).xpath("//text()").get() for cell in cells]
            if len(midi_info) < 10:
                # The fixed cell positions used below need at least 10 cells.
                self.logger.warning(
                    "Skipping entry with unexpected layout (%d cells): %s",
                    len(midi_info), midi_url,
                )
                continue

            yield {
                "name": midi_info[0],
                # Drops the first 3 characters (presumably a "by " prefix -- confirm).
                "author": (midi_info[1] or "")[3:],
                # Drops the first 4 characters (presumably a "for " prefix -- confirm).
                "instruments": (midi_info[4] or "")[4:],
                "domain": midi_info[9],
                "file_urls": [midi_url]
                }

        next_page_url = response.xpath("//a[contains(text(), 'Next 10')]/@href").get()
        if next_page_url:
            yield Request(response.urljoin(next_page_url), self.parse)

In [29]:
@wait_for(5 * 60)
def run_spider():
    """Start a crawl of ``MIDIDownloadAndTrack`` on a fresh ``CrawlerRunner``.

    Returns whatever ``CrawlerRunner.crawl`` returns; the ``wait_for``
    decorator receives 300 (seconds, per crochet's documentation) as its
    timeout argument.
    """
    return CrawlerRunner().crawl(MIDIDownloadAndTrack)

In [30]:
# Empty the download directory so the run starts from a clean slate, then crawl.
# The directory is read from the spider's settings so the two cannot drift apart.
download_dir = MIDIDownloadAndTrack.custom_settings['FILES_STORE']
files = glob.glob(os.path.join(download_dir, '*'))
for path in files:
    # os.remove() raises on directories, so only plain files are deleted.
    if os.path.isfile(path):
        os.remove(path)
run_spider()

Parsing page 1...
Parsing page 2...
Parsing page 3...
Parsing page 4...
Parsing page 5...
Parsing page 6...
Parsing page 7...
Parsing page 8...
Parsing page 9...
Parsing page 10...
Parsing page 11...
Parsing page 12...
Parsing page 13...
Parsing page 14...
Parsing page 15...
Parsing page 16...
Parsing page 17...
Parsing page 18...
Parsing page 19...
Parsing page 20...
Parsing page 21...
Parsing page 22...
Parsing page 23...
Parsing page 24...
Parsing page 25...
Parsing page 26...
Parsing page 27...
Parsing page 28...
Parsing page 29...
Parsing page 30...
Parsing page 31...
Parsing page 32...
Parsing page 33...
Parsing page 34...
Parsing page 35...
Parsing page 36...
Parsing page 37...
Parsing page 38...
Parsing page 39...
Parsing page 40...
Parsing page 41...
Parsing page 42...
Parsing page 43...
Parsing page 44...
Parsing page 45...
Parsing page 46...
Parsing page 47...
Parsing page 48...
Parsing page 49...
Parsing page 50...
Parsing page 51...
Parsing page 52...
Parsing page 53...
Pa