## MuseScore Scraper

Tool: https://github.com/frankye8998/MusicalMusic  

[Reference 2](https://github.com/codeandproduce/music_research_dataset_midi/blob/9a5afe86f1f2d3fdbfa24de99122140e7a7cd60c/MuseScore/MuseScore%20Scraping%20Main.ipynb)

In [1]:
from lxml import html
import urllib
import requests
import shutil

In [2]:
from random import randint
from time import sleep

In [3]:
import json
from pathlib import Path
from tqdm import tqdm
import time

In [4]:
# links = []
# url = "https://musescore.com/hub/piano/solo-piano?sort=view_count"
# # url = "https://musescore.com/hub/piano/piano-duet?sort=view_count"
# page = requests.get(url)
# tree = html.fromstring(page.content)
# print(tree.xpath("//title/text()"))

In [5]:
# articles = tree.xpath("//article[@role='article']")

In [6]:
def get_data(a):
    score = a.find("h2//a[@rel='bookmark']")
    link = score.attrib['href']
    title = score.text_content().strip()
    author = a.find("div[@class='user']//a").text_content()
    metadata = a.find("div[@class='meta']").text_content().split('•\n')
    parts, pages, duration, time, views = [m.strip() for m in metadata]
    return {
        'link': "https://musescore.com" + link,
        'score_id': Path(link).name,
        'title': title,
        'author': author,
        'parts': parts,
        'pages': pages,
        'duration': duration,
        'views': views
    }

## But that's actually just page one and there are pages 2,3,4,5... 100!
### So let's make a more general format:

In [8]:
# url_default = "https://musescore.com/hub/video_games/movie?sort=view_count&page=" # + the page number
url_default = 'https://musescore.com/hub/piano/easy-piano?sort=view_count&page='
# url_default = "https://musescore.com/hub/piano/solo-piano?sort=view_count&page=" # + the page number
# url_default = "https://musescore.com/hub/piano/voice-piano?sort=view_count&page=" # + the page number
# url_default = "https://musescore.com/hub/piano?sort=view_count&page=" # + the page number

In [9]:
json_file = Path('musescore_piano_easy.json')

In [10]:
if json_file.exists():
    with open(json_file, 'r') as fp:
        links = json.load(fp)
        

In [11]:
if not json_file.exists():
    links = []
    for page in tqdm(range(1,100), total=100):
        page = requests.get(url_default+str(page)) # https://musescore.com/hub/piano?page=1,2,3,4,5,...,10
        tree = html.fromstring(page.content)

        articles = tree.xpath("//article[@role='article']")

        links.extend([get_data(a) for a in articles])

#         sleep(randint(1,4))
    with open(json_file, 'w') as fp:
        json.dump(links, fp)
print(links[:100])

 99%|█████████▉| 99/100 [01:07<00:00,  1.80it/s]

[{'link': 'https://musescore.com/user/158751/scores/2163051', 'score_id': '2163051', 'title': 'Game of Thrones, Easy piano', 'author': 'lucky37', 'parts': '1 part', 'pages': '3 pages', 'duration': '01:39', 'views': '448,392 views'}, {'link': 'https://musescore.com/user/2466621/scores/2100881', 'score_id': '2100881', 'title': 'Requiem for a Dream (Easy)', 'author': 'Torby Brand', 'parts': '1 part', 'pages': '2 pages', 'duration': '01:48', 'views': '137,403 views'}, {'link': 'https://musescore.com/user/13543696/scores/4836661', 'score_id': '4836661', 'title': 'Someone Like You (easy piano)', 'author': 'matikavi11869', 'parts': '1 part', 'pages': '6 pages', 'duration': '04:48', 'views': '127,567 views'}, {'link': 'https://musescore.com/user/2500616/scores/2311476', 'score_id': '2311476', 'title': 'Star Wars Theme - easy piano', 'author': 'Erin Keely Whitworth Machado', 'parts': '1 part', 'pages': '1 page', 'duration': '00:58', 'views': '126,604 views'}, {'link': 'https://musescore.com/use




# Scrape links found

### Musical Music Lib

In [12]:

class MuseScoreException(Exception):
    pass

class InvalidFileExtension(MuseScoreException):
    pass

class InvalidScoreID(MuseScoreException):
    pass

class InvalidCredentials(MuseScoreException):
    pass

class InvalidSearchSort(MuseScoreException):
    pass

In [13]:
import urllib
import urllib.request


import bs4
import requests

class MusicalMusic:
    """Musescore actions requiring an account."""

    def __init__(self, username, password):
        self.username = username
        url = "https://musescore.com/user/login"
        r = requests.get(url)
        soup = bs4.BeautifulSoup(r.text, "html.parser")
        csrf = soup.find("meta", {"name": "csrf-token"})["content"]
        url = "https://musescore.com/user/auth/login/process"
        cookies = {
            "mu_browser_uni": r.cookies['mu_browser_uni'],
            "_csrf": r.cookies["_csrf"]
        }
        data = {
          "username": username,
          "password": password,
          "_csrf": csrf,
          "op": "Log in"
        }
        try:
            mu_user = requests.post(url,
                                    data=data,
                                    cookies=cookies,
                                    allow_redirects=False,
                                    ).cookies["mu_user_new"]
        except KeyError as e:
            raise InvalidCredentials(
                "Please check your username and password!") from e

        mu_browser_uni = r.cookies['mu_browser_uni']

        self.mu_browser_uni = mu_browser_uni
        self.mu_user = mu_user

    def retrieve(self, id, format="pdf"):
        """Retrieves Musescore data in bytes"""

        if format not in ["mp3", "pdf", "mid", "mxl", "mscz"]:
            raise InvalidFileExtension("Must be mp3, pdf, mid, mxl, or mscz.")
        newlink = f"https://musescore.com/score/{id}/download/{format}"
        cookies = {"mu_browser_uni": self.mu_browser_uni,
                   "mu_user_new": self.mu_user}
        bytes = requests.get(newlink, cookies=cookies, verify=False)
        if bytes.status_code != 200:
            raise InvalidScoreID(str(bytes.status_code))
        return bytes.content

    def download(self, id, filename, format="mp3", proxy=None):
        if format not in ["mp3", "pdf", "mid", "mxl", "mscz"]:
            raise InvalidFileExtension("Must be mp3, pdf, mid, mxl, or mscz.")
        newlink = f"https://musescore.com/score/{id}/download/{format}"
        if proxy:
            proxy_support = urllib.request.ProxyHandler({'https' : proxy})
            opener = urllib.request.build_opener(proxy_support)
        else: opener = urllib.request.build_opener()
            
        cookieString = f"mu_browser_uni={self.mu_browser_uni};" \
                        f"mu_user_new={self.mu_user}"
        opener.addheaders = [("cookie", cookieString)]
        urllib.request.install_opener(opener)
        try:
            urllib.request.urlretrieve(newlink, filename)
        except urllib.error.HTTPError as e:
            raise e


### Trying proxy

https://codelike.pro/create-a-crawler-with-rotating-ip-proxy-in-python/

In [14]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import random

ua = UserAgent() # From here we generate a random user agent
proxies = [] # Will contain proxies [ip, port]

In [15]:
# Main function
  # Retrieve latest proxies
proxies_req = Request('https://www.sslproxies.org/')
proxies_req.add_header('User-Agent', ua.random)
proxies_doc = urlopen(proxies_req).read().decode('utf8')

soup = BeautifulSoup(proxies_doc, 'html.parser')
proxies_table = soup.find(id='proxylisttable')

# Save proxies in the array
for row in proxies_table.tbody.find_all('tr'):
    proxies.append({
      'ip':   row.find_all('td')[0].string,
      'port': row.find_all('td')[1].string
    })

### More proxies

https://github.com/constverum/ProxyBroker

In [16]:
import asyncio
from proxybroker import Broker

more_proxies = []

async def show(proxy_queue):
    while True:
        proxy = await proxy_queue.get()
        if proxy is None: break
        print('Found proxy: %s' % proxy)
        more_proxies.append({ 'ip': proxy.host, 'port': proxy.port })

proxy_queue = asyncio.Queue()
broker = Broker(proxy_queue)
tasks = await asyncio.gather(
    broker.find(types=['HTTPS'], limit=200),
    show(proxy_queue))

Found proxy: <Proxy FR 0.15s [HTTPS] 54.39.97.250:3128>
Found proxy: <Proxy EC 0.28s [HTTPS] 186.46.220.117:80>
Found proxy: <Proxy MX 0.32s [HTTPS] 187.216.93.20:53281>
Found proxy: <Proxy US 0.33s [HTTPS] 173.249.0.209:3128>
Found proxy: <Proxy DE 0.35s [HTTPS] 94.130.126.94:8008>
Found proxy: <Proxy JP 0.36s [HTTPS] 160.16.52.185:3128>
Found proxy: <Proxy SK 0.38s [HTTPS] 185.152.112.18:37900>
Found proxy: <Proxy RU 0.36s [HTTPS] 94.242.59.245:1448>
Found proxy: <Proxy PL 0.41s [HTTPS] 78.11.118.157:3128>
Found proxy: <Proxy RU 0.38s [HTTPS] 94.242.59.135:1448>
Found proxy: <Proxy RU 0.39s [HTTPS] 82.138.23.141:3128>
Found proxy: <Proxy BR 0.40s [HTTPS] 200.255.122.170:8080>
Found proxy: <Proxy SE 0.43s [HTTPS] 46.246.38.90:3128>
Found proxy: <Proxy RU 0.44s [HTTPS] 185.41.112.29:57190>
Found proxy: <Proxy PL 0.45s [HTTPS] 91.218.63.196:53281>
Found proxy: <Proxy -- 0.45s [HTTPS] 85.209.163.68:8080>
Found proxy: <Proxy US 0.48s [HTTPS] 68.183.180.184:8080>
Found proxy: <Proxy UA 0.4

Found proxy: <Proxy BR 0.61s [HTTPS] 181.191.180.110:8080>
Found proxy: <Proxy NL 0.28s [HTTPS] 176.56.236.158:3128>
Found proxy: <Proxy ES 2.39s [HTTPS] 178.60.28.98:9999>
Found proxy: <Proxy MX 2.41s [HTTPS] 200.66.94.147:8080>
Found proxy: <Proxy PL 0.27s [HTTPS] 145.239.87.173:3128>
Found proxy: <Proxy ID 0.48s [HTTPS] 139.255.25.83:3128>
Found proxy: <Proxy -- 0.39s [HTTPS] 45.71.38.77:999>
Found proxy: <Proxy CZ 2.50s [HTTPS] 85.207.44.10:53038>
Found proxy: <Proxy ID 0.66s [HTTPS] 103.228.117.244:8080>
Found proxy: <Proxy BR 0.69s [HTTPS] 138.219.223.166:3128>
Found proxy: <Proxy IN 0.98s [HTTPS] 45.250.226.10:8080>
Found proxy: <Proxy RU 2.15s [HTTPS] 176.196.84.138:51336>
Found proxy: <Proxy TH 0.59s [HTTPS] 202.28.17.5:8080>
Found proxy: <Proxy TH 2.58s [HTTPS] 180.180.156.35:37463>
Found proxy: <Proxy RU 1.72s [HTTPS] 185.5.19.234:52975>
Found proxy: <Proxy ID 1.45s [HTTPS] 103.76.15.138:47847>
Found proxy: <Proxy BY 0.38s [HTTPS] 86.57.219.179:23500>
Found proxy: <Proxy FR 

### Random proxy

In [17]:
all_proxies = proxies + more_proxies
# all_proxies = more_proxies
# Retrieve a random index proxy (we need the index to delete it if not working)
def random_proxy():
    return random.randint(0, len(all_proxies) - 1)

proxy = None
# # Choose a random proxy
# proxy_index = random_proxy()
# proxy = all_proxies[proxy_index]
# proxy_url = proxy['ip'] + ':' + str(proxy['port']); proxy_url

## Actual scraping

In [18]:
invalid_ids = []

In [19]:
deleted_proxies = []

In [20]:
with open('accounts.json', 'r') as fp:
    accounts = json.load(fp)

instances = [MusicalMusic(username, password) for (username, password) in accounts]
instance = None

In [37]:
for idx,link in enumerate(tqdm(links, total=len(links))):
    score_id = link['score_id']
    out_file = f"data/{score_id}.mxl"
    if Path(out_file).exists() or score_id in invalid_ids: continue
    # Every 10 requests, generate a new proxy
    if idx % 10 == 0 or instance is None:
        instance_index = random.randint(0, len(instances) - 1)
        instance = instances[instance_index]
    try:
        print('Downloading score id:', score_id)
        instance.download(score_id, out_file, format='mxl')
    except Exception as e:
        print('Could not download id:', score_id)
        instance = None
    sleep(randint(1,2))
    







  0%|          | 0/762 [00:00<?, ?it/s][A[A[A[A[A[A

Downloading score id: 2109666








 73%|███████▎  | 553/762 [00:02<00:00, 254.34it/s][A[A[A[A[A[A

Downloading score id: 1900301
Downloading score id: 1826561
Downloading score id: 5392562
Downloading score id: 1839176
Downloading score id: 5118647








 73%|███████▎  | 553/762 [00:13<00:00, 254.34it/s][A[A[A[A[A[A





 73%|███████▎  | 558/762 [00:16<02:50,  1.19it/s] [A[A[A[A[A[A

Downloading score id: 5006323








 73%|███████▎  | 559/762 [00:18<04:51,  1.43s/it][A[A[A[A[A[A

Downloading score id: 5007206








 73%|███████▎  | 560/762 [00:20<05:01,  1.49s/it][A[A[A[A[A[A

Downloading score id: 4928589








 74%|███████▎  | 561/762 [00:23<06:18,  1.88s/it][A[A[A[A[A[A

Downloading score id: 5158849








 74%|███████▍  | 562/762 [00:26<07:04,  2.12s/it][A[A[A[A[A[A

Downloading score id: 1573661








 74%|███████▍  | 563/762 [00:28<07:35,  2.29s/it][A[A[A[A[A[A

Downloading score id: 1071121








 74%|███████▍  | 564/762 [00:31<08:24,  2.55s/it][A[A[A[A[A[A

Downloading score id: 5363593








 74%|███████▍  | 565/762 [00:34<08:32,  2.60s/it][A[A[A[A[A[A

Downloading score id: 5458574








 74%|███████▍  | 566/762 [00:38<10:00,  3.06s/it][A[A[A[A[A[A

Downloading score id: 5172806








 74%|███████▍  | 567/762 [00:40<08:56,  2.75s/it][A[A[A[A[A[A

Downloading score id: 4418816








 75%|███████▍  | 568/762 [00:42<07:53,  2.44s/it][A[A[A[A[A[A

Downloading score id: 4362441








 75%|███████▍  | 569/762 [00:45<08:04,  2.51s/it][A[A[A[A[A[A

Downloading score id: 4878452








 75%|███████▍  | 570/762 [00:46<07:23,  2.31s/it][A[A[A[A[A[A

Downloading score id: 5250937








 75%|███████▍  | 571/762 [00:49<07:12,  2.27s/it][A[A[A[A[A[A

Downloading score id: 3494796








 75%|███████▌  | 572/762 [00:52<08:00,  2.53s/it][A[A[A[A[A[A

Downloading score id: 5239384








 75%|███████▌  | 573/762 [00:55<09:04,  2.88s/it][A[A[A[A[A[A

Downloading score id: 5494920








 75%|███████▌  | 574/762 [00:58<08:20,  2.66s/it][A[A[A[A[A[A

Downloading score id: 5181465








 75%|███████▌  | 575/762 [00:59<07:23,  2.37s/it][A[A[A[A[A[A

Downloading score id: 910916








 76%|███████▌  | 576/762 [01:01<06:40,  2.15s/it][A[A[A[A[A[A

Downloading score id: 5387107








 76%|███████▌  | 577/762 [01:05<08:03,  2.62s/it][A[A[A[A[A[A

Downloading score id: 4536821








 76%|███████▌  | 578/762 [01:06<07:06,  2.32s/it][A[A[A[A[A[A

Downloading score id: 1578381








 76%|███████▌  | 579/762 [01:08<06:54,  2.27s/it][A[A[A[A[A[A

Downloading score id: 2820751








 76%|███████▌  | 580/762 [01:10<06:19,  2.08s/it][A[A[A[A[A[A

Downloading score id: 3752566








 76%|███████▌  | 581/762 [01:13<06:49,  2.26s/it][A[A[A[A[A[A

Downloading score id: 2837776








 76%|███████▋  | 582/762 [01:14<06:12,  2.07s/it][A[A[A[A[A[A

Downloading score id: 5327585








 77%|███████▋  | 583/762 [01:16<05:48,  1.95s/it][A[A[A[A[A[A

Downloading score id: 1822636








 77%|███████▋  | 584/762 [01:19<06:24,  2.16s/it][A[A[A[A[A[A

Downloading score id: 851971








 77%|███████▋  | 585/762 [01:20<06:01,  2.04s/it][A[A[A[A[A[A

Downloading score id: 5500093








 77%|███████▋  | 586/762 [01:24<06:57,  2.37s/it][A[A[A[A[A[A

Downloading score id: 2911066








 77%|███████▋  | 588/762 [01:25<05:33,  1.92s/it][A[A[A[A[A[A

Downloading score id: 1379901








 77%|███████▋  | 589/762 [01:28<06:11,  2.15s/it][A[A[A[A[A[A

Downloading score id: 5352597








 78%|███████▊  | 591/762 [01:30<04:59,  1.75s/it][A[A[A[A[A[A

Downloading score id: 193837








 78%|███████▊  | 592/762 [01:32<05:45,  2.03s/it][A[A[A[A[A[A

Downloading score id: 5232646








 78%|███████▊  | 593/762 [01:34<05:24,  1.92s/it][A[A[A[A[A[A

Downloading score id: 4481046








 78%|███████▊  | 594/762 [01:36<05:08,  1.84s/it][A[A[A[A[A[A

Downloading score id: 5438400








 78%|███████▊  | 595/762 [01:37<04:57,  1.78s/it][A[A[A[A[A[A

Downloading score id: 4602211








 78%|███████▊  | 596/762 [01:39<04:47,  1.73s/it][A[A[A[A[A[A

Downloading score id: 5500100








 78%|███████▊  | 597/762 [01:43<06:20,  2.31s/it][A[A[A[A[A[A

Downloading score id: 2461436








 78%|███████▊  | 598/762 [01:46<06:59,  2.56s/it][A[A[A[A[A[A

Downloading score id: 5319389








 79%|███████▊  | 599/762 [01:48<06:35,  2.43s/it][A[A[A[A[A[A

Downloading score id: 191909








 79%|███████▊  | 600/762 [01:52<07:57,  2.94s/it][A[A[A[A[A[A

Downloading score id: 5452273








 79%|███████▉  | 601/762 [01:55<07:41,  2.87s/it][A[A[A[A[A[A

Downloading score id: 586036








 79%|███████▉  | 602/762 [01:57<07:04,  2.65s/it][A[A[A[A[A[A

Downloading score id: 4295881








 79%|███████▉  | 603/762 [01:59<06:20,  2.39s/it][A[A[A[A[A[A

Downloading score id: 5410643








 79%|███████▉  | 604/762 [02:00<05:43,  2.17s/it][A[A[A[A[A[A

Downloading score id: 4832971








 79%|███████▉  | 605/762 [02:02<05:15,  2.01s/it][A[A[A[A[A[A

Downloading score id: 4146106








 80%|███████▉  | 606/762 [02:05<05:43,  2.20s/it][A[A[A[A[A[A

Downloading score id: 4229476








 80%|███████▉  | 607/762 [02:07<05:39,  2.19s/it][A[A[A[A[A[A

Downloading score id: 1612716








 80%|███████▉  | 608/762 [02:10<06:22,  2.48s/it][A[A[A[A[A[A

Downloading score id: 4041641








 80%|███████▉  | 609/762 [02:15<08:29,  3.33s/it][A[A[A[A[A[A

Downloading score id: 2635441








 80%|████████  | 610/762 [02:18<07:53,  3.12s/it][A[A[A[A[A[A

Downloading score id: 3982866








 80%|████████  | 612/762 [02:19<06:05,  2.43s/it][A[A[A[A[A[A

Downloading score id: 2454721








 80%|████████  | 613/762 [02:22<06:12,  2.50s/it][A[A[A[A[A[A

Downloading score id: 5132885








 81%|████████  | 614/762 [02:24<05:53,  2.39s/it][A[A[A[A[A[A

Downloading score id: 5512267








 81%|████████  | 615/762 [02:28<06:49,  2.79s/it][A[A[A[A[A[A

Downloading score id: 3995671








 81%|████████  | 616/762 [02:30<06:17,  2.59s/it][A[A[A[A[A[A

Downloading score id: 1376411








 81%|████████  | 617/762 [02:33<06:37,  2.74s/it][A[A[A[A[A[A

Downloading score id: 667316








 81%|████████  | 618/762 [02:36<06:32,  2.72s/it][A[A[A[A[A[A

Downloading score id: 5365561








 81%|████████  | 619/762 [02:38<05:43,  2.40s/it][A[A[A[A[A[A

Downloading score id: 5392150








 81%|████████▏ | 620/762 [02:39<05:10,  2.19s/it][A[A[A[A[A[A

Downloading score id: 1581716








 81%|████████▏ | 621/762 [02:41<05:05,  2.16s/it][A[A[A[A[A[A

Downloading score id: 3703041








 82%|████████▏ | 622/762 [02:45<05:45,  2.46s/it][A[A[A[A[A[A

Downloading score id: 5484393








 82%|████████▏ | 623/762 [02:46<05:15,  2.27s/it][A[A[A[A[A[A

Downloading score id: 5275558








 82%|████████▏ | 624/762 [02:48<04:46,  2.08s/it][A[A[A[A[A[A

Downloading score id: 4165286








 82%|████████▏ | 625/762 [02:50<04:36,  2.02s/it][A[A[A[A[A[A

Downloading score id: 5425643








 82%|████████▏ | 626/762 [02:52<04:59,  2.20s/it][A[A[A[A[A[A

Downloading score id: 5484091








 82%|████████▏ | 627/762 [02:55<05:14,  2.33s/it][A[A[A[A[A[A

Downloading score id: 5370025








 82%|████████▏ | 628/762 [03:00<06:49,  3.06s/it][A[A[A[A[A[A

Downloading score id: 4808174








 83%|████████▎ | 629/762 [03:02<05:58,  2.70s/it][A[A[A[A[A[A

Downloading score id: 5117305








 83%|████████▎ | 630/762 [03:04<05:34,  2.54s/it][A[A[A[A[A[A

Downloading score id: 5180187








 83%|████████▎ | 631/762 [03:07<06:06,  2.80s/it][A[A[A[A[A[A

Downloading score id: 5350730








 83%|████████▎ | 632/762 [03:09<05:26,  2.51s/it][A[A[A[A[A[A

Downloading score id: 5151376








 83%|████████▎ | 634/762 [03:11<04:26,  2.08s/it][A[A[A[A[A[A

Downloading score id: 4128786








 83%|████████▎ | 635/762 [03:14<04:44,  2.24s/it][A[A[A[A[A[A

Downloading score id: 5176017








 83%|████████▎ | 636/762 [03:17<04:59,  2.38s/it][A[A[A[A[A[A

Downloading score id: 2450036








 84%|████████▎ | 637/762 [03:20<05:25,  2.61s/it][A[A[A[A[A[A

Downloading score id: 5173283








 84%|████████▎ | 638/762 [03:23<05:34,  2.70s/it][A[A[A[A[A[A

Downloading score id: 4832468








 84%|████████▍ | 639/762 [03:25<05:30,  2.68s/it][A[A[A[A[A[A

Downloading score id: 2869466








 84%|████████▍ | 640/762 [03:28<05:27,  2.68s/it][A[A[A[A[A[A

Downloading score id: 5116307








 84%|████████▍ | 641/762 [03:30<04:46,  2.36s/it][A[A[A[A[A[A

Downloading score id: 5361249








 84%|████████▍ | 642/762 [03:31<04:18,  2.16s/it][A[A[A[A[A[A

Downloading score id: 4122961








 84%|████████▍ | 643/762 [03:34<04:37,  2.33s/it][A[A[A[A[A[A

Downloading score id: 5304492








 85%|████████▍ | 644/762 [03:37<04:46,  2.42s/it][A[A[A[A[A[A

Downloading score id: 5474979








 85%|████████▍ | 645/762 [03:40<05:10,  2.66s/it][A[A[A[A[A[A

Downloading score id: 5196966








 85%|████████▍ | 646/762 [03:43<05:18,  2.74s/it][A[A[A[A[A[A

Downloading score id: 5483273








 85%|████████▍ | 647/762 [03:45<04:43,  2.46s/it][A[A[A[A[A[A

Downloading score id: 5483925








 85%|████████▌ | 648/762 [03:48<05:20,  2.81s/it][A[A[A[A[A[A

Downloading score id: 5179400








 85%|████████▌ | 649/762 [03:52<06:02,  3.20s/it][A[A[A[A[A[A

Downloading score id: 5032987








 85%|████████▌ | 650/762 [03:54<05:05,  2.73s/it][A[A[A[A[A[A

Downloading score id: 5440592








 85%|████████▌ | 651/762 [03:57<05:08,  2.78s/it][A[A[A[A[A[A

Downloading score id: 5306545








 86%|████████▌ | 652/762 [03:59<05:00,  2.74s/it][A[A[A[A[A[A

Downloading score id: 4986692








 86%|████████▌ | 653/762 [04:02<04:54,  2.71s/it][A[A[A[A[A[A

Downloading score id: 5185109








 86%|████████▌ | 655/762 [04:05<04:04,  2.29s/it][A[A[A[A[A[A

Downloading score id: 5396568








 86%|████████▌ | 656/762 [04:07<03:59,  2.26s/it][A[A[A[A[A[A

Downloading score id: 5219243








 86%|████████▌ | 657/762 [04:09<03:36,  2.06s/it][A[A[A[A[A[A

Downloading score id: 5290496








 86%|████████▋ | 658/762 [04:10<03:20,  1.93s/it][A[A[A[A[A[A

Downloading score id: 5102623








 86%|████████▋ | 659/762 [04:14<04:26,  2.59s/it][A[A[A[A[A[A

Downloading score id: 4163671








 87%|████████▋ | 660/762 [04:16<04:09,  2.44s/it][A[A[A[A[A[A

Downloading score id: 5380829








 87%|████████▋ | 661/762 [04:19<04:14,  2.52s/it][A[A[A[A[A[A

Downloading score id: 3589816








 87%|████████▋ | 662/762 [04:22<04:30,  2.71s/it][A[A[A[A[A[A

Downloading score id: 5350732








 87%|████████▋ | 663/762 [04:24<04:12,  2.55s/it][A[A[A[A[A[A

Downloading score id: 5467317








 87%|████████▋ | 664/762 [04:27<04:02,  2.48s/it][A[A[A[A[A[A

Downloading score id: 3435361








 87%|████████▋ | 665/762 [04:31<04:41,  2.90s/it][A[A[A[A[A[A

Downloading score id: 5142677








 87%|████████▋ | 666/762 [04:35<05:20,  3.34s/it][A[A[A[A[A[A

Downloading score id: 5483784








 88%|████████▊ | 667/762 [04:38<05:02,  3.19s/it][A[A[A[A[A[A

Downloading score id: 3404566








 88%|████████▊ | 668/762 [04:40<04:43,  3.02s/it][A[A[A[A[A[A

Downloading score id: 4610611








 88%|████████▊ | 669/762 [04:42<04:01,  2.60s/it][A[A[A[A[A[A

Downloading score id: 5209966








 88%|████████▊ | 670/762 [04:44<03:32,  2.31s/it][A[A[A[A[A[A

Downloading score id: 5383138








 88%|████████▊ | 671/762 [04:47<04:10,  2.76s/it][A[A[A[A[A[A

Downloading score id: 5353461








 88%|████████▊ | 672/762 [04:52<04:49,  3.22s/it][A[A[A[A[A[A

Downloading score id: 5350743








 88%|████████▊ | 673/762 [04:53<04:05,  2.75s/it][A[A[A[A[A[A

Downloading score id: 5294240








 88%|████████▊ | 674/762 [04:55<03:33,  2.43s/it][A[A[A[A[A[A

Downloading score id: 5232429








 89%|████████▊ | 675/762 [04:58<03:36,  2.49s/it][A[A[A[A[A[A

Downloading score id: 5377933








 89%|████████▊ | 676/762 [05:00<03:24,  2.38s/it][A[A[A[A[A[A

Downloading score id: 5032993








 89%|████████▉ | 677/762 [05:02<03:05,  2.18s/it][A[A[A[A[A[A

Downloading score id: 5391964








 89%|████████▉ | 678/762 [05:04<03:15,  2.33s/it][A[A[A[A[A[A

Downloading score id: 4992697








 89%|████████▉ | 679/762 [05:07<03:20,  2.42s/it][A[A[A[A[A[A

Downloading score id: 4867308








 89%|████████▉ | 680/762 [05:11<03:48,  2.79s/it][A[A[A[A[A[A

Downloading score id: 5465859








 89%|████████▉ | 681/762 [05:13<03:42,  2.74s/it][A[A[A[A[A[A

Downloading score id: 5446680








 90%|████████▉ | 682/762 [05:15<03:14,  2.43s/it][A[A[A[A[A[A

Downloading score id: 4284511








 90%|████████▉ | 683/762 [05:18<03:17,  2.50s/it][A[A[A[A[A[A

Downloading score id: 5502289








 90%|████████▉ | 684/762 [05:21<03:29,  2.69s/it][A[A[A[A[A[A

Downloading score id: 5456413








 90%|████████▉ | 685/762 [05:22<03:02,  2.37s/it][A[A[A[A[A[A

Downloading score id: 5090283








 90%|█████████ | 686/762 [05:25<03:06,  2.45s/it][A[A[A[A[A[A

Downloading score id: 5502013








 90%|█████████ | 687/762 [05:28<03:08,  2.51s/it][A[A[A[A[A[A

Downloading score id: 5510946








 90%|█████████ | 688/762 [05:30<03:10,  2.57s/it][A[A[A[A[A[A

Downloading score id: 5452558








 91%|█████████ | 690/762 [05:34<02:54,  2.42s/it][A[A[A[A[A[A

Downloading score id: 5169993








 91%|█████████ | 691/762 [05:37<02:45,  2.34s/it][A[A[A[A[A[A

Downloading score id: 5352426








 91%|█████████ | 692/762 [05:39<02:51,  2.45s/it][A[A[A[A[A[A

Downloading score id: 5350745








 91%|█████████ | 693/762 [05:42<02:54,  2.53s/it][A[A[A[A[A[A

Downloading score id: 5471266








 91%|█████████ | 694/762 [05:45<02:54,  2.56s/it][A[A[A[A[A[A

Downloading score id: 1368811








 91%|█████████ | 695/762 [05:46<02:33,  2.29s/it][A[A[A[A[A[A

Downloading score id: 3205021








 91%|█████████▏| 696/762 [05:49<02:47,  2.54s/it][A[A[A[A[A[A

Downloading score id: 5484954








 91%|█████████▏| 697/762 [05:52<02:37,  2.42s/it][A[A[A[A[A[A

Downloading score id: 5378683








 92%|█████████▏| 698/762 [05:55<02:47,  2.61s/it][A[A[A[A[A[A

Downloading score id: 5489856








 92%|█████████▏| 699/762 [05:57<02:47,  2.65s/it][A[A[A[A[A[A

Downloading score id: 5350728








 92%|█████████▏| 700/762 [06:01<02:58,  2.87s/it][A[A[A[A[A[A

Downloading score id: 5311316








 92%|█████████▏| 701/762 [06:03<02:51,  2.81s/it][A[A[A[A[A[A

Downloading score id: 5349423








 92%|█████████▏| 702/762 [06:06<02:46,  2.77s/it][A[A[A[A[A[A

Downloading score id: 5402473








 92%|█████████▏| 703/762 [06:09<02:50,  2.89s/it][A[A[A[A[A[A

Downloading score id: 5142671








 92%|█████████▏| 704/762 [06:12<02:44,  2.83s/it][A[A[A[A[A[A

Downloading score id: 5102488








 93%|█████████▎| 705/762 [06:15<02:46,  2.93s/it][A[A[A[A[A[A

Downloading score id: 5364881








 93%|█████████▎| 706/762 [06:19<02:56,  3.15s/it][A[A[A[A[A[A

Downloading score id: 5372813








 93%|█████████▎| 707/762 [06:22<02:53,  3.15s/it][A[A[A[A[A[A

Downloading score id: 5365395








 93%|█████████▎| 708/762 [06:25<02:44,  3.05s/it][A[A[A[A[A[A

Downloading score id: 5141366








 93%|█████████▎| 709/762 [06:27<02:28,  2.79s/it][A[A[A[A[A[A

Downloading score id: 5335480








 93%|█████████▎| 710/762 [06:29<02:08,  2.46s/it][A[A[A[A[A[A

Downloading score id: 5479087








 93%|█████████▎| 711/762 [06:31<02:01,  2.39s/it][A[A[A[A[A[A

Downloading score id: 5372870








 93%|█████████▎| 712/762 [06:34<02:11,  2.62s/it][A[A[A[A[A[A

Downloading score id: 5045490








 94%|█████████▎| 713/762 [06:36<02:02,  2.49s/it][A[A[A[A[A[A

Downloading score id: 5396565








 94%|█████████▎| 714/762 [06:38<01:47,  2.25s/it][A[A[A[A[A[A

Downloading score id: 5313405








 94%|█████████▍| 715/762 [06:40<01:44,  2.21s/it][A[A[A[A[A[A

Downloading score id: 5486905








 94%|█████████▍| 716/762 [06:44<02:01,  2.64s/it][A[A[A[A[A[A

Downloading score id: 5410425








 94%|█████████▍| 717/762 [06:46<01:59,  2.65s/it][A[A[A[A[A[A

Downloading score id: 5084226








 94%|█████████▍| 718/762 [06:48<01:43,  2.34s/it][A[A[A[A[A[A

Downloading score id: 5398553








 94%|█████████▍| 719/762 [06:51<01:51,  2.60s/it][A[A[A[A[A[A

Downloading score id: 5485234








 94%|█████████▍| 720/762 [06:53<01:44,  2.48s/it][A[A[A[A[A[A

Downloading score id: 5501668








 95%|█████████▍| 721/762 [06:57<01:51,  2.72s/it][A[A[A[A[A[A

Downloading score id: 5299238








 95%|█████████▍| 722/762 [06:58<01:37,  2.45s/it][A[A[A[A[A[A

Downloading score id: 5492915








 95%|█████████▍| 723/762 [07:00<01:26,  2.22s/it][A[A[A[A[A[A

Downloading score id: 5519552








 95%|█████████▌| 724/762 [07:04<01:46,  2.80s/it][A[A[A[A[A[A

Downloading score id: 5446399








 95%|█████████▌| 725/762 [07:07<01:42,  2.77s/it][A[A[A[A[A[A

Downloading score id: 5500282








 95%|█████████▌| 726/762 [07:10<01:40,  2.80s/it][A[A[A[A[A[A

Downloading score id: 5142661








 95%|█████████▌| 727/762 [07:11<01:25,  2.45s/it][A[A[A[A[A[A

Downloading score id: 5392388








 96%|█████████▌| 728/762 [07:14<01:25,  2.52s/it][A[A[A[A[A[A

Downloading score id: 5046084








 96%|█████████▌| 729/762 [07:17<01:25,  2.60s/it][A[A[A[A[A[A

Downloading score id: 5501907








 96%|█████████▌| 730/762 [07:19<01:22,  2.57s/it][A[A[A[A[A[A

Downloading score id: 5215968








 96%|█████████▌| 731/762 [07:22<01:17,  2.50s/it][A[A[A[A[A[A

Downloading score id: 5512292








 96%|█████████▌| 732/762 [07:24<01:16,  2.54s/it][A[A[A[A[A[A

Downloading score id: 5495183








 96%|█████████▌| 733/762 [07:26<01:06,  2.28s/it][A[A[A[A[A[A

Downloading score id: 5482006








 96%|█████████▋| 734/762 [07:29<01:10,  2.54s/it][A[A[A[A[A[A

Downloading score id: 5500103








 96%|█████████▋| 735/762 [07:31<01:00,  2.26s/it][A[A[A[A[A[A

Downloading score id: 5476269








 97%|█████████▋| 736/762 [07:34<01:05,  2.54s/it][A[A[A[A[A[A

Downloading score id: 5508596








 97%|█████████▋| 737/762 [07:37<01:04,  2.57s/it][A[A[A[A[A[A

Downloading score id: 5509826








 97%|█████████▋| 738/762 [07:39<01:02,  2.59s/it][A[A[A[A[A[A

Downloading score id: 5474344








 97%|█████████▋| 739/762 [07:42<01:03,  2.75s/it][A[A[A[A[A[A

Downloading score id: 5450692








 97%|█████████▋| 740/762 [07:44<00:53,  2.44s/it][A[A[A[A[A[A

Downloading score id: 5252689








 97%|█████████▋| 741/762 [07:47<00:52,  2.50s/it][A[A[A[A[A[A

Downloading score id: 5489197








 97%|█████████▋| 742/762 [07:49<00:51,  2.55s/it][A[A[A[A[A[A

Downloading score id: 5453541








 98%|█████████▊| 743/762 [07:51<00:43,  2.29s/it][A[A[A[A[A[A

Downloading score id: 5520888








 98%|█████████▊| 744/762 [07:54<00:43,  2.40s/it][A[A[A[A[A[A

Downloading score id: 5416048








 98%|█████████▊| 745/762 [07:56<00:42,  2.48s/it][A[A[A[A[A[A

Downloading score id: 5476284








 98%|█████████▊| 746/762 [07:58<00:35,  2.22s/it][A[A[A[A[A[A

Downloading score id: 5508035








 98%|█████████▊| 747/762 [08:01<00:35,  2.35s/it][A[A[A[A[A[A

Downloading score id: 5522112








 98%|█████████▊| 748/762 [08:03<00:33,  2.39s/it][A[A[A[A[A[A

Downloading score id: 5522878








 98%|█████████▊| 749/762 [08:06<00:32,  2.47s/it][A[A[A[A[A[A

Downloading score id: 5351084








 98%|█████████▊| 750/762 [08:09<00:30,  2.54s/it][A[A[A[A[A[A

Downloading score id: 5323733








 99%|█████████▊| 751/762 [08:12<00:29,  2.72s/it][A[A[A[A[A[A

Downloading score id: 5518661








 99%|█████████▊| 752/762 [08:14<00:27,  2.70s/it][A[A[A[A[A[A

Downloading score id: 5424772








 99%|█████████▉| 753/762 [08:17<00:24,  2.71s/it][A[A[A[A[A[A

Downloading score id: 5523808








 99%|█████████▉| 754/762 [08:19<00:19,  2.40s/it][A[A[A[A[A[A

Downloading score id: 5495174








 99%|█████████▉| 755/762 [08:20<00:15,  2.17s/it][A[A[A[A[A[A

Downloading score id: 5515546








 99%|█████████▉| 756/762 [08:23<00:14,  2.35s/it][A[A[A[A[A[A

Downloading score id: 5505919








 99%|█████████▉| 757/762 [08:25<00:10,  2.13s/it][A[A[A[A[A[A

Downloading score id: 5421063








 99%|█████████▉| 758/762 [08:29<00:11,  2.81s/it][A[A[A[A[A[A

Downloading score id: 5522116








100%|█████████▉| 759/762 [08:31<00:07,  2.60s/it][A[A[A[A[A[A

Downloading score id: 5522121








100%|█████████▉| 760/762 [08:34<00:05,  2.60s/it][A[A[A[A[A[A

Downloading score id: 5518526








100%|█████████▉| 761/762 [08:37<00:02,  2.78s/it][A[A[A[A[A[A

Downloading score id: 5524548








100%|██████████| 762/762 [08:39<00:00,  2.43s/it][A[A[A[A[A[A





[A[A[A[A[A[A

In [21]:
len(all_proxies)

300

In [35]:

# del all_proxies[proxy_index]
# proxy = None

In [36]:
for idx,link in enumerate(tqdm(links, total=len(links))):
    score_id = link['score_id']
    out_file = f"data/{score_id}.mxl"
    if Path(out_file).exists() or score_id in invalid_ids: continue
    # Every 10 requests, generate a new proxy
    if idx % 10 == 0 or proxy is None or instance is None:
        proxy_index = random.randint(0, len(all_proxies) - 1)
        proxy = all_proxies[proxy_index]
        proxy_url = proxy['ip'] + ':' + str(proxy['port']); proxy_url
        instance_index = random.randint(0, len(instances) - 1)
        instance = instances[instance_index]
    try:
        print('Downloading score id:', score_id)
        
        start = time.time()
        instance.download(score_id, out_file, format='mxl', proxy=proxy_url)
        end = time.time()
        if (end - start) > 15: raise Exception('Took too long to download')
    except Exception as e:
        print('Could not download id:', score_id)
        print('Error:', e)
        invalid_ids.append(score_id)
        deleted_proxies.append(all_proxies[proxy_index])
        del all_proxies[proxy_index]
        print('Proxy ' + proxy['ip'] + ':' + str(proxy['port']) + ' deleted.')
        print(instance.username)
        proxy = None
        instance = None
    






  0%|          | 0/762 [00:00<?, ?it/s][A[A[A[A[A

Downloading score id: 2109666


KeyboardInterrupt: 

### Second run

In [23]:
# Main function
  # Retrieve latest proxies
proxies_req = Request('https://www.sslproxies.org/')
proxies_req.add_header('User-Agent', ua.random)
proxies_doc = urlopen(proxies_req).read().decode('utf8')

soup = BeautifulSoup(proxies_doc, 'html.parser')
proxies_table = soup.find(id='proxylisttable')

# Save proxies in the array
for row in proxies_table.tbody.find_all('tr'):
    proxies.append({
      'ip':   row.find_all('td')[0].string,
      'port': row.find_all('td')[1].string
    })

HTTPError: HTTP Error 403: Forbidden

In [None]:
import asyncio
from proxybroker import Broker

more_proxies = []

async def show(proxy_queue):
    while True:
        proxy = await proxy_queue.get()
        if proxy is None: break
        print('Found proxy: %s' % proxy)
        more_proxies.append({ 'ip': proxy.host, 'port': proxy.port })

proxy_queue = asyncio.Queue()
broker = Broker(proxy_queue)
tasks = await asyncio.gather(
    broker.find(types=['HTTP', 'HTTPS'], limit=400),
    show(proxy_queue))

In [None]:
all_proxies = proxies + more_proxies
# Retrieve a random index proxy (we need the index to delete it if not working)
def random_proxy():
    return random.randint(0, len(all_proxies) - 1)

proxy = None
# # Choose a random proxy
# proxy_index = random_proxy()
# proxy = all_proxies[proxy_index]
# proxy_url = proxy['ip'] + ':' + str(proxy['port']); proxy_url

In [None]:
json_file = Path('musescore_movie.json')

In [None]:
if json_file.exists():
    with open(json_file, 'r') as fp:
        links = json.load(fp)
        

In [None]:
for idx,link in enumerate(tqdm(links, total=len(links))):
    score_id = link['score_id']
    out_file = f"data/{score_id}.mxl"
    if Path(out_file).exists() or score_id in invalid_ids: continue
    # Every 10 requests, generate a new proxy
    if idx % 10 == 0 or proxy is None or instance is None:
        proxy_index = random.randint(0, len(all_proxies) - 1)
        proxy = all_proxies[proxy_index]
        proxy_url = proxy['ip'] + ':' + str(proxy['port']); proxy_url
        instance_index = random.randint(0, len(instances) - 1)
        instance = instances[instance_index]
    try:
        print('Downloading score id:', score_id)
        instance.download(score_id, out_file, format='mxl', proxy=proxy_url)
    except Exception as e:
        print('Could not download id:', score_id)
        print('Error:', e)
        invalid_ids.append(score_id)
        deleted_proxies.append(all_proxies[proxy_index])
        del all_proxies[proxy_index]
        print('Proxy ' + proxy['ip'] + ':' + str(proxy['port']) + ' deleted.')
        print(instance.username)
        proxy = None
        instance = None
    sleep(randint(1,2))
    