In [2]:
import json
import os
import matplotlib.pyplot as plt
import numpy as np
import requests
from bs4 import BeautifulSoup
import muspy
import pandas as pd
import torch
from torch.utils.data import DataLoader
from pathlib import PosixPath

In [3]:
def find_local_mscz(raw_path, dpath='/data2/zachary/musescore/data'):
    clean_path = raw_path.split("/")[-1] + '.mscz'
    for dirpath, dirnames, filenames in os.walk(dpath):
        if not dirnames:
            if clean_path in filenames:
                return os.path.join(dirpath, clean_path)

In [4]:
fpath_df = pd.read_csv('/data2/zachary/musescore/mscz-files.csv')
fpath_df.set_index('id', inplace=True)

In [5]:
all_files = []
for dirpath, dirnames, filenames in os.walk('/data2/zachary/musescore/data'):
    if not dirnames:
        all_files += list(map(lambda x: os.path.join(dirpath, x), filenames))

In [6]:
all_files = {x.split("/")[-1]: x for x in all_files}

In [7]:
fpath_df['ref2'] = fpath_df.ref.apply(lambda x: x.split("/")[-1] + '.mscz')
fpath_df['loc_exist'] = fpath_df.ref2.apply(lambda x: x in all_files)
fpath_df = fpath_df[fpath_df['loc_exist'] == True]

In [8]:
fpath_df['ref3'] = fpath_df.ref2.apply(lambda x: all_files[x])

In [9]:
dirpath = '/data2/zachary/musescore/metadata'

In [10]:
compls = []
for fpath in os.listdir(dirpath):
    if fpath.isnumeric():
        subdir = os.path.join(dirpath, fpath)
        for subpath in os.listdir(subdir):
            with open(subdir + '/' + subpath, 'r') as f:
                try:
                    d = json.load(f)
                    compls.append({'path': subdir + '/' + subpath, 'd': d})
                except:
                    print(subdir + '/' + subpath)

In [19]:
def dur2Num(dur):
    ms = dur.split(":")
    if len(ms) == 1:
        return int(ms[0])
    return int(ms[0]) * 60 + int(ms[1])

In [24]:
pianos = list(filter(lambda x: 'piano' in [y.lower() for y in x['d']['data']['score']['parts_names']] and len(x['d']['data']['score']['parts_names']) == 1 and len(x['d']) and dur2Num(x['d']['data']['score']['duration']) > 20, compls))
pianos = list(filter(lambda x: x['d']['data']['score']['id'] in fpath_df.index, pianos))
nans = list(filter(lambda x: x['d']['data']['score']['complexity'] == 0, pianos))
easy = list(filter(lambda x: x['d']['data']['score']['complexity'] == 1, pianos))
med = list(filter(lambda x: x['d']['data']['score']['complexity'] == 2, pianos))
hard = list(filter(lambda x: x['d']['data']['score']['complexity'] == 3, pianos))
np.random.seed(42)
easy_eq = np.random.choice(easy, 4400)
med_eq = np.random.choice(med, 4400)
hard_eq = np.random.choice(hard, 4400)
train_ds = np.concatenate([easy_eq[:int(4400*0.8)], med_eq[:int(4400*0.8)], hard_eq[:int(4400*0.8)]])
val_ds = np.concatenate([easy_eq[int(4400*0.8):int(4400*0.8) + int(4400*0.1)], med_eq[int(4400*0.8):int(4400*0.8) + int(4400*0.1)], hard_eq[int(4400*0.8):int(4400*0.8) + int(4400*0.1)]])
test_ds = np.concatenate([easy_eq[int(4400*0.8) + int(4400*0.1):], med_eq[int(4400*0.8) + int(4400*0.1):], hard_eq[int(4400*0.8) + int(4400*0.1):]])

In [25]:
times = []
for piece in easy_eq:
    tcode = piece['d']['data']['score']['duration']
    if len(tcode.split(":")) == 2:
        times.append(int(tcode.split(":")[0]) * 60 + int(tcode.split(":")[1]))
    else:
        times.append(int(tcode.split(":")[0]))
for piece in med_eq:
    tcode = piece['d']['data']['score']['duration']
    if len(tcode.split(":")) == 2:
        times.append(int(tcode.split(":")[0]) * 60 + int(tcode.split(":")[1]))
    else:
        times.append(int(tcode.split(":")[0]))
for piece in hard_eq:
    tcode = piece['d']['data']['score']['duration']
    if len(tcode.split(":")) == 2:
        times.append(int(tcode.split(":")[0]) * 60 + int(tcode.split(":")[1]))
    else:
        times.append(int(tcode.split(":")[0]))

In [26]:
sum(times) / 3600

770.6694444444445

In [27]:
times = []
for piece in train_ds:
    tcode = piece['d']['data']['score']['duration']
    if len(tcode.split(":")) == 2:
        times.append(int(tcode.split(":")[0]) * 60 + int(tcode.split(":")[1]))
    else:
        times.append(int(tcode.split(":")[0]))
print("train time:", sum(times) / 3600)

times = []
for piece in val_ds:
    tcode = piece['d']['data']['score']['duration']
    if len(tcode.split(":")) == 2:
        times.append(int(tcode.split(":")[0]) * 60 + int(tcode.split(":")[1]))
    else:
        times.append(int(tcode.split(":")[0]))
print("val time:", sum(times) / 3600)

times = []
for piece in test_ds:
    tcode = piece['d']['data']['score']['duration']
    if len(tcode.split(":")) == 2:
        times.append(int(tcode.split(":")[0]) * 60 + int(tcode.split(":")[1]))
    else:
        times.append(int(tcode.split(":")[0]))
print("test time:", sum(times) / 3600)

train time: 612.1686111111111
val time: 79.02111111111111
test time: 79.47972222222222


In [28]:
for i, d in enumerate(train_ds):
    raw_path = fpath_df.loc[d['d']['data']['score']['id']]['ref3']
    if type(raw_path) == pd.Series:
        raw_path = raw_path.iloc[0]
    d['mscz_path'] = raw_path
    train_ds[i] = d
for i, d in enumerate(val_ds):
    raw_path = fpath_df.loc[d['d']['data']['score']['id']]['ref3']
    if type(raw_path) == pd.Series:
        raw_path = raw_path.iloc[0]
    d['mscz_path'] = raw_path
    val_ds[i] = d
for i, d in enumerate(test_ds):
    raw_path = fpath_df.loc[d['d']['data']['score']['id']]['ref3']
    if type(raw_path) == pd.Series:
        raw_path = raw_path.iloc[0]
    d['mscz_path'] = raw_path
    test_ds[i] = d

In [29]:
id_train_ds = {d['d']['data']['score']['id']: d for d in train_ds}
with open('../data/train_metadata.json', 'w') as f:
    json.dump(id_train_ds, f, indent=2)
id_val_ds = {d['d']['data']['score']['id']: d for d in val_ds}
with open('../data/val_metadata.json', 'w') as f:
    json.dump(id_val_ds, f, indent=2)
id_test_ds = {d['d']['data']['score']['id']: d for d in test_ds}
with open('../data/test_metadata.json', 'w') as f:
    json.dump(id_test_ds, f, indent=2)

In [2]:
with open('../data/train_metadata.json', 'r') as f:
    id_train_ds = json.load(f)
with open('../data/val_metadata.json', 'r') as f:
    id_val_ds = json.load(f)
with open('../data/test_metadata.json', 'r') as f:
    id_test_ds = json.load(f)

In [105]:
os.system("mkdir /data2/zachary/musescore/coarse_data/train/")
os.system("mkdir /data2/zachary/musescore/coarse_data/val/")
os.system("mkdir /data2/zachary/musescore/coarse_data/test/")
for idx, d in id_train_ds.items():
    os.system(f"cp {d['mscz_path']} /data2/zachary/musescore/coarse_data/train/")
for idx, d in id_val_ds.items():
    os.system(f"cp {d['mscz_path']} /data2/zachary/musescore/coarse_data/val/")
for idx, d in id_test_ds.items():
    os.system(f"cp {d['mscz_path']} /data2/zachary/musescore/coarse_data/test/")

In [123]:
for f in os.listdir('/data2/zachary/musescore/coarse_data/train/'):
    try:
        x = muspy.read_musescore(os.path.join('/data2/zachary/musescore/coarse_data/train/', f))
        if len(x.tracks[0]) < 10:
            os.system(f"rm /data2/zachary/musescore/coarse_data/train/{f}")
    except:
        os.system(f"rm /data2/zachary/musescore/coarse_data/train/{f}")
for f in os.listdir('/data2/zachary/musescore/coarse_data/val/'):
    try:
        x = muspy.read_musescore(os.path.join('/data2/zachary/musescore/coarse_data/val/', f))
        if len(x.tracks[0]) < 10:
            os.system(f"rm /data2/zachary/musescore/coarse_data/val/{f}")
    except:
        os.system(f"rm /data2/zachary/musescore/coarse_data/val/{f}")
for f in os.listdir('/data2/zachary/musescore/coarse_data/test/'):
    try:
        x = muspy.read_musescore(os.path.join('/data2/zachary/musescore/coarse_data/test/', f))
        if len(x.tracks[0]) < 10:
            os.system(f"rm /data2/zachary/musescore/coarse_data/test/{f}")
    except:
        os.system(f"rm /data2/zachary/musescore/coarse_data/test/{f}")



In [2]:
class LocalMuseScoreDataset(muspy.FolderDataset):

    _extension = 'mscz'
    _info = muspy.DatasetInfo("MuseScore Dataset")
    

    def read(self, filename) -> muspy.Music:
        """Read a file into a Music object."""
        obj = muspy.read_musescore(filename)
        return obj
    
    def to_pytorch_dataset(
        self,
        factory=None,
        representation: str = None,
        split_filename=None,
        splits=None,
        random_state=None,
        labpath=None,
        **kwargs):
        """Return the dataset as a PyTorch dataset.

        Parameters
        ----------
        factory : Callable, optional
            Function to be applied to the Music objects. The input is a
            Music object, and the output is an array or a tensor.
        representation : str, optional
            Target representation. See :func:`muspy.to_representation()`
            for available representation.
        split_filename : str or Path, optional
            If given and exists, path to the file to read the split
            from. If None or not exists, path to save the split.
        splits : float or list of float, optional
            Ratios for train-test-validation splits. If None, return the
            full dataset as a whole. If float, return train and test
            splits. If list of two floats, return train and test splits.
            If list of three floats, return train, test and validation
            splits.
        random_state : int, array_like or RandomState, optional
            Random state used to create the splits. If int or
            array_like, the value is passed to
            :class:`numpy.random.RandomState`, and the created
            RandomState object is used to create the splits. If
            RandomState, it will be used to create the splits.

        Returns
        -------
        :class:torch.utils.data.Dataset` or Dict of \
                :class:torch.utils.data.Dataset`
            Converted PyTorch dataset(s).

        """
        if representation is None and factory is None:
            raise TypeError(
                "One of `representation` and `factory` must be given."
            )
        if representation is not None and factory is not None:
            raise TypeError(
                "Only one of `representation` and `factory` can be given."
            )

        try:
            # pylint: disable=import-outside-toplevel
            from torch.utils.data import Dataset as TorchDataset
        except ImportError as err:
            raise ImportError("Optional package pytorch is required.") from err

        class TorchMusicFactoryDataset(TorchDataset):
            """A PyTorch dataset built from a Music dataset.

            Parameters
            ----------
            dataset : :class:`muspy.Dataset`
                Dataset object to base on.
            factory : Callable
                Function to be applied to the Music objects. The input is a
                Music object, and the output is an array or a tensor.

            """

            def __init__(
                self,
                dataset,
                factory,
                labpath,
                subset: str = "Full",
                indices=None,
            ):
                self.dataset = dataset
                self.factory = factory
                self.subset = subset
                self.indices = indices
                self.dataset.on_the_fly()
        
                with open(labpath, 'r') as f:
                    set_files = set(self.dataset._filenames)
                    self.metadata = json.load(f)
                    self.metadata = {k: v for k,v in list(filter(lambda x: PosixPath(os.path.join(self.dataset.root, x[1]['mscz_path'].split("/")[-1])) in set_files, self.metadata.items()))}
                    self.metadata = {PosixPath(os.path.join(self.dataset.root, d['mscz_path'].split("/")[-1])): d for d in self.metadata.values()}
                    self.labels = {k: v['d']['data']['score']['complexity'] for k,v in self.metadata.items()}
                    self.labels = [x[1] for x in sorted(self.labels.items(), key=lambda y: self.dataset._filenames.index(y[0]))]
                self.dataset.use_converted()
                    
                if self.indices is not None:
                    self.indices = sorted(
                        idx for idx in self.indices if idx < len(self.dataset)
                    )

            def __repr__(self) -> str:
                return (
                    f"TorchMusicFactoryDataset(dataset={self.dataset}, "
                    f"factory={self.subset}, subset={self.factory})"
                )

            def __getitem__(self, index):
                if self.indices is None:
                    return self.factory(self.dataset[index]), self.labels[index]
                return self.factory(self.dataset[self.indices[index]]), self.labels[self.indices[index]]

            def __len__(self) -> int:
                if self.indices is None:
                    return len(self.dataset)
                return len(self.indices)

        class TorchRepresentationDataset(TorchMusicFactoryDataset):
            """A PyTorch music dataset.

            Parameters
            ----------
            dataset : :class:`muspy.Dataset`
                Dataset object to base on.
            representation : str
                Target representation. See
                :func:`muspy.to_representation()` for available
                representation.

            """

            def __init__(
                self,
                dataset,
                labpath,
                representation: str,
                subset="Full",
                indices=None,
                **kwargs,
            ):
                self.representation = representation

                def factory(music):
                    return music.to_representation(representation, **kwargs)

                super().__init__(
                    dataset, labpath=labpath, factory=factory, subset=subset, indices=indices
                )

            def __repr__(self) -> str:
                return (
                    f"TorchRepresentationDataset(dataset={self.dataset}, "
                    f"representation={self.representation}, "
                    f"subset={self.subset})"
                )

        # No split
        if splits is None:
            if representation is not None:
                return TorchRepresentationDataset(
                    self,labpath, representation, **kwargs
                )
            return TorchMusicFactoryDataset(self, factory, labpath=labpath)  # type: ignore

        datasets= {}
        indices_list = self.split(split_filename, splits, random_state)
        for key, value in indices_list.items():
            if representation is not None:
                datasets[key] = TorchRepresentationDataset(
                    self, representation, key, value, **kwargs
                )
            else:

                datasets[key] = TorchMusicFactoryDataset(
                    self, factory, key, value  # type: ignore
                )

        return datasets

In [3]:
train = LocalMuseScoreDataset('/data2/zachary/musescore/coarse_data/train/')
trainDataset = train.to_pytorch_dataset(representation='event', labpath='../data/train_metadata.json')
val = LocalMuseScoreDataset('/data2/zachary/musescore/coarse_data/val/')
valDataset = val.to_pytorch_dataset(representation='event', labpath='../data/val_metadata.json')
test = LocalMuseScoreDataset('/data2/zachary/musescore/coarse_data/test/')
testDataset = test.to_pytorch_dataset(representation='event', labpath='../data/test_metadata.json')

In [4]:
train_dataloader = DataLoader(trainDataset)
val_dataloader = DataLoader(valDataset)
test_dataloader = DataLoader(testDataset)