 <center> Data Downloads and Embedding </center>
 
 This notebook contains information on the source of all natural language and music data used in the paper, along with code to embed the raw data as a real-valued multivariate time series.

In [None]:
import numpy as np
import os
import re
import torch

### Natural language data

#### 0. GloVe embeddings

We embed the natural language data using the 200d pre-trained GloVe embeddings, which can be downloaded at http://nlp.stanford.edu/data/glove.6B.zip

Once downloaded, use the following code to load the embeddings:

In [None]:
def loadGloveModel(gloveFile):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-blank line holds a token followed by its vector components.

    Args:
        gloveFile: Path to the GloVe text file (e.g. ``glove.6B.200d.txt``).

    Returns:
        dict mapping each word (str) to a 1-D float64 ``numpy`` array.
    """
    print("Loading Glove Model")
    model = {}
    # `with` guarantees the handle is closed; the explicit encoding keeps
    # non-ASCII tokens from depending on the platform's default locale.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # Skip blank lines instead of failing on splitLine[0].
                continue
            model[splitLine[0]] = np.array(
                [float(val) for val in splitLine[1:]], dtype=float
            )
    print("Done. {} words loaded!".format(len(model)))
    return model

# Placeholder: must be set to a real path before this cell is run.
glove200 = None # replace with path to glove.6B.200d.txt file
# Word -> vector lookup used by the text-embedding cells below.
gl_embed = loadGloveModel(glove200)

#### 1. Penn TreeBank

In [None]:
# tools to parse Penn TreeBank data, adapted from the PyTorch language modeling tutorial
class Dictionary:
    """Two-way vocabulary: word -> integer index and index -> word."""

    def __init__(self):
        # word_to_ix maps a word to its position in ix_to_word.
        self.word_to_ix = {}
        self.ix_to_word = []

    def __len__(self):
        """Return the number of distinct words registered so far."""
        return len(self.ix_to_word)

    def add_word(self, word):
        """Register ``word`` if it is new and return its index."""
        index = self.word_to_ix.get(word)
        if index is None:
            # New word: it takes the next free slot at the end of the list.
            index = len(self.ix_to_word)
            self.ix_to_word.append(word)
            self.word_to_ix[word] = index
        return index
    
class Corpus:
    """Penn TreeBank splits tokenized against one shared vocabulary.

    After construction, ``dictionary`` holds the vocabulary and ``train``,
    ``valid`` and ``test`` each hold a 1-D ``torch.LongTensor`` of word ids.
    """

    def __init__(self,path):
        """Tokenize ``ptb.train.txt``, ``ptb.valid.txt`` and ``ptb.test.txt``
        found in directory ``path`` (in that order, so ids are assigned in
        order of first appearance across the three files)."""
        self.dictionary = Dictionary()
        self.train = self.tokenize(os.path.join(path,'ptb.train.txt'))
        self.valid = self.tokenize(os.path.join(path,'ptb.valid.txt'))
        self.test = self.tokenize(os.path.join(path,'ptb.test.txt'))

    def tokenize(self,path):
        """Return the word ids of the file at ``path`` as a LongTensor.

        Every line is split on whitespace and terminated with an ``'<eos>'``
        token; unseen words are added to ``self.dictionary`` on the fly.
        """
        add_word = self.dictionary.add_word
        ids = []
        # Single pass: add_word returns the index of the (possibly new) word,
        # so the file does not need to be read a second time and the tensor
        # does not need to be filled element by element.
        with open(path) as f:
            for line in f:
                ids.extend(add_word(word) for word in line.split())
                ids.append(add_word('<eos>'))
        return torch.LongTensor(ids)

In [None]:
# Placeholder: must be set to the directory holding ptb.train.txt,
# ptb.valid.txt and ptb.test.txt before this cell is run.
ptb_path = None # replace with path to Penn TreeBank train / validation / test .txt files
# Builds the shared vocabulary and the three id tensors (train / valid / test).
corpus = Corpus(ptb_path)

In [None]:
# code for embedding
def _embed_ptb_split(ids, ix_to_word, embeddings, dim=200):
    """Embed one tokenized split with GloVe vectors.

    Args:
        ids: 1-D tensor of word indices (one entry per token).
        ix_to_word: list mapping an index back to its word.
        embeddings: dict mapping word -> vector; must contain ``'unk'``.
        dim: length of each embedding vector.

    Returns:
        (emb, unk_count): ``emb`` is a CUDA FloatTensor of shape T x dim and
        ``unk_count`` is the number of tokens that received the ``'unk'``
        vector.
    """
    unk_vec = embeddings['unk']
    emb = np.zeros((dim, len(ids)))
    unk_count = 0
    # tolist() yields plain ints, avoiding a 0-dim tensor per list lookup.
    for t, ix in enumerate(ids.tolist()):
        wd = ix_to_word[ix]
        if wd == '<eos>':
            # The sentence marker is looked up under the plain word 'eos'.
            wd = 'eos'
        vec = embeddings.get(wd, unk_vec)
        if vec is unk_vec:
            unk_count += 1
        emb[:, t] = vec
    return torch.FloatTensor(emb).t().cuda(), unk_count # shape to T x k


print('Embedding... ')
train_emb, count = _embed_ptb_split(corpus.train, corpus.dictionary.ix_to_word, gl_embed)
print('... train (shape {} x {}), unk count: {}'.format(train_emb.shape[0],train_emb.shape[1],count))

valid_emb, count = _embed_ptb_split(corpus.valid, corpus.dictionary.ix_to_word, gl_embed)
print('... valid (shape {} x {}), unk count: {}'.format(valid_emb.shape[0],valid_emb.shape[1],count))

test_emb, count = _embed_ptb_split(corpus.test, corpus.dictionary.ix_to_word, gl_embed)
print('... test (shape {} x {}), unk count: {}'.format(test_emb.shape[0],test_emb.shape[1],count))

#### 2. King James Bible

The text is available from Project Gutenberg at http://www.gutenberg.org/cache/epub/10/pg10.txt

In [None]:
# This pre-processing step performs the following:
#
# - ignore 32 lines of Gutenberg intro at start
# - strip chapter:verse from line where applicable
# - send to lower case
# - keep punctuation but separate from text with whitespace

path_to_bible = None # replace with path to bible txt file

header_lines = 32  # Gutenberg intro lines skipped at the start of the file

# Raw strings: '\d' in a normal string literal is an invalid escape sequence.
# Compiled once here instead of on every line.
verse_ref = re.compile(r'\d+:\d+ ')
# Matches one punctuation mark directly following lowercase letters.
# NOTE(review): '$' inside the character class is a literal dollar sign, and
# marks such as ';' are not in the class -- confirm this is intended.
trailing_punct = re.compile(r'([a-z]+)([:,?!.$])')

with open(path_to_bible) as f:
    text = []
    line_count = 0
    word_count = 0
    for line in f:
        # The totals cover every line of the file, including the header.
        line_count += 1
        word_count += len(line.split())
        if line_count <= header_lines:
            continue
        if not line.rstrip():
            continue  # drop blank lines
        cleaned = verse_ref.sub('', line.lower().rstrip())
        text.append(trailing_punct.sub(r'\1 \2', cleaned))
    print('total lines: {}'.format(line_count))
    print('total words: {}'.format(word_count))

In [None]:
# embed the data
# `full_text` was referenced here without ever being defined (NameError).
# It is rebuilt as the flat token sequence of the pre-processed lines in
# `text`, split on whitespace.
# NOTE(review): assumes one embedding column per whitespace-separated token
# was the intent -- confirm against the data used in the paper.
full_text = [token for line in text for token in line.split()]

unk_vec = gl_embed['unk']  # fallback vector for out-of-vocabulary tokens
bible_emb = np.zeros((200,len(full_text)))
for t, token in enumerate(full_text):
    bible_emb[:,t] = gl_embed.get(token,unk_vec)

#### 3. Facebook bAbI: Children's book test

The data can be downloaded from http://www.thespermwhale.com/jaseweston/babi/CBTest.tgz

In [None]:
cbdata =  None # path to downloaded cbt_train.txt file

# read individual books from training set, store separately
# fb_text maps a book title to {'text': [token, ...]}.
with open(cbdata) as f:
    fb_text = {}
    line_count = 0
    word_count = 0
    for line in f:
        words = line.split()
        line_count += 1
        word_count += len(words)
        if not words:
            # Blank line: counted above, but it has no first token to test.
            continue

        if words[0] == '_BOOK_TITLE_':
            # Title = text between '___' and the first '.', with underscores
            # turned into spaces.
            key = line.strip('\n').split('___')[1].split('.')[0].replace('_',' ')
            fb_text[key] = {'text':[]}
        else:
            # Body line: append its tokens to the most recently opened book.
            fb_text[key]['text'] += words

    print('total lines: {}'.format(line_count))
    print('total words: {}'.format(word_count))

In [None]:
# embed
# Each book gains an 'embedded' entry: a 200 x (number of tokens) array whose
# columns are the GloVe vectors of its tokens ('unk' vector when missing).
for book in fb_text.values():
    tokens = book['text']
    text_emb = np.zeros((200, len(tokens)))
    for col, token in enumerate(tokens):
        text_emb[:, col] = gl_embed.get(token, gl_embed['unk'])
    book['embedded'] = text_emb

### Music data

#### 1. Miles Davis' *Kind of Blue*

The data was obtained by purchasing the album and converting the files to wav.

In [None]:
# tools for MFCC embedding of jazz and vocal performance
import librosa as lbr
import scipy.io.wavfile as wav

In [None]:
md_dir = None # replace with path to directory containing .wav files

# md_out maps a track name (file name up to its first '.') to {'data': MFCCs}.
# The directory listed is md_dir; the original listed uk_dir, which is only
# defined in a later cell.
md_files = os.listdir(md_dir)
md_out = {}
for f in md_files:
    if f.endswith('.wav'):
        name = f.split('.')[0]
        md_out[name] = {}
        # os.path.join works whether or not md_dir ends with a separator.
        path = os.path.join(md_dir, f)
        y,sr = lbr.load(path,sr = 32000)
        # Keyword form: recent librosa releases reject a positional signal.
        mfc = lbr.feature.mfcc(y=y,sr=sr)
        md_out[name]['data'] = mfc

#### 2. Oum Kalthoum

The data consists of the following tracks, which were purchased and converted to wav. All are available, for example, in high-quality remastered format on iTunes.

- We Maret El Ayam Daret El Ayam
- Seret El Hob
- Alf Leila We Leila
- Amal Hayate
- El Ward Gamel
- Fakarony

In [None]:
uk_dir = None # replace with path to directory containing .wav files

# uk_out maps a track name (file name up to its first '.') to {'seq': MFCCs}.
uk_files = os.listdir(uk_dir)
uk_out = {}
for f in uk_files:
    if f.endswith('.wav'):
        name = f.split('.')[0]
        uk_out[name] = {}
        # os.path.join works whether or not uk_dir ends with a separator.
        path = os.path.join(uk_dir, f)
        y,sr = lbr.load(path,sr = 32000)
        # Keyword form: recent librosa releases reject a positional signal.
        mfc = lbr.feature.mfcc(y=y,sr=sr)
        uk_out[name]['seq'] = mfc

#### 3. MusicNet and Bach's *Cello Suite 4*

Code and data for MusicNet are available at https://homes.cs.washington.edu/~thickstn/start.html

We require both the raw data in `musicnet.npz` as well as the metadata file `musicnet_metadata.csv`.

The embeddings are obtained from a reduced version of the model implemented in `musicnet_module.ipynb`, which is available at https://github.com/jthickstun/pytorch_musicnet

In [None]:
# Record ids 2293..2298 (as strings, matching the keys of the .npz archive).
bach_cello_id = [str(x) for x in range(2293,2299)]

train_data_path = None # replace with path to musicnet.npz file
# Passing the path lets NumPy own (and eventually close) the file handle; the
# previous inline open() handle was never closed.
# NOTE(review): if the archive stores pickled objects, recent NumPy versions
# need allow_pickle=True (and possibly encoding='latin1') here -- confirm, and
# only enable it for a trusted file.
train_data = np.load(train_data_path)
# Element [0] of each record is kept as the sequence to embed
# (presumably the audio samples -- TODO confirm against the MusicNet docs).
bach_cello = {b: {'seq': train_data[b][0]} for b in bach_cello_id}

In [None]:
def embed_piece(seq): # embed using convolutional features of learned model
    """Embed a 1-D sequence window by window with the model's filters.

    Relies on module-level names that must exist before the call: ``window``
    and ``eps`` (set in the next cell), plus ``k``, ``regions``, ``stride``,
    ``conv1d`` and ``model``, which are not defined in this notebook
    (presumably they come from the MusicNet model code -- TODO confirm).
    Requires a CUDA device.

    Args:
        seq: 1-D array-like of samples.

    Returns:
        CPU tensor of shape (k, regions * N), where N is the number of
        windows; each window fills ``regions`` consecutive columns with
        log(sin-response^2 + cos-response^2 + eps).
    """
    T = len(seq)
    # Non-overlapping windows.
    # NOTE(review): the exclusive stop T-window drops a final window that
    # would end exactly at T -- confirm whether that is intended.
    starts = np.arange(0,T-window,window)
    N = len(starts)

    embedded = torch.zeros((k,regions*N))

    for i, start in enumerate(starts):
        x = seq[start:start+window]
        x = torch.FloatTensor(np.expand_dims(x,0)).cuda()
        zx = conv1d(x[:,None,:], model.wsin_var, stride=stride).pow(2) \
           + conv1d(x[:,None,:], model.wcos_var, stride=stride).pow(2)
        # torch.log keeps the computation in torch; np.log would first try to
        # convert the CUDA tensor to a NumPy array.
        embedded[:,i*regions:(i+1)*regions] = torch.log(zx[0,:,:]+eps)
    return embedded

The following block assumes that a MusicNet model has been trained and is available to be loaded. As described in the Supplement, the model we train follows exactly the code in `musicnet_module.ipynb`, with the single exception that we reduce the size of the hidden representation to `k=200`.

In [None]:
window = 2**14 # samples per embedding window (read by embed_piece)
eps = 10e-8 # i.e. 1e-7; added before the log in embed_piece
model_path = None # replace with path to trained model

# The original wrapped this in open('model_path', 'rb'), which tried to open a
# file literally named "model_path"; torch.load takes the path directly.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
# NOTE(review): recent PyTorch releases may require weights_only=False to load
# a fully pickled model object -- confirm for your version.
model = torch.load(model_path)

for b in bach_cello: # intermediate embedded CSV files
    print(b)
    seq = bach_cello[b]['seq']
    emb = embed_piece(seq).data.numpy()
    out_file = 'bach_cello_'+b+'.csv'
    np.savetxt(out_file,emb,delimiter=',')

In [None]:
# gather files
# bach maps a piece key (the file name without its first 5 and last 4
# characters) to {'data': array loaded from the CSV}; files containing any NaN
# are left out.
# NOTE(review): fi_fugue and fi_cello are not defined in this notebook --
# presumably lists of CSV file names under ../data/bach/; confirm.
bach = {}
for f in fi_fugue+fi_cello:
    hid = np.loadtxt('../data/bach/'+f,delimiter=',')
    tf = np.isnan(hid).any()
    if tf:
        continue
    bach[f[5:-4]] = {'data':hid}

In [None]:
# add metadata
path_to_metadata = None # replace with path to musicnet_metadata.csv file
mn_meta = np.genfromtxt(path_to_metadata,dtype=None,delimiter=',',skip_header=1,usecols=(0,1,2,3,8))


def _as_text(field):
    """Return ``field`` as str, decoding UTF-8 if it is bytes.

    genfromtxt(dtype=None) may return string columns as bytes or as str
    depending on the NumPy version and its ``encoding`` setting.
    """
    return field.decode('UTF-8') if isinstance(field, bytes) else str(field)


# Keep only rows whose second selected column, without surrounding quotes, is 'Bach'.
bach_meta = [x for x in mn_meta if _as_text(x[1]).strip('\"')=='Bach']
for bm in bach_meta:
    piece_id = str(bm[0])
    # The loop variable is deliberately not called `k`: embed_piece reads a
    # module-level `k`, which a loop variable of that name would overwrite.
    for name, entry in bach.items():
        if piece_id in name:
            entry['meta'] = bm

In [None]:
# arrange into single sequence, with correct order of pieces
# Index the cello pieces by movement number: the first character of metadata
# field 3 once surrounding quotes are stripped.
cello_by_movement = {}
for name, piece in bach.items():
    if 'cello' in name:
        label = piece['meta'][3]
        if isinstance(label, bytes):
            # genfromtxt may return bytes or str depending on NumPy version.
            label = label.decode('UTF-8')
        # If several pieces claim the same movement, the first one seen wins.
        cello_by_movement.setdefault(int(label.strip('\"')[0]), piece)

csuite = np.empty((200,0))
for mvment in range(1, 7):
    if mvment not in cello_by_movement:
        # The original while-loop never terminated when a movement was missing.
        raise ValueError('no cello piece found for movement {}'.format(mvment))
    v = cello_by_movement[mvment]
    print('Adding {}. Shape = {} x {}'.format(v['meta'][3],v['data'].shape[0],v['data'].shape[1]))
    # Append this movement's columns after the previous movements.
    csuite = np.hstack((csuite,v['data']))