Alan Sun

May 25, 2021

# Preprocessing
- We tokenize all of the inputs (mood matrices) and append `<sos>`, `<eos>` tokens to the sequences.
- We convert all such data into a dataframe and store the dataframe. 

## Extract all the unique moods

In [29]:
import json
import torch
import numpy as np
import pandas as pd

In [6]:
# Read the JSON file 'playlist-tracks-features'. The context manager closes
# the file handle as soon as parsing finishes instead of leaving it open
# until the garbage collector gets to it.
with open('playlist-tracks-features') as feature_file:
    features = json.load(feature_file)

In [7]:
# Build `moods` from the keys of `features`. Each key is sliced positionally:
# the first 5 characters become the dictionary key, and everything from
# index 8 onward is parsed as a mood description in which states are
# separated by ' to ' and the moods inside one state by ', '.
moods = {}
for playlist_name in features:
    # Any key that mentions iso01 or iso02 is left out of the mood table.
    if any(tag in playlist_name for tag in ('iso01', 'iso02')):
        continue
    description = playlist_name[8:]
    parsed_states = []
    for raw_state in description.split(' to '):
        # Normalise the whole state first, then each mood inside it.
        cleaned_state = raw_state.lower().strip()
        parsed_states.append(
            [mood.lower().strip() for mood in cleaned_state.split(', ')]
        )
    moods[playlist_name[:5]] = parsed_states

## Tokenize mood states and location labels

In [10]:
class Tokenizer:
    """
    Two-way mapping between mood strings and integer tokens.

    `stoi` maps string -> index and `itos` maps index -> string. Both are
    empty until `fit_on_moods` has been called.
    """

    def __init__(self):
        self.stoi = {}
        self.itos = {}

    def __len__(self):
        """
        Return the number of entries in the vocabulary, including the
        special tokens added by `fit_on_moods`.
        """
        return len(self.stoi)

    def fit_on_moods(self, moods):
        """
        Build the vocabulary from an arbitrarily nested list of moods.

        The distinct leaves of `moods` are sorted and numbered from 0;
        the special tokens '<sos>', '<eos>' and '<pad>' are then placed
        after them, in that order.
        """
        flat = []
        Tokenizer.flatten(moods, flat)
        vocab = sorted(set(flat))
        vocab.extend(['<sos>', '<eos>', '<pad>'])
        # Rebuild both tables from scratch so that fitting a second time
        # cannot leave words from an earlier fit behind in `stoi`.
        self.stoi = {word: index for index, word in enumerate(vocab)}
        self.itos = {index: word for word, index in self.stoi.items()}

    @staticmethod
    def flatten(l, flat):
        """
        Recursively flatten the nested list `l`, appending every
        non-list leaf to `flat` in traversal order. Nothing is returned;
        `flat` is extended in place.
        """
        if not isinstance(l, list):
            flat.append(l)
            return
        for el in l:
            Tokenizer.flatten(el, flat)

    def moods_to_token(self, states, reverse=False):
        """
        Recursively tokenize moods, while preserving the structure of
        the list. When `reverse` is true, the method translates the
        tokens back into the mood strings.

        A list argument is rewritten IN PLACE and the same list object is
        returned; a non-list argument is looked up and its translation is
        returned. A leaf missing from the vocabulary raises `KeyError`.
        """
        if not isinstance(states, list):
            table = self.itos if reverse else self.stoi
            return table[states]
        # Existing callers discard the return value and rely on the
        # argument itself being updated, so the in-place write is kept.
        for index, state in enumerate(states):
            states[index] = self.moods_to_token(state, reverse)
        return states

In [11]:
# Fit a single vocabulary over the mood states of every playlist at once.
all_mood_states = [*moods.values()]
tokenizer = Tokenizer()
tokenizer.fit_on_moods(all_mood_states)

In [13]:
# Swap every mood string in `moods` for its integer token. moods_to_token
# rewrites the nested lists in place, so its return value is not needed.
# NOTE: running this a second time raises KeyError, because the lists then
# hold integers that are not keys of the string -> index table.
for playlist_states in moods.values():
    tokenizer.moods_to_token(playlist_states)

## Vectorizing audio features
We now want to organize the audio features into a single vector. Note that we also want to preserve the order in which these features appear as well as the order of the songs relative to each other.

In [46]:
# Read the JSON file 'playlist-tracks-features'. The context manager closes
# the file handle as soon as parsing finishes instead of leaving it open
# until the garbage collector gets to it.
with open('playlist-tracks-features') as tracks_file:
    tracks = json.load(tracks_file)

In [72]:
# Names of the per-song entries that are kept in the feature vector.
useful_features = [
    'danceability', 'energy', 'key', 'loudness',
    'mode', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo',
]


def extract_features(songs):
    """
    Turn a list of song dictionaries into a two-dimensional list with one
    row per song.

    Each row holds the values of the song's keys that are listed in
    `useful_features`, ordered by sorted key name so the column order is
    the same on every call. Keys the song does not have are simply
    absent from its row.

    When `songs` equals `[None]`, it is handed back unchanged.
    """
    if songs == [None]:
        return songs
    return [
        [song[name] for name in sorted(song.keys()) if name in useful_features]
        for song in songs
    ]

In [76]:
# Map the first 5 characters of each key in `tracks` to the JSON-encoded
# feature matrix of its songs.
# NOTE(review): only keys exactly equal to 'iso01' / 'iso02' are skipped
# here, whereas the mood-extraction step skips any key that contains those
# substrings -- confirm which filter is intended.
features = {
    playlist_name[:5]: json.dumps(extract_features(songs))
    for playlist_name, songs in tracks.items()
    if playlist_name not in ('iso01', 'iso02')
}

# Combine into dataframe
Now with the mood states tokenized and the features discretized into vectors, we can store all of this into a DataFrame. Note that for readability, we also want to store the order in which the features were encoded.

In [93]:
# Serialise each entry of `moods` to a JSON string. Only values are
# reassigned (no keys are added or removed), so updating the dictionary
# while looping over it is safe.
for playlist_id in moods:
    moods[playlist_id] = json.dumps(moods[playlist_id])

In [100]:
# One single-column frame per dictionary, each indexed by the dictionary's
# keys.
df2 = pd.DataFrame(
    list(moods.values()),
    index=list(moods.keys()),
    columns=['moods_states'],
)
df1 = pd.DataFrame(
    list(features.values()),
    index=list(features.keys()),
    columns=['features'],
)
# Left join on the index: every row of the mood frame is kept and the
# matching feature string, where one exists, is attached to it.
df = df2.join(df1, how='left')

In [107]:
# Persist the fitted tokenizer. torch.save pickles the whole object, so
# loading it back requires the Tokenizer class to be defined or importable
# in the loading process.
torch.save(tokenizer, 'tokenizer.pth')
df.to_csv('train.csv')
# Record the feature names in sorted order. The context manager guarantees
# the file is flushed and closed.
with open('useful_features', 'w+') as feature_order_file:
    json.dump(sorted(useful_features), feature_order_file)