# Feature Extraction and Unsupervised Learning with Tensorboard
This notebook extracts features from a directory of midi files using Music21's jSymbolic feature extractors. This is a compute-heavy task and can take between 10-30 seconds per midi file. Once extraction has been performed, the results can be cached (using pickle) and dimensionality reduction/visualization can be done using tensorboard's embedding projector.

## Imports

In [1]:
import time, os, csv, json, math, sys, operator
sys.path.append('../python')

import utils
import dill as pickle
from pprint import pprint
from multiprocessing import Pool as ThreadPool
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

## Extract Features
It is recommended to run this function sparingly and cache the results with pickle below to refrain from unecessary compute.

In [None]:
symlink_dir = '../../data/query_symlinks'

num_threads = 8
pool = ThreadPool(num_threads)

start_time = time.time()
extracted_features = pool.map(utils.extract_features, [os.path.join(symlink_dir, n) for n in os.listdir(symlink_dir)][0:8])
print('Finished in {:.2f} seconds'.format(time.time() - start_time))

## Save/Load cached feature extraction

In [2]:
## save cache
# with open('../../data/extracted_features/4000_midi_files_features.pickle', 'w') as f:
#     pickle.dump(extracted_features, f)

## load cache
with open('../../data/extracted_features/4000_midi_files_features.pickle', 'r') as f:
    extracted_features = pickle.load(f)
    

In [3]:
# convert old feature extraction format
for i, feat in enumerate(extracted_features):
    if feat is not None:
        assert(len(feat[0]) == 1)
        extracted_features[i] = list(extracted_features[i]) # convert from immutable tuple
        extracted_features[i][0] = extracted_features[i][0][0]

## Load MSD Cache

In [4]:
start_time = time.time()
with open('../../data/msd.pickle', 'r') as f:
    msd = pickle.load(f)
print('Loaded in {:.2f} seconds'.format(time.time() - start_time))

Loaded in 11.28 seconds


## Tensorboard Labels/Metadata
Create a `metadata.tsv` file to assosciate msd track metadata to data points in tensorboard.

In [5]:
LOG_DIR='../../data/tmp_logdir'
with open(os.path.expanduser('~') + '/Documents/code/midi-dataset/data/match_scores.json', 'r') as f:
    match_scores = json.load(f)

In [6]:
def get_mid_to_track(match_scores):
    mid_to_track = {}
    for k, vals in match_scores.items():
        for v in vals:
            if not v in mid_to_track:
                mid_to_track[v] = {
                    'track': k,
                    'confidence': match_scores[k][v]
                }
            # if the new confidence score is higher than the last
            # update the track it points to
            elif match_scores[k][v] > mid_to_track[v]['confidence']:
                mid_to_track[v]['track'] = k
    return mid_to_track

def get_track_to_msd(msd):
    return { m['track_id']: m for m in msd }

def save_features_to_tsv(filename, features, mid_to_track, track_to_msd):
    with open(filename, 'w') as csvfile:
        
        fieldnames = ['path', 
                      'song_year',
                      'song_title',
                      'song_time_signature',
                      'song_tempo',
                      'song_key',
                      'song_mode',
                      'song_loudness',
                      'song_energy',
                      'song_duration',
                      'song_danceability',
                      'song_hotttnesss',
                      'artist_name',
                      'artist_terms',
                      'artist_mbtags',
                      'artist_hotttnesss',
                      'artist_location']

        writer = csv.DictWriter(csvfile, delimiter='\t', fieldnames=fieldnames)

        writer.writeheader()
        for f in extracted_features:
            if f is not None:
                path = f[1][-1]
                basename = os.path.basename(path)[0:-4]
                if basename in mid_to_track:
                    track = mid_to_track[basename]['track']
                    d = {k: v for k, v in track_to_msd[track].items() if k in fieldnames}
                    for k, v in d.items():
                        if isinstance(v, np.ndarray):
                            d[k] = v.tolist()[0] if len(v.tolist()) > 0 else ''#','.join(v.tolist())
                        try:
                            f = float(v)
                            if math.isnan(v):
                                d[k] = 0.0
                        except:
                            pass
                    writer.writerow(d)                     

mid_to_track = utils.get_midi_to_track_lut() #get_mid_to_track(match_scores)
track_to_msd = get_track_to_msd(msd)
tsv = save_features_to_tsv(os.path.join(LOG_DIR, 'metadata.tsv'), extracted_features, mid_to_track, track_to_msd)

## Analyze and remove unnecessary features
Clean up the results from the music21 feature extraction.

In [None]:
# vec = []
# for f in extracted_features:
#     if f is not None:
#         arr = f[0]
#         arr.pop(0) # first element is an empty string
#         arr.pop(-1) # last element is an empty string
#         vec.append(np.array(arr))
# embeddings = np.asarray(vec)
# print(embeddings[0])

In [7]:
# removes (in-place) features that have non-zero values less
# than percent_threshold
def remove_weak_features(extracted_features, min_percent_of_tracks_with_feature=0.05):
    feat_labels = { k: 0 for k in extracted_features[0][1] }
    for track in extracted_features:
        if track is not None:
            assert(len(track[1]) == len(track[0]))
            # for each feature label in the track
            for i, label in enumerate(track[1]):
                # if the corresponding feature value is not zero
                if track[0][i] != 0:
                    # increment its count in our dict
                    feat_labels[label] = feat_labels[label] + 1
    sorted_x = sorted(feat_labels.items(), key=operator.itemgetter(1), reverse=True)
    # keep only the features that at least 10%
    # of all tracks have a non-zero value for
    min_percent_of_tracks_with_feature = 0.05
    min_count = math.ceil(len(extracted_features) * min_percent_of_tracks_with_feature)
    feats_to_remove = [k for k, v in feat_labels.iteritems() if v < min_count]
    indicies_to_remove = [extracted_features[0][1].index(x) for x in feats_to_remove]
    print('Removed {}/{} features, or {:.2f}%. New feature size is {}.'
              .format(len(indicies_to_remove), 
                      len(feat_labels.keys()), 
                      float(len(indicies_to_remove))/float(len(feat_labels.keys())),
                      len(feat_labels.keys()) - len(indicies_to_remove)))
    for track in extracted_features:
        if track is not None:
            track[1] = [i for j, i in enumerate(track[1]) if j not in indicies_to_remove]
            track[0] = [i for j, i in enumerate(track[0]) if j not in indicies_to_remove]

In [8]:
# remove the Identifier and Path key => value pairs from
# the extracted features
# WARNING: the Path is required to create the tensorboard
# embeddings metadata, so don't run this before creating
# those embeddings
for track in extracted_features:
    if track is not None:
        track[0].pop(0)
        track[0].pop(-1)
        track[1].pop(0)
        track[1].pop(-1)
        
remove_weak_features(extracted_features)

Removed 152/309 features, or 0.49%. New feature size is 157.


## Tensorboard Embeddings
Expose the embeddings to be used tensorboard.

In [9]:
sess = tf.Session()
embeddings = np.asarray([x[0] for x in extracted_features if x is not None])
# create embeddings var
emb = tf.Variable(embeddings, name='embeddings')

# embedding projector
summary_writer = tf.train.SummaryWriter(LOG_DIR)
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = emb.name
embedding.metadata_path = os.path.join(LOG_DIR, 'metadata.tsv')
projector.visualize_embeddings(summary_writer, config)

# init and run the session
init = tf.global_variables_initializer()
sess.run(init)

# save checkpoint
saver = tf.train.Saver()
saver.save(sess, os.path.join(LOG_DIR, "model.ckpt"), 0)

Instructions for updating:
Please switch to tf.summary.FileWriter. The interface and behavior is the same; this is just a rename.


'../../data/tmp_logdir/model.ckpt-0'