This is where the metadata, feature space, and genres are read in and stored in the "songs" dictionary by `track_id`.

In [83]:
import sklearn
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import SGDClassifier
from random import sample as rand_sample
from random import choice as rand_choice
from sklearn.cluster import KMeans
from maps import FrozenMap, FixedKeyMap

In [2]:
def read_data(file):
    '''
    description

    :param file:        file we're reading in with the data (.csv)

    :return             a 'songs' dictionary
                        key: track_id
                        value(s): a 'data' dictionary, which contains:
                            metadata,
                            label,
                            features,
                            genres
    '''
    
    songs = {}

    with open(file, 'r') as f:

        for song in f:
            
            line = song.split(',')
                        
            # ignore first line + ensure that label exists
            if line[0] != 'label' and line[0]:
                label = int(line[0])
                metadata, data = {}, {}
                genres = []
                track_id = line[1]
                
                features = {}
                features['artist_popularity'] = line[4]
                features['artist_followers'] = line[5]

                metadata['artist_id'] = line[2]
                metadata['artist_name'] = line[3]
                
                # check if genre field has multiple genres or just one
                if line[6]:
                    genres.append(line[6][1:]) if '"' in line[6] else genres.append(line[6])

                count = 0
                
                # if '"' present or next value is a string only containing alphabets,
                # then add to genres list. set count to i+1 when you reach last genre
                for i in range(7, len(line)):
                    if '"' in line[i]:
                        genres.append(line[i][:-1])
                        count = i+1
                        break
                    if line[i].isalpha():
                        genres.append(line[i])
                
                # single or no genres, get all other features
                if count == 0:
                    features['instrumentalness'] = float(line[7])
                    features['duration_ms'] = float(line[8])
                    features['time_signature'] = float(line[9])
                    features['acousticness'] = float(line[10])
                    features['speechiness'] = float(line[11])
                    features['energy'] = float(line[12])
                    features['loudness'] = float(line[13])
                    features['tempo'] = float(line[14])
                    features['key'] = float(line[15])
                    features['valence'] = float(line[16])
                    features['danceability'] = float(line[17])
                    features['liveness'] = float(line[18][:-1])
                
                # multiple genres, get all other features using count
                else:
                    features['instrumentalness'] = float(line[count])
                    features['duration_ms'] = float(line[count+1])
                    features['time_signature'] = float(line[count+2])
                    features['acousticness'] = float(line[count+3])
                    features['speechiness'] = float(line[count+4])
                    features['energy'] = float(line[count+5])
                    features['loudness'] = float(line[count+6])
                    features['tempo'] = float(line[count+7])
                    features['key'] = float(line[count+8])
                    features['valence'] = float(line[count+9])
                    features['danceability'] = float(line[count+10])
                    features['liveness'] = float(line[count+11][:-1])
                
                # add metadata, features, genres, and label to data
                data['metadata'] = metadata
                data['features'] = features
                data['genres'] = genres
                data['label'] = label
                
                # add data to songs by track_id
                songs[track_id] = data

    return songs

This is where genres are read in, with each genre getting a unique `genre_id`.

In [3]:
def isFloat(string):
    '''
    description

    :param string:      the string we're testing

    :return             True if the string is a float, False otherwise
    '''
    try:
        float(string)
        return True
    except ValueError:
        return False

def read_genres(file):
    '''
    description
    
    :param file:        file we're reading in with the data (.csv)
    
    :return             a 'genres' dictionary
                        key: genre
                        value: unique genre_id
    '''
    genre_mapping = {}
    genre_id = 0
    
    with open(file, 'r') as f:

        for song in f:

            line = song.split(',')

            if line[0]:

                genres = []
                
                # check if genres field is empty or contains one/multiple genres
                if line[6]:
                    genres.append(line[6][1:]) if '"' in line[6] else genres.append(line[6])

                count = 0
                
                # account for multiple genres
                for i in range(7, len(line)):
                    if '"' in line[i]:
                        genres.append(line[i][:-1])
                        count = i+1
                        break
                    if line[i].isalpha():
                        genres.append(line[i])
                
                # for genres in the genres list, ensure that the genre is not a float, create unique genre_id
                for genre in genres:
                    if (genre not in genre_mapping) and ('"' not in genre) and (not isFloat(genre)):
                        genre_mapping[genre] = genre_id
                        genre_id += 1
        
    return genre_mapping

This is where the data is actually read in. We read in the `likes.csv` + `dislikes.csv` data, and merge them into one dictionary, `song_data`.

In [21]:
song_data = FrozenMap(read_data('data.csv'))
genres = FrozenMap(read_genres('data.csv'))
feature_names = tuple(next(iter(song_data.values()))['features'].keys())

In [22]:
# song_data

FrozenMap({'6qNeBtFhG5Ir3VsotFCNrU': {'metadata': {'artist_id': '3SEw2qamdOWyVZtzKxWTTg', 'artist_name': 'The YellowHeads'}, 'features': {'artist_popularity': '0.28', 'artist_followers': '0.004217', 'instrumentalness': 0.861, 'duration_ms': 0.123619573, 'time_signature': 0.8, 'acousticness': 0.005, 'speechiness': 0.0592, 'energy': 0.704, 'loudness': 0.826888183, 'tempo': 0.572721323, 'key': 0.545454545, 'valence': 0.506, 'danceability': 0.773, 'liveness': 0.0864}, 'genres': [], 'label': 1}, '6w51zi8L7yqelzUiFKwDEC': {'metadata': {'artist_id': '7ANeFdhioipksT9lqg0Ay6', 'artist_name': 'Ego Ella May'}, 'features': {'artist_popularity': '0.25', 'artist_followers': '0.001566', 'instrumentalness': 0.0, 'duration_ms': 0.051938837, 'time_signature': 0.8, 'acousticness': 0.766, 'speechiness': 0.274, 'energy': 0.625, 'loudness': 0.774759961, 'tempo': 0.355111854, 'key': 0.272727273, 'valence': 0.448, 'danceability': 0.389, 'liveness': 0.102}, 'genres': ['uk contemporary r&b'], 'label': 1}, '5dL5

In [6]:
# Sanity check
l, dl = 0, 0
for x in song_data.values():
    if x['label'] == 1: l += 1
    else: dl += 1
print(' likes:', l, '\n', 'dislikes:', dl)

 likes: 1919 
 dislikes: 3790


In [26]:
for k in genres:
    print(genres[k], k)

0 artist_genres
1 instrumentalness
2 acousticness
3 speechiness
4 energy
5 tempo
6 key
7 valence
8 danceability
9 uk contemporary r&b
10 future garage
11 house
12 nu disco
13 christmas
14 funk
15 motown
16 soul
17 dub
18 reggae
19 roots reggae
20 hindustani classical
21 indian classical
22 bass music
23 microhouse
24 indie jazz
25 dance pop
26 pop
27 chicago house
28 tech house
29 alternative dance
30 indietronica
31 new rave
32 afro house
33 filter house
34 drum and bass
35 drumfunk
36 jungle
37 neurofunk
38 uk garage
39 world fusion
40 deep house
41 minimal tech house
42 ambient
43 drone
44 electra
45 fluxwork
46 mandible
47 downtempo
48 electronic
49 vaporwave
50 wonky
51 acid idm
52 big beat
53 breakbeat
54 deep soul house
55 hip hop
56 rap
57 underground hip hop
58 aussietronica
59 french indietronica
60 british invasion
61 merseybeat
62 rock
63 deep groove house
64 bow pop
65 focus
66 neoclassical
67 adult standards
68 folk
69 soft rock
70 tropical house
71 deep melodic euro hous

In [8]:
feature_names

('artist_popularity',
 'artist_followers',
 'instrumentalness',
 'duration_ms',
 'time_signature',
 'acousticness',
 'speechiness',
 'energy',
 'loudness',
 'tempo',
 'key',
 'valence',
 'danceability',
 'liveness')

In [113]:
def filter_features(data, discard):
    '''
    Filters out features from data. Does not modify passed-in object (creates a copy).
    
    :param data:        dict of data, same format as song_data
    :param discard:     feature names to discard
                        
    :return             copy of data, with filtered features
    '''
    out = dict(data)
    
    for id_ in data:
        out[id_]['features'] = { k:v for k, v in out[id_]['features'].items() if k not in discard }
    
    return FrozenMap(out)

def split_data(data, p):
    '''
    Splits data into training and validation sets for simple classification.
    
    :param data:    complete labeled data
    :param p:       proportion of data to use for validation
                    
    :return         train_data, validation_data
    '''
    validation_ids = rand_sample(list(data), int(len(data) * p))
    
    validation_data = {k:data[k] for k in validation_ids}
    train_data = {k:v for k, v in data.items() if k not in validation_data}

    return FrozenMap(train_data), FrozenMap(validation_data)


In [183]:
def test_cluster_size(data, max_cluster):
    '''
    description

    :param data:        the data, obtained from read_data()
    :param max_cluster: the max number of clusters to km.inertia_ on

    :return             None (prints km_inertia for each number of clusters)
    '''
    train_data = {k:v for k, v in data.items()}
    x_train = np.array([list(x['features'].values()) for x in train_data.values()])
    
    for i in range(2, max_cluster+1):
        km = KMeans(i, init='random', max_iter=300, random_state=0, n_init=30)
        km.fit(x_train)
        print(i, km.inertia_)

test_cluster_size(song_data, 10)

2 3123.394736313185
3 2481.716632257166
4 2152.068476365455
5 1944.165017191881
6 1824.5853227488217
7 1711.4623613593121
8 1612.5531653294152
9 1536.3092762289932
10 1474.2539202926469


In [184]:
def get_kmeans_clusters(data, n_clusters, songs_by_cluster={}, add_to_songs=True):
    '''
    description

    :param data:        the data, obtained from read_data()
    :param n_clusters:  the number of clusters used in K-means

    :return             updated data with each song's numeric cluster label
    '''
    
    data = dict(data)
    
    if n_clusters in songs_by_cluster.keys(): raise ValueError(str(n_clusters) + ' clusters already computed')
    
    songs_by_cluster[n_clusters] = { i:set() for i in range(n_clusters) }
        
    train_data, track_ids = {k:v for k, v in data.items()}, list(data)

    x_train = np.array([list(x['features'].values()) for x in train_data.values()])
    y_train = [x['label'] for x in train_data.values()]

    km = KMeans(n_clusters, init='random', max_iter=300, random_state=0, n_init=30)
    km.fit(x_train)

    cluster_map = pd.DataFrame()

    cluster_map['data'], cluster_map['cluster'] = x_train.tolist(), km.labels_
    cluster_map['label'], cluster_map['track_id'] = y_train, track_ids

    for track_id in track_ids:
        
        cluster = cluster_map[cluster_map['track_id'] == track_id]['cluster'].tolist()[0]
        
        songs_by_cluster[n_clusters][cluster].add(track_id)
        
        if add_to_songs: data[track_id]['cluster'] = cluster
    
    return FrozenMap(data), FrozenMap(songs_by_cluster)

In [185]:
song_data, songs_by_cluster = get_kmeans_clusters(song_data, 4)
# song_data
songs_by_cluster

FrozenMap({4: {0: {'0icf7lpAs5YeVjeWeyaHKA', '738e4xzLmRkaY1qMDt3mh4', '63OcRZVaLUhgbNwvXkg7J4', '3EAWySfcmKUHey2WZfMdBo', '2KtCDaAIe6MOyDjyeU90M2', '4SajM6eQTpTwi6JzBSWMDh', '4q20xyUnDywZ6g2Pi7ALoj', '2MmhqAtFgdIAaVYXpSN4xY', '5a92b8VExfvFZIUl2YQwjQ', '0Ojomvm7Em38GQv8eMOMpC', '36Wb11URNhFT3qDn6j64xk', '1PADgC899dnHdG6fXHBLS9', '7xeCLv89kIiHIx5ffwbBxf', '6BLoKpgWtQbYqFDFUp9ApX', '3yJIV1U93Bpgw2cRZ6VUiw', '1djH0BdVjBuAKoGBNrZLh3', '0hc0msGemTbCdBcRQidWdj', '6ThcHwNAMmpkSHa218UbEG', '3DPFmwFtV5ElQaTniLOdgk', '4Uy7a8FIQFgfQTSvg7Zzzh', '7GrweTif4geFOTWpCnMrfe', '6s6GuzZTWH2VCGYnBrBGM7', '1Niul1Wd9EUEvz8y6DoutM', '6LORJbO7yxlBRopiPfeXxS', '4tMRfkavPuLWHjnh8YXvNI', '7sCOwMK98Bc3f6hFS0jgkM', '50I80nLHGx7iS960XRmjkm', '6YWixadIV17c61jHGKGLYw', '6Vc5wAMmXdKIAM7WUoEb7N', '4eC3je099P4EwiKwLcgMDM', '3TjhLjBbR74kY1MicLTD9n', '7FX5h0FMJpabBfFpjU13EL', '1dxCB2G5CCwHiGxb4jEXKt', '6V0cdiNRgprv11NGheX87j', '3sJAII5WDYTkP8tKTNObNq', '6dB6YYgBOPtzt37Fa0M8Eh', '2mUnKWALh2hflvgWHbs7fa', '0H3VDNdZfeTxRyU3dL

In [189]:
SUPPORTED_ALGS = ['svc', 'lsvc', 'sgd']
ACTIVE_ALGS = ['sgd']
AL_STRATS = ['random', 'uncertainty']
CLASSES = [0, 1]

def get_xy(song_data, ids=None):
    if ids: song_data = {k:song_data[k] for k in ids}
    return [x['features'] for x in song_data.values()], [x['label'] for x in song_data.values()]

def get_default_vectorizer(song_data_x):
    vect = DictVectorizer(sort=True)
    return vect.fit(song_data_x)

def get_default_learner(alg):
    if self.algorithm == 'svc':
            return SVC(gamma='auto') # defaults
        
    elif self.algorithm == 'lsvc':
        return LinearSVC(loss='hinge', penalty='l2') # defaults

    elif self.algorithm == 'sgd':
        return SGDClassifier(loss='hinge', penalty='l2') # defaults
    
    else: raise ValueError('unknown algorithm: ' + str(alg))
    

In [34]:
song_data_x, song_data_y = get_xy(song_data)
default_vectorizer = get_default_vectorizer(song_data_x)

In [192]:
class Classifier(object):
    def __init__(self, algorithm, train_data, vectorizer, is_active, active_init_n=None, clusters=None, al_strat=None):
        '''
        description
        
        :param algorithm:   the name of the learning algorithm to use
        :param train_data:  dict
        :param vectorizer:  DictVectorizer, fit on train_data features
        :param is_active:   bool
        :param clusters:    { cluster1:[track_ids], cluster2:[track_ids], ...}
                            
        :return             Classifier instance
        '''
        
        if algorithm not in SUPPORTED_ALGS:
            raise ValueError('Unsupported algorithm: ' + str(algorithm))
        if al_strat not in AL_STRATS:
            raise ValueError('Unsupported AL strategy: ' + str(al_strat))
        
        self.algorithm = algorithm
        self.train_data = train_data
        self.vectorizer = vectorizer
        self._is_active = is_active
        self._is_online = algorithm in ACTIVE_ALGS
        self._uses_clusters = bool(clusters)
        self.strategy = al_strat
        self.clusters = clusters
        self.active_init_n = active_init_n
        
        if not (self.uses_clusters() and self.is_online()):
            raise ValueError('Algorithm must be online if using clusters.')
        
        self.init_learners(clusters)
    
    
    def init_learners(self, clusters):
        
        if not self.is_active():
            self.x_train, self.y_train = get_xy(self.train_data)
        
        else:
            
            self.unseen_ids = list(self.train_data.keys())
            self.train_ids = set()
            
            if not self.uses_clusters():
                self.learner = get_default_learner(self.algorithm)
                self.active_learn(n=self.active_init_n, init=True)
            
            else:
                self.learners = { i:get_default_learner(self.algorithm) for i in range(len(self.clusters)) }
                
    
    
    def is_active(self):
        return self._is_active
    
    
    def is_online(self):
        return self._is_online
    
    
    def uses_clusters(self):
        return self._uses_clusters
    
    
    def transform(self, x):
        '''
        Vectorizes list of feature(s)
        '''
        return self.vectorizer.transform(x)
    
    
    def fit(self):
        '''
        '''
        if not self.is_active():
            self.learner.fit(self.vectorizer.transform(self.x_train), self.y_train)
        else:
            self.active_learn()
    
    
    def active_learn(self, n=1, init=False, learner_id=None):
        
        # TODO: this sometimes fails on initialization for
        # offline learners, when the random sample only contains
        # one class. 
        # Happens rarely enough that we can just ignore for now.
        
        strategy = self.strategy
        if init: strategy = 'random'
        
        if strategy == 'random':
            sample_ids = rand_sample(self.unseen_ids, n)
            for song_id in sample_ids: self.unseen_ids.remove(song_id)
            self.train_ids.update(sample_ids)
        
        elif strategy == 'uncertainty':
            unseen_x, unseen_y = get_xy(self.train_data, ids=self.unseen_ids)
            unseen_x_scores = self.learner.decision_function(self.transform(unseen_x))
            
            # get index of smallest absolute value (probability or distance from decision boundary)
            # from unseen_x_scores, pop the corresponding entry from self.unseen_ids
            sample_ids = [self.unseen_ids.pop(np.argmin(np.abs(unseen_x_scores)))]
            self.train_ids.update(sample_ids)
        
        else:
            raise ValueError('AL strategy not implemented: ' + str(self.strategy))
        
        if not self.is_online():
            sample_x, sample_y = get_xy(self.train_data, ids=self.train_ids)
            self.learner.fit(self.transform(sample_x), sample_y)
        else:
            if not self.uses_clusters():
                sample_x, sample_y = get_xy(self.train_data, ids=sample_ids)
                if init: self.learner.partial_fit(self.transform(sample_x), sample_y, CLASSES)
                else: self.learner.partial_fit(self.transform(sample_x), sample_y)
            else:
                raise ValueError('not implemented')
    
    
    def predict(self, x, learner=None):
        if learner:
            return learner.predict(x)[0]
        else:
            return self.learner.predict(x)[0]
    
    
    def validate(self, validation_data):
        '''
        Predicts on all instances in validation_data. Returns accuracy.
        '''
        x_test, y_test = get_xy(validation_data)
        
        correct = 0
        for i in range(len(validation_data)):
            if self.predict(self.transform([x_test[i]])) == y_test[i]: correct += 1
        
        return correct / len(validation_data)


In [181]:
def run_active_experiment(iterations, alg, data, p, init_n, strat='random', discard=None):
    '''
    Trains :alg: on song_data (with or without filtering features), splits into train/validation,
    and returns accuracy on validation data.
    '''
    if discard: data = filter_features(data, discard)
    
    train_data, validation_data = split_data(data, p)
    
    clf = Classifier(alg, train_data, default_vectorizer, True, active_init_n=init_n, al_strat=strat)
    accuracies = [clf.validate(test_vd)]
    
    for i in range(iterations):
        clf.active_learn()
        accuracies.append(clf.validate(test_vd))
    
    return accuracies

def run_active_suite():
    for alg in SUPPORTED_ALGS:
        for strat in AL_STRATS:
            init_n = 5 if alg in ACTIVE_ALGS else 8
            accs = run_active_experiment(50, alg, song_data, 0.1, init_n, strat=strat)
            print(alg, 'accs w/', strat)
            for i in range(len(accs)):
                print(int(accs[i]*10000)/100, end='\t')
            print()

def run_clusters_experiment(iterations, alg, data, p, init_n, num_clusters, strat='random', discard=None):
    '''
    Trains :alg: on song_data (with or without filtering features), splits into train/validation,
    and returns accuracy on validation data.
    '''
    if discard: data = filter_features(data, discard)
    
    train_data, validation_data = split_data(data, p)
    
    clf = Classifier(
        alg,
        train_data,
        default_vectorizer,
        True,
        active_init_n=init_n,
        al_strat=strat,
        clusters=songs_by_cluster[num_clusters]
    )
    accuracies = [clf.validate(test_vd)]
    
    for i in range(iterations):
        clf.active_learn()
        accuracies.append(clf.validate(test_vd))
    
    return accuracies
    
def run_clusters_suite():
    for alg in ACTIVE_ALGS:
        for strat in AL_STRATS:
            accs = run_active_experiment(50, alg, song_data, 0.1, 5, 4, strat=strat)
            print(alg, 'accs w/', strat)
            for i in range(len(accs)):
                print(int(accs[i]*10000)/100, end='\t')
            print()

In [182]:
run_active_suite()
# run_clusters_suite()

svc accs w/ random
67.22	79.92	67.22	67.22	67.22	67.22	67.22	84.75	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	67.22	89.13	32.77	89.13	67.22	67.22	67.22	67.22	67.22	
svc accs w/ uncertainty
32.77	72.48	67.22	87.81	67.22	86.94	32.77	87.81	32.77	79.05	32.77	65.55	67.22	73.18	67.22	85.53	67.22	88.51	32.77	85.97	32.77	78.17	67.22	87.29	67.22	88.69	67.22	88.6	67.22	88.78	32.77	88.6	32.77	88.78	67.22	88.25	32.77	88.25	67.22	88.78	32.77	88.34	67.22	88.6	32.77	88.78	32.77	88.78	67.22	88.78	32.77	
lsvc accs w/ random
70.81	80.63	79.31	76.59	84.22	83.26	81.94	82.29	85.8	86.06	87.11	86.94	87.11	87.81	87.9	86.32	85.18	85.8	86.85	86.94	87.55	86.59	86.59	86.67	86.94	87.02	86.94	87.37	87.2	87.2	87.02	85.8	85.36	85.27	85.1	85.18	85.18	85.36	85.27	85.27	85.27	85.36	85.62	85.8	85.8	85.97	86.76	86.59	86.67	87.37	87.37	
lsvc accs w/ uncertainty
66.

In [108]:
def run_benchmark(alg, data, p, discard=None):
    '''
    Trains :alg: on song_data (with or without filtering features), splits into train/validation,
    and returns accuracy on validation data.
    '''
    if discard: data = filter_features(data, discard)
    
    train_data, validation_data = split_data(data, p)
    
    clf = Classifier(alg, train_data, default_vectorizer, False)
    clf.fit()
    
    return clf, train_data, validation_data

In [97]:
test_clf, test_td, test_vd = run_benchmark('svc', song_data, 0.2)
print('svc acc:', test_clf.validate(test_vd))
test_clf, test_td, test_vd = run_benchmark('lsvc', song_data, 0.2)
print('lsvc acc:', test_clf.validate(test_vd))
test_clf, test_td, test_vd = run_benchmark('sgd', song_data, 0.2)
print('sgd acc:', test_clf.validate(test_vd))

svc acc: 0.64943032427695
lsvc acc: 0.9307624890446976
sgd acc: 0.9246275197195443
