This is where the metadata, feature space, and genres are read in and stored in the "songs" dictionary by `track_id`.

In [19]:
import sklearn
import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from random import sample as rand_sample
from random import choice as rand_choice
from sklearn.cluster import KMeans
from maps import FrozenMap, FixedKeyMap

In [2]:
def read_data(file):
    '''
    description

    :param file:        file we're reading in with the data (.csv)

    :return             a 'songs' dictionary
                        key: track_id
                        value(s): a 'data' dictionary, which contains:
                            metadata,
                            label,
                            features,
                            genres
    '''
    
    songs = {}

    with open(file, 'r') as f:

        for song in f:
            
            line = song.split(',')
                        
            # ignore first line + ensure that label exists
            if line[0] != 'label' and line[0]:
                label = int(line[0])
                metadata, data = {}, {}
                genres = []
                track_id = line[1]
                
                features = {}
                features['artist_popularity'] = line[4]
                features['artist_followers'] = line[5]

                metadata['artist_id'] = line[2]
                metadata['artist_name'] = line[3]
                
                # check if genre field has multiple genres or just one
                if line[6]:
                    genres.append(line[6][1:]) if '"' in line[6] else genres.append(line[6])

                count = 0
                
                # if '"' present or next value is a string only containing alphabets,
                # then add to genres list. set count to i+1 when you reach last genre
                for i in range(7, len(line)):
                    if '"' in line[i]:
                        genres.append(line[i][:-1])
                        count = i+1
                        break
                    if line[i].isalpha():
                        genres.append(line[i])
                
                # single or no genres, get all other features
                if count == 0:
                    features['instrumentalness'] = float(line[7])
                    features['duration_ms'] = float(line[8])
                    features['time_signature'] = float(line[9])
                    features['acousticness'] = float(line[10])
                    features['speechiness'] = float(line[11])
                    features['energy'] = float(line[12])
                    features['loudness'] = float(line[13])
                    features['tempo'] = float(line[14])
                    features['key'] = float(line[15])
                    features['valence'] = float(line[16])
                    features['danceability'] = float(line[17])
                    features['liveness'] = float(line[18][:-1])
                
                # multiple genres, get all other features using count
                else:
                    features['instrumentalness'] = float(line[count])
                    features['duration_ms'] = float(line[count+1])
                    features['time_signature'] = float(line[count+2])
                    features['acousticness'] = float(line[count+3])
                    features['speechiness'] = float(line[count+4])
                    features['energy'] = float(line[count+5])
                    features['loudness'] = float(line[count+6])
                    features['tempo'] = float(line[count+7])
                    features['key'] = float(line[count+8])
                    features['valence'] = float(line[count+9])
                    features['danceability'] = float(line[count+10])
                    features['liveness'] = float(line[count+11][:-1])
                
                # add metadata, features, genres, and label to data
                data['metadata'] = metadata
                data['features'] = features
                data['genres'] = genres
                data['label'] = label
                
                # add data to songs by track_id
                songs[track_id] = data

    return songs

This is where genres are read in, with each genre getting a unique `genre_id`.

In [3]:
def isFloat(string):
    '''
    description

    :param string:      the string we're testing

    :return             True if the string is a float, False otherwise
    '''
    try:
        float(string)
        return True
    except ValueError:
        return False

def read_genres(file):
    '''
    description
    
    :param file:        file we're reading in with the data (.csv)
    
    :return             a 'genres' dictionary
                        key: genre
                        value: unique genre_id
    '''
    genre_mapping = {}
    genre_id = 0
    
    with open(file, 'r') as f:

        for song in f:

            line = song.split(',')

            if line[0]:

                genres = []
                
                # check if genres field is empty or contains one/multiple genres
                if line[6]:
                    genres.append(line[6][1:]) if '"' in line[6] else genres.append(line[6])

                count = 0
                
                # account for multiple genres
                for i in range(7, len(line)):
                    if '"' in line[i]:
                        genres.append(line[i][:-1])
                        count = i+1
                        break
                    if line[i].isalpha():
                        genres.append(line[i])
                
                # for genres in the genres list, ensure that the genre is not a float, create unique genre_id
                for genre in genres:
                    if (genre not in genre_mapping) and ('"' not in genre) and (not isFloat(genre)):
                        genre_mapping[genre] = genre_id
                        genre_id += 1
        
    return genre_mapping

This is where the data is actually read in. We read in the `likes.csv` + `dislikes.csv` data, and merge them into one dictionary, `song_data`.

In [21]:
song_data = FrozenMap(read_data('data.csv'))
genres = FrozenMap(read_genres('data.csv'))
feature_names = tuple(next(iter(song_data.values()))['features'].keys())

In [22]:
# song_data

FrozenMap({'6qNeBtFhG5Ir3VsotFCNrU': {'metadata': {'artist_id': '3SEw2qamdOWyVZtzKxWTTg', 'artist_name': 'The YellowHeads'}, 'features': {'artist_popularity': '0.28', 'artist_followers': '0.004217', 'instrumentalness': 0.861, 'duration_ms': 0.123619573, 'time_signature': 0.8, 'acousticness': 0.005, 'speechiness': 0.0592, 'energy': 0.704, 'loudness': 0.826888183, 'tempo': 0.572721323, 'key': 0.545454545, 'valence': 0.506, 'danceability': 0.773, 'liveness': 0.0864}, 'genres': [], 'label': 1}, '6w51zi8L7yqelzUiFKwDEC': {'metadata': {'artist_id': '7ANeFdhioipksT9lqg0Ay6', 'artist_name': 'Ego Ella May'}, 'features': {'artist_popularity': '0.25', 'artist_followers': '0.001566', 'instrumentalness': 0.0, 'duration_ms': 0.051938837, 'time_signature': 0.8, 'acousticness': 0.766, 'speechiness': 0.274, 'energy': 0.625, 'loudness': 0.774759961, 'tempo': 0.355111854, 'key': 0.272727273, 'valence': 0.448, 'danceability': 0.389, 'liveness': 0.102}, 'genres': ['uk contemporary r&b'], 'label': 1}, '5dL5

In [6]:
# Sanity check
l, dl = 0, 0
for x in song_data.values():
    if x['label'] == 1: l += 1
    else: dl += 1
print(' likes:', l, '\n', 'dislikes:', dl)

 likes: 1919 
 dislikes: 3790


In [26]:
for k in genres:
    print(genres[k], k)

0 artist_genres
1 instrumentalness
2 acousticness
3 speechiness
4 energy
5 tempo
6 key
7 valence
8 danceability
9 uk contemporary r&b
10 future garage
11 house
12 nu disco
13 christmas
14 funk
15 motown
16 soul
17 dub
18 reggae
19 roots reggae
20 hindustani classical
21 indian classical
22 bass music
23 microhouse
24 indie jazz
25 dance pop
26 pop
27 chicago house
28 tech house
29 alternative dance
30 indietronica
31 new rave
32 afro house
33 filter house
34 drum and bass
35 drumfunk
36 jungle
37 neurofunk
38 uk garage
39 world fusion
40 deep house
41 minimal tech house
42 ambient
43 drone
44 electra
45 fluxwork
46 mandible
47 downtempo
48 electronic
49 vaporwave
50 wonky
51 acid idm
52 big beat
53 breakbeat
54 deep soul house
55 hip hop
56 rap
57 underground hip hop
58 aussietronica
59 french indietronica
60 british invasion
61 merseybeat
62 rock
63 deep groove house
64 bow pop
65 focus
66 neoclassical
67 adult standards
68 folk
69 soft rock
70 tropical house
71 deep melodic euro hous

In [8]:
feature_names

('artist_popularity',
 'artist_followers',
 'instrumentalness',
 'duration_ms',
 'time_signature',
 'acousticness',
 'speechiness',
 'energy',
 'loudness',
 'tempo',
 'key',
 'valence',
 'danceability',
 'liveness')

In [33]:
def get_xy(song_data, ids=None):
    if ids: song_data = {k:song_data[k] for k in song_data}
    return [x['features'] for x in song_data.values()], [x['label'] for x in song_data.values()]

def get_default_vectorizer(song_data_x):
    vect = DictVectorizer(sort=True)
    return vect.fit(song_data_x)
    

In [34]:
song_data_x, song_data_y = get_xy(song_data)
default_vectorizer = get_default_vectorizer(song_data_x)

In [47]:
class Classifier(object):
    def __init__(self, algorithm, train_data, vectorizer, is_online, online_init_n=5, clusters=None):
        '''
        description
        
        :param algorithm:   the name of the learning algorithm to use
        :param train_data:  dict
        :param vectorizer:  DictVectorizer, fit on train_data features
        :param is_online:   bool
        :param clusters:    { cluster1:[track_ids], cluster2:[track_ids], ...}
                            
        :return             Classifier instance
        '''
         
        if is_online and algorithm not in ['sgd']:
            raise ValueError('Invalid online algorithm: ' + str(algorithm))
        
        if algorithm == 'svm':
            
            self.learner = LinearSVC(loss='hinge', penalty='l2') # defaults
        
        elif algorithm == 'sgd':
            
            self.learner = SGDClassifier(loss='hinge', penalty='l2') # defaults
        
        else: raise ValueError('Unsupported algorithm: ' + str(algorithm))
        
        self.algorithm = algorithm
        self.train_data = train_data
        self.vectorizer = vectorizer
        self._is_online = is_online
        self._uses_clusters = bool(clusters)
        if self._uses_clusters: self.clusters = clusters
            
        if not is_online:
            temp, self.y_train = get_xy(train_data)
            self.x_train = self.vectorizer.transform(temp)
        else:
            self.init_online(online_init_n)
    
    
    def init_online(self, n):
        '''
        Initializes online learner
        '''
        
        self.train_ids = set(self.train_data.keys())
        self.used_ids = set()
        
        if self._uses_clusters:
           pass
        else:
            sample_ids = rand_sample(self.train_ids, n)
            self.train_ids.difference_update(sample_ids)
            self.used_ids.update(sample_ids)
            sample_x, sample_y = get_xy(self.train_data, ids=sample_ids)
            self.fit(sample_x, sample_y)
    
    
    def transform(self, x):
        return self.vectorizer.transform(x)
    
    
    def is_online(self):
        return self._is_online
    
    
    def uses_clusters(self):
        return self._uses_clusters
    
    
    def fit_all(self):
        if self._is_online: raise AttributeError('Cannot fit all, classifier is online.')
        self.learner.fit(self.x_train, self.y_train)
    
    
    def fit(self, x, y):
        '''
        '''
        if not self._is_online: raise AttributeError('Cannot online fit, classifier is not online.')
        self.learner.partial_fit(x, y)
    
    
    def predict(self, x):
        if self.algorithm in ['svm', 'sgd']:
            return self.learner.predict(x)[0]
        raise AttributeError('Prediction not implemented for: ' + self.algorithm)
    
    
    def validate(self, validation_data):
        '''
        Predicts on all instances in validation_data. Returns accuracy.
        '''
        x_test, y_test = get_xy(validation_data)
        
        correct = 0
        for i in range(len(validation_data)):
            if self.predict(self.transform([x_test[i]])) == y_test[i]: correct += 1
        
        return correct / len(validation_data)


In [48]:
def filter_features(data, discard):
    '''
    Filters out features from data. Does not modify passed-in object (creates a copy).
    
    :param data:        dict of data, same format as song_data
    :param discard:     feature names to discard
                        
    :return             copy of data, with filtered features
    '''
    out = dict(data)
    
    for id_ in data:
        out[id_]['features'] = { k:v for k, v in out[id_]['features'].items() if k not in discard }
    
    return FrozenMap(out)

def split_data(data, p):
    '''
    Splits data into training and validation sets for simple classification.
    
    :param data:    complete labeled data
    :param p:       proportion of data to use for validation
                    
    :return         train_data, validation_data
    '''
    validation_labels = rand_sample(list(data), int(len(data) * p))
    
    validation_data = {k:data[k] for k in validation_labels}
    train_data = {k:v for k, v in data.items() if k not in validation_data}

    return FrozenMap(train_data), FrozenMap(validation_data)


In [49]:
def run_benchmark(alg, data, p, discard=None):
    '''
    Trains :alg: on song_data (with or without filtering features), splits into train/validation,
    and returns accuracy on validation data.
    '''
    if discard: data = filter_features(data, discard)
    
    train_data, validation_data = split_data(data, p)
    
    clf = Classifier(alg, train_data, default_vectorizer, False)
    clf.fit_all()
    
    return clf, train_data, validation_data

In [51]:
test_clf, test_td, test_vd = run_benchmark('svm', song_data, 0.2)
print('svm acc:', test_clf.validate(test_vd))
test_clf, test_td, test_vd = run_benchmark('sgd', song_data, 0.2)
print('sgd acc:', test_clf.validate(test_vd))

svm acc: 0.9272567922874672
sgd acc: 0.9184925503943909


In [13]:
def test_cluster_size(data, max_cluster):
    '''
    description

    :param data:        the data, obtained from read_data()
    :param max_cluster: the max number of clusters to km.inertia_ on

    :return             None (prints km_inertia for each number of clusters)
    '''
    train_data = {k:v for k, v in data.items()}
    x_train = np.array([list(x['features'].values()) for x in train_data.values()])
    
    for i in range(2, max_cluster+1):
        km = KMeans(i, init='random', max_iter=300, random_state=0, n_init=30)
        km.fit(x_train)
        print(i, km.inertia_)

# test_cluster_size(song_data, 10)

In [14]:
def get_kmeans_clusters(data, n_clusters, songs_by_cluster={}, add_to_songs=True):
    '''
    description

    :param data:        the data, obtained from read_data()
    :param n_clusters:  the number of clusters used in K-means

    :return             updated data with each song's numeric cluster label
    '''
    
    data = dict(data)
    
    if n_clusters in songs_by_cluster.keys(): raise ValueError(n_clusters, 'clusters key already exists')
    
    songs_by_cluster[n_clusters] = { i:[] for i in range(n_clusters) }
        
    train_data, track_ids = {k:v for k, v in data.items()}, list(data)

    x_train = np.array([list(x['features'].values()) for x in train_data.values()])
    y_train = [x['label'] for x in train_data.values()]

    km = KMeans(n_clusters, init='random', max_iter=300, random_state=0, n_init=30)
    km.fit(x_train)

    cluster_map = pd.DataFrame()

    cluster_map['data'], cluster_map['cluster'] = x_train.tolist(), km.labels_
    cluster_map['label'], cluster_map['track_id'] = y_train, track_ids

    for track_id in track_ids:
        
        cluster = cluster_map[cluster_map['track_id'] == track_id]['cluster'].tolist()[0]
        
        songs_by_cluster[n_clusters][cluster].append(track_id)
        
        if add_to_songs: data[track_id]['cluster'] = cluster
    
    return FrozenMap(data), FrozenMap(songs_by_cluster)

song_data, songs_by_cluster = get_kmeans_clusters(song_data, 10)

In [15]:
# song_data
songs_by_cluster

{10: {0: ['3u2R3fpfSNwvEWDzSVcXVS',
   '7fRvTEbngyFprSPRvRDyM8',
   '4x4g6v6b54vD1LmjreABrY',
   '4rv3UTtKes3ICbrUAV3XWy',
   '6HsWWGN1Nfe1euQ8L9EmMq',
   '7A2yhrdnFspRjpX8RC8UW1',
   '1kDqntrhTIOeFyuQAJfTs0',
   '6wUU6ff7O1jlE4NOrv01gI',
   '61NZThcDTvRLnPPd07uFds',
   '3qNdddxA3KCQbgS73aWeca',
   '4AHSHvnlww9MLfgcBraK4j',
   '0CJ5USBxaX8XY4ozhY0qfR',
   '4jNQkWhuzqrbqQuqanFFJ6',
   '3NerEd57tNaMeaR57mAoJb',
   '5GDkeh0IH4VH302OhO59HM',
   '3Z49dMHUTRIvYw9ufUtTA0',
   '5Vdrq1akDDxNMEybDvmBDa',
   '5SgLcovpN7IBBKH41XaUmc',
   '4TQmCn0e9L7XTiKY4Hk8X4',
   '5i39UY35UedtvmZWa9kYOG',
   '6GeSp1xUWQPgFBagPMGaAr',
   '4UdI4OdnxRA5pHOWZwF2bO',
   '7JjzVELpOxqsKP4jNcZXlA',
   '6DaqgMd5mFlMdDJMk2wvLQ',
   '3SRpyScdBv9g2BzzUPRAwa',
   '7GrweTif4geFOTWpCnMrfe',
   '6k9qruI95VfOsENJSM4lCI',
   '61g7oQKm8DFEL7r2g9LS8x',
   '1gHXm48604ZX8hTtcKEfGG',
   '1gZ65GKX8xsGxucrC7XyHG',
   '5A7FKtfUqykPsmQ1wMZLDo',
   '7B1h0Com19ccdjU8ukJdD4',
   '0CCeQR62nVFToNKesjQgSc',
   '738e4xzLmRkaY1qMDt3mh4',
   '3ux