This is where the metadata, feature space, and genres are read in and stored in the "songs" dictionary by `track_id`.

In [4]:
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import LinearSVC
from random import sample as rand_sample


In [34]:
def read_data(file):
    
    songs = {}

    with open(file, 'r') as f:

        for song in f:
            
            line = song.split(',')
                        
            if line[0] != 'label' and line[0]:
                label = int(line[0])
                metadata, data = {}, {}
                genres = []
                track_id = line[1]
                
                features = {}
                features['artist_popularity'] = line[4]
                features['artist_followers'] = line[5]

                metadata['artist_id'] = line[2]
                metadata['artist_name'] = line[3]

                if line[6]:
                    genres.append(line[6][1:]) if '"' in line[6] else genres.append(line[6])

                count = 0

                for i in range(7, len(line)):
                    if '"' in line[i]:
                        genres.append(line[i][:-1])
                        count = i+1
                        break
                    if line[i].isalpha():
                        genres.append(line[i])

                if count == 0:
                    features['instrumentalness'] = float(line[7])
                    features['duration_ms'] = float(line[8])
                    features['time_signature'] = float(line[9])
                    features['acousticness'] = float(line[10])
                    features['speechiness'] = float(line[11])
                    features['energy'] = float(line[12])
                    features['loudness'] = float(line[13])
                    features['tempo'] = float(line[14])
                    features['key'] = float(line[15])
                    features['valence'] = float(line[16])
                    features['danceability'] = float(line[17])
                    features['liveness'] = float(line[18][:-1])
                    
                else:
                    features['instrumentalness'] = float(line[count])
                    features['duration_ms'] = float(line[count+1])
                    features['time_signature'] = float(line[count+2])
                    features['acousticness'] = float(line[count+3])
                    features['speechiness'] = float(line[count+4])
                    features['energy'] = float(line[count+5])
                    features['loudness'] = float(line[count+6])
                    features['tempo'] = float(line[count+7])
                    features['key'] = float(line[count+8])
                    features['valence'] = float(line[count+9])
                    features['danceability'] = float(line[count+10])
                    features['liveness'] = float(line[count+11][:-1])
                
                data['metadata'] = metadata
                data['features'] = features
                data['genres'] = genres
                data['label'] = label

                songs[track_id] = data

    return songs

This is where genres are read in, with each genre getting a unique `genre_id`.

In [35]:
def isFloat(string):
    try:
        float(string)
        return True
    except ValueError:
        return False

def read_genres(file):
    
    genre_mapping = {}
    genre_id = 0
    
    with open(file, 'r') as f:

        for song in f:

            line = song.split(',')

            if line[0]:

                genres = []

                if line[6]:
                    genres.append(line[6][1:]) if '"' in line[6] else genres.append(line[6])

                count = 0

                for i in range(7, len(line)):
                    if '"' in line[i]:
                        genres.append(line[i][:-1])
                        count = i+1
                        break
                    if line[i].isalpha():
                        genres.append(line[i])

                for genre in genres:
                    if (genre not in genre_mapping) and ('"' not in genre) and (not isFloat(genre)):
                        genre_mapping[genre] = genre_id
                        genre_id += 1
        
    return genre_mapping

This is where the data is actually read in. We read in the `likes.csv` + `dislikes.csv` data, and merge them into one dictionary, `song_data`.

In [36]:
song_data = read_data('data.csv')
genres = read_genres('data.csv')
feature_names = tuple(next(iter(song_data.values()))['features'].keys())

In [37]:
song_data

{'6qNeBtFhG5Ir3VsotFCNrU': {'metadata': {'artist_id': '3SEw2qamdOWyVZtzKxWTTg',
   'artist_name': 'The YellowHeads'},
  'features': {'artist_popularity': '0.28',
   'artist_followers': '0.004217',
   'instrumentalness': 0.861,
   'duration_ms': 0.123619573,
   'time_signature': 0.8,
   'acousticness': 0.005,
   'speechiness': 0.0592,
   'energy': 0.704,
   'loudness': 0.826888183,
   'tempo': 0.572721323,
   'key': 0.545454545,
   'valence': 0.506,
   'danceability': 0.773,
   'liveness': 0.0864},
  'genres': [],
  'label': 1},
 '6w51zi8L7yqelzUiFKwDEC': {'metadata': {'artist_id': '7ANeFdhioipksT9lqg0Ay6',
   'artist_name': 'Ego Ella May'},
  'features': {'artist_popularity': '0.25',
   'artist_followers': '0.001566',
   'instrumentalness': 0.0,
   'duration_ms': 0.051938837,
   'time_signature': 0.8,
   'acousticness': 0.766,
   'speechiness': 0.274,
   'energy': 0.625,
   'loudness': 0.774759961,
   'tempo': 0.355111854,
   'key': 0.272727273,
   'valence': 0.448,
   'danceability': 

In [48]:
# Sanity check
l, dl = 0, 0
for x in song_data.values():
    if x['label'] == 1: l += 1
    else: dl += 1
print(' likes:', l, '\n', 'dislikes:', dl)

 likes: 1919 
 dislikes: 3790


In [40]:
genres

{'artist_genres': 0,
 'instrumentalness': 1,
 'acousticness': 2,
 'speechiness': 3,
 'energy': 4,
 'tempo': 5,
 'key': 6,
 'valence': 7,
 'danceability': 8,
 'uk contemporary r&b': 9,
 'future garage': 10,
 'house': 11,
 'nu disco': 12,
 'christmas': 13,
 'funk': 14,
 'motown': 15,
 'soul': 16,
 'dub': 17,
 'reggae': 18,
 'roots reggae': 19,
 'hindustani classical': 20,
 'indian classical': 21,
 'bass music': 22,
 'microhouse': 23,
 'indie jazz': 24,
 'dance pop': 25,
 'pop': 26,
 'chicago house': 27,
 'tech house': 28,
 'alternative dance': 29,
 'indietronica': 30,
 'new rave': 31,
 'afro house': 32,
 'filter house': 33,
 'drum and bass': 34,
 'drumfunk': 35,
 'jungle': 36,
 'neurofunk': 37,
 'uk garage': 38,
 'world fusion': 39,
 'deep house': 40,
 'minimal tech house': 41,
 'ambient': 42,
 'drone': 43,
 'electra': 44,
 'fluxwork': 45,
 'mandible': 46,
 'downtempo': 47,
 'electronic': 48,
 'vaporwave': 49,
 'wonky': 50,
 'acid idm': 51,
 'big beat': 52,
 'breakbeat': 53,
 'deep soul 

In [41]:
feature_names

('artist_popularity',
 'artist_followers',
 'instrumentalness',
 'duration_ms',
 'time_signature',
 'acousticness',
 'speechiness',
 'energy',
 'loudness',
 'tempo',
 'key',
 'valence',
 'danceability',
 'liveness')

In [42]:
class Classifier(object):
    def __init__(self, algorithm, train_data):
        '''
        description
        
        :param algorithm:   the name of the learning algorithm to use
        :param train_data:  training data, dict with following relevant keys: {
                                    features: { feature_name: feature_value},
                                    label: label_value
                            }
        
        :return             Classifier instance
        '''
        
        x_train, y_train = [x['features'] for x in train_data.values()], [x['label'] for x in train_data.values()]
     
        self.algorithm = algorithm
        
        if algorithm == 'svm':
            
            self.vectorizer = DictVectorizer(sort=True)
            
            self.svm = LinearSVC(penalty='l2', loss='hinge')
            self.svm.fit(self.vectorizer.fit_transform(x_train), y_train)
        
        else: raise ValueError('unsupported algorithm: ' + algorithm)
    
    def validate(self, validation_data):
        '''
        Predicts on all instances in validation_data. Returns accuracy.
        '''
        num_instances = len(validation_data)
        x_test, y_test = [x['features'] for x in validation_data.values()], [x['label'] for x in validation_data.values()]
        
        correct = 0
        for i in range(num_instances):
            if self.predict(x_test[i]) == y_test[i]: correct += 1
        
        return correct / num_instances
                
    def predict(self, x):
        if self.algorithm == 'svm': return self.predict_SVM(x)
        raise ValueError('not implemented')
    
    def predict_SVM(self, x):
        x = self.vectorizer.transform([x])
        return self.svm.predict(x)[0]
    

In [43]:
def filter_features(data, discard):
    '''
    Filters out features from data. Does not modify passed-in object (creates a copy).
    
    :param data:        dict of data, same format as song_data
    :param discard:     feature names to discard
                        
    :return             copy of data, with filtered features
    '''
    out = data.copy()
    
    for id_ in data:
        out[id_]['features'] = { k:v for k, v in out[id_]['features'].items() if k not in discard }
    
    return out

def split_data(data, p):
    '''
    Splits data into training and validation sets for simple classification.
    
    :param data:    complete labeled data
    :param p:       proportion of data to use for validation
                    
    :return         train_data, validation_data
    '''
    validation_labels = rand_sample(list(data), int(len(data) * p))
    
    validation_data = { k:data[k] for k in validation_labels }

    return {k:v for k, v in data.items() if k not in validation_data}, validation_data


In [44]:
def run_basic_svm(data, p, discard=None):
    '''
    Trains svm on song_data (with or without filtering features), splits into train/validation,
    and returns accuracy on validation data.
    '''
    if discard: data = filter_features(data, discard)
    
    train_data, validation_data = split_data(data, p)
    
    clf = Classifier('svm', train_data)
    
    return clf.validate(validation_data)

In [45]:
run_basic_svm(song_data, 0.2)



0.9316389132340053