This is where the metadata, feature space, and genres are read in and stored in the "songs" dictionary by `track_id`.

In [1]:
def read_data(file):
    
    songs = {}

    with open(file, 'r') as f:

        for song in f:
            
            line = song.split(',')
            
            label = line[0]
                        
            if line[0] != 'label' and line[0]:
                metadata, data = {}, {}
                genres = []
                track_id = line[1]
                
                features = {}
                features['label'] = label
                features['artist_popularity'] = line[4]
                features['artist_followers'] = line[5]

                metadata['artist_id'] = line[2]
                metadata['artist_name'] = line[3]

                if line[6]:
                    genres.append(line[6][1:]) if '"' in line[6] else genres.append(line[6])

                count = 0

                for i in range(7, len(line)):
                    if '"' in line[i]:
                        genres.append(line[i][:-1])
                        count = i+1
                        break
                    if line[i].isalpha():
                        genres.append(line[i])

                if count == 0:
                    features['instrumentalness'] = float(line[7])
                    features['duration_ms'] = float(line[8])
                    features['time_signature'] = float(line[9])
                    features['acousticness'] = float(line[10])
                    features['speechiness'] = float(line[11])
                    features['energy'] = float(line[12])
                    features['loudness'] = float(line[13])
                    features['tempo'] = float(line[14])
                    features['key'] = float(line[15])
                    features['valence'] = float(line[16])
                    features['danceability'] = float(line[17])
                    features['liveness'] = float(line[18][:-1])
                    
                else:
                    features['instrumentalness'] = float(line[count])
                    features['duration_ms'] = float(line[count+1])
                    features['time_signature'] = float(line[count+2])
                    features['acousticness'] = float(line[count+3])
                    features['speechiness'] = float(line[count+4])
                    features['energy'] = float(line[count+5])
                    features['loudness'] = float(line[count+6])
                    features['tempo'] = float(line[count+7])
                    features['key'] = float(line[count+8])
                    features['valence'] = float(line[count+9])
                    features['danceability'] = float(line[count+10])
                    features['liveness'] = float(line[count+11][:-1])
                
                data['metadata'] = metadata
                data['features'] = features
                data['genres'] = genres
                data['label'] = label

                songs[track_id] = data

    return songs

This is where genres are read in, with each genre getting a unique `genre_id`.

In [2]:
def isFloat(string):
    try:
        float(string)
        return True
    except ValueError:
        return False

def read_genres(file):
    
    genre_mapping = {}
    genre_id = 0
    
    with open(file, 'r') as f:

        for song in f:

            line = song.split(',')

            if line[0]:

                genres = []

                if line[6]:
                    genres.append(line[6][1:]) if '"' in line[6] else genres.append(line[6])

                count = 0

                for i in range(7, len(line)):
                    if '"' in line[i]:
                        genres.append(line[i][:-1])
                        count = i+1
                        break
                    if line[i].isalpha():
                        genres.append(line[i])

                for genre in genres:
                    if (genre not in genre_mapping) and ('"' not in genre) and (not isFloat(genre)):
                        genre_mapping[genre] = genre_id
                        genre_id += 1
        
    return genre_mapping

This is where the data is actually read in. We read in the `likes.csv` + `dislikes.csv` data, and merge them into one dictionary, `song_data`.

In [3]:
song_data = read_data('data.csv')


In [4]:
genres = read_genres('data.csv')

In [5]:
song_data

{'2WjnOzI9E6Ra09vRWkuhpU': {'features': {'acousticness': 0.0251,
   'artist_followers': '0.000787',
   'artist_popularity': '0.25',
   'danceability': 0.75,
   'duration_ms': 0.089144798,
   'energy': 0.793,
   'instrumentalness': 0.000266,
   'key': 0.363636364,
   'label': '1',
   'liveness': 0.0564,
   'loudness': 0.754044757,
   'speechiness': 0.54,
   'tempo': 0.545459917,
   'time_signature': 0.8,
   'valence': 0.465},
  'genres': [],
  'label': '1',
  'metadata': {'artist_id': '0ymdoOsfzRbCoAMfJPpsEx',
   'artist_name': 'Cody Currie'}},
 '39AfBrgO2QI7oDtZvIht6M': {'features': {'acousticness': 0.000249,
   'artist_followers': '0.000325',
   'artist_popularity': '0.2',
   'danceability': 0.703,
   'duration_ms': 0.085371879,
   'energy': 0.79,
   'instrumentalness': 0.749,
   'key': 1.0,
   'label': '1',
   'liveness': 0.222,
   'loudness': 0.891736599,
   'speechiness': 0.0468,
   'tempo': 0.576407385,
   'time_signature': 0.8,
   'valence': 0.326},
  'genres': [],
  'label': '1'

In [6]:
genres

{'abstract': 246,
 'abstract beats': 132,
 'abstract hip hop': 440,
 'acid house': 127,
 'acid idm': 51,
 'acid jazz': 111,
 'acid techno': 203,
 'acoustic pop': 397,
 'acousticness': 2,
 'adoracao': 407,
 'adult standards': 67,
 'afro house': 32,
 'afrobeat': 115,
 'afropop': 116,
 'alaska indie': 348,
 'alberta country': 347,
 'album rock': 173,
 'alternative ccm': 410,
 'alternative country': 336,
 'alternative dance': 29,
 'alternative hip hop': 92,
 'alternative metal': 236,
 'alternative pop': 276,
 'alternative rock': 103,
 'ambient': 42,
 'ambient fusion': 368,
 'ambient idm': 271,
 'anthem worship': 390,
 'art pop': 75,
 'artist_genres': 0,
 'aussietronica': 58,
 'austindie': 128,
 'australian alternative rock': 170,
 'australian country': 324,
 'australian dance': 186,
 'australian indie': 100,
 'australian indigenous': 345,
 'australian pop': 315,
 'australian rock': 346,
 'baile pop': 490,
 'balearic': 73,
 'ballet class': 163,
 'bass music': 22,
 'bass trap': 250,
 'bass t