This is where the metadata, feature space, and genres are read in and stored in the "songs" dictionary by `track_id`.

In [1]:
def read_data(file):
    
    songs = {}

    with open(file, 'r') as f:
    
        for song in f:
            
            line = song.split(',')
            
            if line[0]:
            
                label = 1 if file == 'likes.csv' else 0

                metadata, data = {}, {}
                genres = []
                features = [label, line[4], line[5]]
                track_id = line[1]

                metadata['artist_id'] = line[2]
                metadata['artist_name'] = line[3]

                if line[6]:
                    genres.append(line[6][1:]) if '"' in line[6] else genres.append(line[6])

                count = 0

                for i in range(7, len(line)):
                    if '"' in line[i]:
                        genres.append(line[i][:-1])
                        count = i+1
                        break
                    if line[i].isalpha():
                        genres.append(line[i])
                        
                if count == 0:
                    for i in range(7, len(line)):
                        features.append(line[i])
                else:
                    for i in range(count, len(line)):
                        features.append(line[i])

                features[-1] = features[-1][:-1]

                data['metadata'] = metadata
                data['features'] = features
                data['genres'] = genres

                songs[track_id] = data

    return songs

This is where genres are read in, with each genre getting a unique `genre_id`.

In [2]:
def read_genres(files):
    
    genre_mapping = {}
    genre_id = 0
    
    for file in files:

        with open(file, 'r') as f:

            for song in f:

                line = song.split(',')

                if line[0]:

                    genres = []

                    if line[6]:
                        genres.append(line[6][1:]) if '"' in line[6] else genres.append(line[6])

                    count = 0

                    for i in range(7, len(line)):
                        if '"' in line[i]:
                            genres.append(line[i][:-1])
                            count = i+1
                            break
                        if line[i].isalpha():
                            genres.append(line[i])
                    
                    for genre in genres:
                        if (genre not in genre_mapping) and ('"' not in genre) and (not genre.isdigit()):
                            genre_mapping[genre] = genre_id
                            genre_id += 1
        
    return genre_mapping

This is where the data is actually read in. We read in the `likes.csv` + `dislikes.csv` data, and merge them into one dictionary, `song_data`.

In [3]:
likes_data = read_data('likes.csv')
dislikes_data = read_data('dislikes.csv')
song_data = {**likes_data, **dislikes_data}


In [4]:
genres = read_genres(['likes.csv', 'dislikes.csv'])

In [5]:
song_data

{'3tLcZK0pGyOEKMsDBV8wgb': {'features': [0,
   '53',
   '17865',
   '0.0',
   '599141',
   '4',
   '0.107',
   '0.034',
   '0.666',
   '-6.189',
   '138.005',
   '0',
   '0.0955',
   '0.364',
   '0.177'],
  'genres': ['anthem worship', 'world worship'],
  'metadata': {'artist_id': '107CG0UhUl9GJnPwF83N63',
   'artist_name': 'UPPERROOM'}},
 '626zLwL6DzYNUDcIJAfrM2': {'features': [0,
   '75',
   '2787856',
   '0.0',
   '165560',
   '4',
   '0.0281',
   '0.119',
   '0.877',
   '-6.019',
   '99.494',
   '2',
   '0.706',
   '0.612',
   '0.233'],
  'genres': ['australian country', 'country', 'country road'],
  'metadata': {'artist_id': '0u2FHSq3ln94y5Q57xazwf',
   'artist_name': 'Keith Urban'}},
 '6rHywFitLCs0kkdCVcZTkM': {'features': [1,
   '7',
   '358',
   '0.438',
   '417142',
   '4',
   '0.0632',
   '0.0508',
   '0.56',
   '-10.13',
   '130.004',
   '7',
   '0.0343',
   '0.679',
   '0.113'],
  'genres': [],
  'metadata': {'artist_id': '2KiSWHPj9ezMfSVdFUbXaZ',
   'artist_name': 'Gemini 

In [6]:
genres

{'abstract': 237,
 'abstract beats': 123,
 'abstract hip hop': 431,
 'acid house': 118,
 'acid idm': 42,
 'acid jazz': 102,
 'acid techno': 194,
 'acoustic pop': 388,
 'adoracao': 398,
 'adult standards': 58,
 'afro house': 23,
 'afrobeat': 106,
 'afropop': 107,
 'alaska indie': 339,
 'alberta country': 338,
 'album rock': 164,
 'alternative ccm': 401,
 'alternative country': 327,
 'alternative dance': 20,
 'alternative hip hop': 83,
 'alternative metal': 227,
 'alternative pop': 267,
 'alternative rock': 94,
 'ambient': 33,
 'ambient fusion': 359,
 'ambient idm': 262,
 'anthem worship': 381,
 'art pop': 66,
 'aussietronica': 49,
 'austindie': 119,
 'australian alternative rock': 161,
 'australian country': 315,
 'australian dance': 177,
 'australian indie': 91,
 'australian indigenous': 336,
 'australian pop': 306,
 'australian rock': 337,
 'baile pop': 481,
 'balearic': 64,
 'ballet class': 154,
 'bass music': 13,
 'bass trap': 241,
 'bass trip': 250,
 'bassline': 182,
 'battle rap':