In [6]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise as pw
import json

In [7]:
# Load the FMA metadata tables. The first column of tracks/echonest/features
# becomes the index; those three files carry multi-row headers.
tracks = pd.read_csv(
    'data/fma_metadata/tracks.csv',
    index_col=0,
    header=[0, 1],
)
genres = pd.read_csv('data/fma_metadata/genres.csv')
echonest = pd.read_csv(
    'data/fma_metadata/echonest.csv',
    index_col=0,
    header=[0, 1, 2],
)
features = pd.read_csv(
    'data/fma_metadata/features.csv',
    index_col=0,
    header=[0, 1, 2],
)

In [9]:
# Show the column index of each loaded table, in load order.
for frame in (tracks, genres, echonest, features):
    print(frame.columns)

MultiIndex([( 'album',          'comments'),
            ( 'album',      'date_created'),
            ( 'album',     'date_released'),
            ( 'album',          'engineer'),
            ( 'album',         'favorites'),
            ( 'album',                'id'),
            ( 'album',       'information'),
            ( 'album',           'listens'),
            ( 'album',          'producer'),
            ( 'album',              'tags'),
            ( 'album',             'title'),
            ( 'album',            'tracks'),
            ( 'album',              'type'),
            ('artist', 'active_year_begin'),
            ('artist',   'active_year_end'),
            ('artist', 'associated_labels'),
            ('artist',               'bio'),
            ('artist',          'comments'),
            ('artist',      'date_created'),
            ('artist',         'favorites'),
            ('artist',                'id'),
            ('artist',          'latitude'),
          

In [3]:
# Keep only the per-track columns of interest, under flat column names.
track_simplified = pd.DataFrame({
    flat_name: tracks[('track', field)]
    for flat_name, field in (
        ('track_comments', 'comments'),
        ('track_favorites', 'favorites'),
        ('track_genre', 'genres_all'),
        ('track_interest', 'interest'),
        ('track_listen', 'listens'),
    )
})

In [4]:
# Build the echonest subset.
# Flatten the three-level header down to its innermost names and keep the
# first 25 columns.
no_level_columns = echonest.columns.droplevel([0, 1])
echonest_no_level = echonest.copy()
echonest_no_level.columns = no_level_columns
echonest_no_level = pd.DataFrame(echonest_no_level.iloc[:, :25])

# Prefix each positional run of columns with the name of its group.
audio_columns = 'audio_' + no_level_columns[:8]
metadata_columns = 'metadata_' + no_level_columns[8:15]
ranks_columns = 'ranks_' + no_level_columns[15:20]
social_columns = 'social_' + no_level_columns[20:25]
echonest_no_level.columns = audio_columns.append(
    [metadata_columns, ranks_columns, social_columns]
)

# Keep the audio (first 8) and social (last 5) columns, joined on track_id.
echonest_simplified = pd.merge(
    echonest_no_level.iloc[:, :8],
    echonest_no_level.iloc[:, 20:],
    on='track_id',
)

In [5]:
# Merge the track and echonest subsets on their shared track_id index.
data = pd.merge(track_simplified, echonest_simplified, on='track_id')
# Remove duplicate rows. drop_duplicates() returns a new frame rather than
# modifying `data`, so the result has to be bound back to the name.
data = data.drop_duplicates()
data

Unnamed: 0_level_0,track_comments,track_favorites,track_genre,track_interest,track_listen,audio_acousticness,audio_danceability,audio_energy,audio_instrumentalness,audio_liveness,audio_speechiness,audio_tempo,audio_valence,social_artist_discovery,social_artist_familiarity,social_artist_hotttnesss,social_song_currency,social_song_hotttnesss
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2,0,2,[21],4656,1293,0.416675,0.675894,0.634476,0.010628,0.177647,0.159310,165.922,0.576661,0.388990,0.386740,0.406370,0.000000,0.000000
3,0,1,[21],1470,514,0.374408,0.528643,0.817461,0.001851,0.105880,0.461818,126.957,0.269240,0.388990,0.386740,0.406370,0.000000,0.000000
5,0,6,[21],1933,1151,0.043567,0.745566,0.701470,0.000697,0.373143,0.124595,100.260,0.621661,0.388990,0.386740,0.406370,0.000000,0.000000
10,0,178,[10],54881,50135,0.951670,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.963590,0.557339,0.614272,0.798387,0.005158,0.354516
134,0,3,[21],1126,943,0.452217,0.513238,0.560410,0.019443,0.096567,0.525519,114.290,0.894072,0.388990,0.386740,0.406370,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124857,0,11,"[542, 21, 286, 15]",20996,13365,0.007592,0.790364,0.719288,0.853114,0.720715,0.082550,141.332,0.890461,0.430808,0.456871,0.486749,0.000000,0.000000
124862,0,3,"[542, 21, 286, 15]",4501,3588,0.041498,0.843077,0.536496,0.865151,0.547949,0.074001,101.975,0.476845,0.430808,0.456871,0.486749,0.000000,0.000000
124863,0,2,"[542, 21, 286, 15]",3641,2956,0.000124,0.609686,0.895136,0.846624,0.632903,0.051517,129.996,0.496667,0.430808,0.456871,0.486749,0.000000,0.000000
124864,0,5,"[542, 21, 286, 15]",3960,3126,0.327576,0.574426,0.548327,0.452867,0.075928,0.033388,142.009,0.569274,0.430808,0.456871,0.486749,0.000000,0.000000


In [6]:
def extract_genre_list(list_str):
    """Parse a bracketed, comma-separated string of integers into a list.

    Parameters
    ----------
    list_str : str
        Text such as ``'[21]'`` or ``'[542, 21, 286, 15]'``. The first and
        last characters (the brackets) are discarded.

    Returns
    -------
    list of int
        The parsed values in their original order; ``'[]'`` yields ``[]``.

    Raises
    ------
    ValueError
        If a non-blank element cannot be converted with ``int``.
    """
    inner = list_str.strip()[1:-1]
    # Splitting an empty string yields [''], so blank tokens are skipped to
    # keep int('') from raising on an empty list.
    return [int(token) for token in inner.split(',') if token.strip()]

In [None]:
# Set up genre columns: one 0/1 indicator column per genre title.
# Parse every track's genre string once, up front, instead of once per genre.
track_genre_sets = [set(extract_genre_list(raw)) for raw in data['track_genre']]
# Pair each title with its id directly; this avoids filtering `genres` by
# title and calling int() on the resulting Series.
for g, code in zip(genres['title'], genres['genre_id']):
    code = int(code)
    data[g] = np.array(
        [1.0 if code in genre_ids else 0.0 for genre_ids in track_genre_sets]
    )

In [None]:
# The raw genre string column is no longer needed.
data = data.drop(columns=['track_genre'])

In [None]:
# Pairwise cosine similarity between all rows of `data`.
cosine_similarities = pw.cosine_similarity(data, data)

# Map each track (index label) to its five most similar other tracks as
# (track label, similarity) pairs, ordered from least to most similar.
top_five_dict = {}
for i in range(len(data)):
    all_similarities = cosine_similarities[i]
    # Row positions ordered by ascending similarity; the stable sort keeps
    # tied rows in position order.
    ranked = np.argsort(all_similarities, kind='stable')
    # Drop the track itself *before* taking the top five. Removing it from
    # the top six afterwards fails whenever it is not among them (for
    # example when six or more other rows tie with it).
    max_index = ranked[ranked != i][-5:]
    top_five_dict[data.index[i]] = [
        (data.index[k], all_similarities[k]) for k in max_index
    ]


In [None]:
def genre_id_to_index(target_id):
    """Return the row position in ``genres`` whose ``genre_id`` equals ``target_id``.

    The first matching row wins. Returns -1 when no row matches.
    """
    for position, genre_id in enumerate(genres['genre_id']):
        if genre_id == target_id:
            return position
    return -1

In [None]:
def setup_genre_list(selected_genre):
    """Build a 0/1 indicator vector over the rows of ``genres``.

    Parameters
    ----------
    selected_genre : iterable
        Genre ids to switch on. Ids for which ``genre_id_to_index`` returns
        -1 are ignored.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(genres)`` holding 1 at the position of
        each selected genre and 0 elsewhere.
    """
    indicator = np.zeros(len(genres))
    for genre_id in selected_genre:
        position = genre_id_to_index(genre_id)
        if position == -1:
            continue
        indicator[position] = 1
    return indicator

In [None]:
# First round: genre only (input); calculate similarity by genre, recommend 5.
# Second round: several pieces of music (input); find max values in the
#   similarity matrix, recommend 3.
# Last round: ... continue until a single piece of music is passed in.
# Input is JSON (fields: track_list, genre_list).
def recommend_music(user_input, round_nb):
    """Recommend tracks for one round of the flow described above.

    Parameters
    ----------
    user_input : str
        JSON document with the fields ``"track_list"`` and ``"genre_list"``.
    round_nb : int
        Round number. Round 0, or any round with an empty ``track_list``,
        uses the genre-only path.

    Returns
    -------
    set
        Index labels of ``data`` for the (up to) five rows whose genre
        columns are most cosine-similar to the requested genres.

    Raises
    ------
    NotImplementedError
        For later rounds with a non-empty ``track_list``; that path has not
        been written yet.
    """
    input_dict = json.loads(user_input)
    out = set()
    if round_nb == 0 or len(input_dict["track_list"]) == 0:
        genre_list = setup_genre_list(input_dict["genre_list"])
        # NOTE(review): column 17 onward is assumed to be the genre indicator
        # columns of `data` — confirm against how `data` was built.
        # cosine_similarity returns a (1, n_rows) matrix for a single query
        # vector, so take row 0 to rank the rows of `data`.
        similarity_row = pw.cosine_similarity([genre_list], data.iloc[:, 17:])[0]
        top_positions = np.argsort(similarity_row, kind='stable')[-5:]
        for k in top_positions:
            out.add(data.index[k])
    else:
        # TODO: second and later rounds (track-based recommendation).
        raise NotImplementedError(
            "track-based recommendation rounds are not implemented yet"
        )
    return out
        

In [None]:
# Smoke test: a round-0 request with an empty track list and a few genre ids.
test_json = json.dumps({
    'track_list': [],
    'genre_list': [2, 3, 4, 5, 1032],
})
recommend_music(test_json, 0)

In [None]:
# Inspect the column labels of the assembled frame.
data.columns

In [None]:
# Ten genres with the highest '#tracks' values.
genres.sort_values('#tracks', ascending=False).iloc[:10]

In [None]:
def extract_tracks(tracks):
    """Select five ``('track', ...)`` columns and give them flat names.

    Parameters
    ----------
    tracks : pandas.DataFrame
        Frame with two-level columns that include ``('track', 'comments')``,
        ``('track', 'favorites')``, ``('track', 'genres_all')``,
        ``('track', 'interest')`` and ``('track', 'listens')``.

    Returns
    -------
    pandas.DataFrame
        Columns ``track_comments``, ``track_favorites``, ``track_genre``,
        ``track_interest`` and ``track_listen``, on the index of ``tracks``.
    """
    source_fields = {
        'track_comments': 'comments',
        'track_favorites': 'favorites',
        'track_genre': 'genres_all',
        'track_interest': 'interest',
        'track_listen': 'listens',
    }
    selected = {
        flat_name: tracks[('track', field)]
        for flat_name, field in source_fields.items()
    }
    return pd.DataFrame(selected)


def extract_echonest(echonest):
    """Return the audio and social columns of ``echonest`` under flat names.

    The first two header levels are dropped and the first 25 columns kept.
    By position they are prefixed ``audio_`` (0-7), ``metadata_`` (8-14),
    ``ranks_`` (15-19) and ``social_`` (20-24). The audio and social runs are
    then merged on ``track_id``.

    Parameters
    ----------
    echonest : pandas.DataFrame
        Frame with multi-level columns and at least 25 of them, whose index
        is named ``track_id``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        The eight ``audio_*`` columns followed by the five ``social_*`` ones.
    """
    flat_names = echonest.columns.droplevel([0, 1])
    flattened = echonest.copy()
    flattened.columns = flat_names
    flattened = pd.DataFrame(flattened.iloc[:, :25])

    # (prefix, positional span) for each run of columns, in column order.
    groups = (
        ('audio_', slice(0, 8)),
        ('metadata_', slice(8, 15)),
        ('ranks_', slice(15, 20)),
        ('social_', slice(20, 25)),
    )
    prefixed = [prefix + flat_names[span] for prefix, span in groups]
    flattened.columns = prefixed[0].append(prefixed[1:])

    audio_part = flattened.iloc[:, :8]
    social_part = flattened.iloc[:, 20:]
    return pd.merge(audio_part, social_part, on='track_id')


def extract_genre_list(list_str):
    """Parse a bracketed, comma-separated string of integers into a list.

    Parameters
    ----------
    list_str : str
        Text such as ``'[21]'`` or ``'[542, 21, 286, 15]'``. The first and
        last characters (the brackets) are discarded.

    Returns
    -------
    list of int
        The parsed values in their original order; ``'[]'`` yields ``[]``.

    Raises
    ------
    ValueError
        If a non-blank element cannot be converted with ``int``.
    """
    inner = list_str.strip()[1:-1]
    # Splitting an empty string yields [''], so blank tokens are skipped to
    # keep int('') from raising on an empty list.
    return [int(token) for token in inner.split(',') if token.strip()]

In [None]:
def add_genre_columns(data, genres):
    """Add one 0/1 indicator column per genre title to ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``track_genre`` column of strings that
        ``extract_genre_list`` can parse. The frame is modified in place.
    genres : pandas.DataFrame
        Must contain ``title`` and ``genre_id`` columns. Each title becomes a
        new column of ``data``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object. Each new column holds 1.0 where the row's
        genre list contains that genre's id and 0.0 elsewhere.
    """
    # Parse every row's genre string once, not once per genre.
    genre_sets = [set(extract_genre_list(raw)) for raw in data['track_genre']]
    # Pair each title with its id directly; this avoids filtering `genres`
    # by title and calling int() on the resulting Series.
    for title, genre_id in zip(genres['title'], genres['genre_id']):
        wanted = int(genre_id)
        data[title] = np.array(
            [1.0 if wanted in ids else 0.0 for ids in genre_sets]
        )
    return data

def setup_data(tracks, genres, echonest):
    """Assemble the per-track feature frame from the raw metadata tables.

    The track and echonest subsets are merged on ``track_id``, duplicate rows
    are removed, genre indicator columns are added, and the raw
    ``track_genre`` column is dropped.

    Parameters
    ----------
    tracks : pandas.DataFrame
        Passed to ``extract_tracks``.
    genres : pandas.DataFrame
        Passed to ``add_genre_columns``.
    echonest : pandas.DataFrame
        Passed to ``extract_echonest``.

    Returns
    -------
    pandas.DataFrame
        The merged frame without the ``track_genre`` column.
    """
    track_simplified = extract_tracks(tracks)
    echonest_simplified = extract_echonest(echonest)
    data = pd.merge(track_simplified, echonest_simplified, on='track_id')
    # drop_duplicates() and drop() both return new frames; the results must
    # be rebound or the calls have no effect on what is returned.
    data = data.drop_duplicates()
    data = add_genre_columns(data, genres)
    data = data.drop(labels=['track_genre'], axis=1)
    return data

In [None]:
# Read in the data used for kmeans. The first column of tracks/echonest
# becomes the index; those two files carry multi-row headers.
tracks = pd.read_csv(
    'data/fma_metadata/tracks.csv',
    index_col=0,
    header=[0, 1],
)
genres = pd.read_csv('data/fma_metadata/genres.csv')
echonest = pd.read_csv(
    'data/fma_metadata/echonest.csv',
    index_col=0,
    header=[0, 1, 2],
)

In [None]:
# Build the combined frame from the freshly loaded tables and preview it.
data = setup_data(tracks, genres, echonest)
print(data.head())
# NOTE(review): similarity_df is not defined in any cell visible in this
# notebook — confirm it is defined (or imported) before this cell runs.
sim = similarity_df(data)
print(sim.head())

In [10]:
# Preview the first five rows of the assembled frame.
data.head(5)

Unnamed: 0_level_0,track_comments,track_favorites,track_genre,track_interest,track_listen,audio_acousticness,audio_danceability,audio_energy,audio_instrumentalness,audio_liveness,audio_speechiness,audio_tempo,audio_valence,social_artist_discovery,social_artist_familiarity,social_artist_hotttnesss,social_song_currency,social_song_hotttnesss
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2,0,2,[21],4656,1293,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,0.38899,0.38674,0.40637,0.0,0.0
3,0,1,[21],1470,514,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924,0.38899,0.38674,0.40637,0.0,0.0
5,0,6,[21],1933,1151,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661,0.38899,0.38674,0.40637,0.0,0.0
10,0,178,[10],54881,50135,0.95167,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.96359,0.557339,0.614272,0.798387,0.005158,0.354516
134,0,3,[21],1126,943,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072,0.38899,0.38674,0.40637,0.0,0.0


In [9]:
# Pull the 'subset' column out of the 'set' group of the tracks table and
# preview it.
set_columns = tracks['set']
set_data = set_columns.loc[:, 'subset']
set_data.head()

Unnamed: 0_level_0,split,subset
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,training,small
3,training,medium
5,training,small
10,training,small
20,training,large


In [17]:
# Rows of the tracks table whose ('set', 'subset') value is 'small'.
tracks.loc[tracks[('set', 'subset')] == 'small']

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
5,0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
140,1,2008-11-26 01:49:59,2007-05-22 00:00:00,,1,61,<p>Alec K. Redfearn &amp; The Eyesores: Ellen ...,1300,"Alec K. Refearn, Rob Pemberton",[],...,,1593,en,Attribution-Noncommercial-No Derivative Works ...,1299,,2,,[],Queen Of The Wires
141,0,2008-11-26 01:49:57,2009-01-16 00:00:00,,1,60,"<p>A full ensamble of strings, drums, electron...",1304,,[],...,,839,en,Attribution-Noncommercial-No Derivative Works ...,725,,4,,[],Ohio
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154308,0,2017-03-05 04:57:38,2017-03-05 00:00:00,,0,22780,,22334,Fleslit,"['fleslit', 'trap beat free use', 'trap beat f...",...,,3371,,Attribution,2705,,17,,"['fleslit', 'trap beat free use', 'trap beat f...",MIA
154309,0,2017-03-05 04:57:38,2017-03-05 00:00:00,,0,22780,,22334,Fleslit,"['fleslit', 'trap beat free use', 'trap beat f...",...,,4525,,Attribution,3589,,18,,"['fleslit', 'trap beat free use', 'trap beat f...",A1 Symphony
154413,0,2017-03-07 18:44:11,,Ernie Indradat,0,22789,<p>A live performance at WFMU for Dark Night o...,3777,Julie Bennack,[],...,,809,,Creative Commons Attribution-NonCommercial-NoD...,676,,9,,[],Do Easy
154414,0,2017-03-07 18:44:11,,Ernie Indradat,0,22789,<p>A live performance at WFMU for Dark Night o...,3777,Julie Bennack,[],...,,851,,Creative Commons Attribution-NonCommercial-NoD...,788,,10,,[],Dead Can Dance (uncensored)
