In [20]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from sklearn.metrics import pairwise as pw

pd.options.mode.chained_assignment = None

In [21]:
# Collapse a per-track table to one row per entity (album or artist),
# gathering that entity's track ids into a single set-valued column.
def compress_data(data, id_name):
    """Return one row per ``id`` with its track ids collected in a set.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with at least an ``id`` column and a ``track_id`` column.
    id_name : str
        Name under which the ``id`` column appears (first) in the result.

    Returns
    -------
    pandas.DataFrame
        The first row seen for every ``id``; ``id`` is replaced by
        ``id_name`` and ``tracks_included`` holds the set of track ids.
    """
    track_sets = data.groupby('id')['track_id'].apply(set)
    # Both sides carry 'track_id', so the merge suffixes them:
    # '_x' is the per-row id from `data`, '_y' the per-entity set.
    merged = pd.merge(data, track_sets, on='id')
    compressed = (
        merged
        .drop(columns=['track_id_x'])
        .drop_duplicates(subset='id')
        .rename(columns={'track_id_y': 'tracks_included'})
    )
    compressed.insert(0, id_name, compressed['id'])
    return compressed.drop(columns=['id'])

In [22]:
def output_df(data, columns, path):
    """Write the chosen ``columns`` of ``data`` to ``path`` as CSV, without the index.

    ``path`` is handed to ``DataFrame.to_csv`` as ``path_or_buf``.
    """
    pd.DataFrame(data=data, columns=columns).to_csv(path_or_buf=path, index=False)

In [23]:
def track_add_artist_album_id(tracks, tracks_all):
    """Attach ``album_id`` and ``artist_id`` columns to ``tracks``.

    Parameters
    ----------
    tracks : pandas.DataFrame
        Track rows carrying a ``track_id`` column.
    tracks_all : pandas.DataFrame
        Table whose ``['album']['id']`` and ``['artist']['id']`` selections
        can be joined to ``tracks`` on ``track_id``.

    Returns
    -------
    pandas.DataFrame
        ``tracks`` with ``album_id`` then ``artist_id`` appended.
    """
    tracks_df = tracks
    # Album first, then artist, so the new columns keep that order.
    for entity in ('album', 'artist'):
        entity_ids = tracks_all[entity]['id']
        tracks_df = pd.merge(tracks_df, entity_ids, on='track_id')
        tracks_df = tracks_df.rename(columns={'id': entity + '_id'})
    return tracks_df

In [24]:
def extract_genre_list(list_str):
    """Parse a bracketed, comma-separated string such as ``'[1, 2]'`` into ints.

    The first and last characters are discarded and empty pieces are
    skipped, so ``'[]'`` yields an empty list.
    """
    inner = list_str[1:-1]
    return [int(piece) for piece in inner.split(',') if piece != '']

In [25]:
# Problem: a genre's root is only known after following `parent` links
# upwards until a row with parent == 0 is reached.
def track_top_genre(target, genre_data):
    """Return the id of the root genre above ``target``.

    Parameters
    ----------
    target : int
        Genre id to start from.
    genre_data : pandas.DataFrame
        Table with ``genre_id`` and ``parent`` columns; a ``parent`` of 0
        marks a root genre.

    Returns
    -------
    int
        The root genre id, or ``-1`` when ``target`` (or one of its
        ancestors) is missing from ``genre_data`` or the parent links form
        a cycle.
    """
    seen = set()
    cur_id = target
    # `seen` stops the walk if the parent links ever loop back on themselves.
    while cur_id not in seen:
        seen.add(cur_id)
        row = genre_data[genre_data['genre_id'] == cur_id]
        if row.empty:
            # Unknown id: report "not found" instead of raising IndexError.
            return -1
        parent = row['parent'].iloc[0]
        if parent == 0:
            return cur_id
        cur_id = parent
    return -1

In [26]:
# Build, for every track, the set of root genres its genres belong to
def track_form_genre_col(tracks, genres):
    """Return one set of root genre ids per row of ``tracks``.

    Each row's ``genres_all`` string is parsed with ``extract_genre_list``
    and every id is resolved with ``track_top_genre``; a row with no genres
    yields an empty set.
    """
    return [
        {track_top_genre(genre_id, genres) for genre_id in extract_genre_list(raw_list)}
        for raw_list in tracks['genres_all']
    ]

In [27]:
def setup_output_album(tracks_all, albums_path='/Users/mchuang/PycharmProjects/react-flask-app/flaskProject/data/albums_compressed.csv'):
    """Export one row per album to CSV.

    Parameters
    ----------
    tracks_all : pandas.DataFrame
        Track table; its ``'album'`` column group is used, with the index
        turned into a regular column by ``reset_index``.
    albums_path : str, optional
        Destination CSV. The default is the location this notebook used to
        hard-code; pass another path to run on a different machine.
    """
    albums = tracks_all["album"].reset_index()
    albums_df = compress_data(albums, 'album_id')
    albums_columns = ['album_id', 'date_created', 'listens', 'favorites', 'type', 'tracks_included']
    output_df(albums_df, albums_columns, albums_path)

In [28]:
def setup_output_artist(tracks_all, artist_path='/Users/mchuang/PycharmProjects/react-flask-app/flaskProject/data/artist_compressed.csv'):
    """Export one row per artist to CSV.

    Parameters
    ----------
    tracks_all : pandas.DataFrame
        Track table; its ``'artist'`` column group is used, with the index
        turned into a regular column by ``reset_index``.
    artist_path : str, optional
        Destination CSV. The default is the location this notebook used to
        hard-code; pass another path to run on a different machine.
    """
    artist = tracks_all["artist"].reset_index()
    artist_df = compress_data(artist, 'artist_id')
    artist_columns = ['artist_id', 'favorites', 'name']
    output_df(artist_df, artist_columns, artist_path)

In [29]:
def setup_output_track(tracks_all, genre_data=None, tracks_path='/Users/mchuang/PycharmProjects/react-flask-app/flaskProject/data/tracks_compressed.csv'):
    """Export the track table, with album/artist ids and root genres, to CSV.

    Parameters
    ----------
    tracks_all : pandas.DataFrame
        Track table; its ``'track'`` column group supplies the rows and its
        ``'album'`` / ``'artist'`` groups supply the ids.
    genre_data : pandas.DataFrame, optional
        Genre table handed to ``track_form_genre_col``. When omitted, the
        notebook-level ``genres`` frame is used, which is what this function
        always did before the parameter existed.
    tracks_path : str, optional
        Destination CSV. The default is the location this notebook used to
        hard-code.
    """
    if genre_data is None:
        # Backward-compatible fallback to the frame loaded at notebook level.
        genre_data = genres
    tracks = tracks_all["track"].reset_index()
    tracks = track_add_artist_album_id(tracks, tracks_all)
    tracks['genre_id'] = track_form_genre_col(tracks, genre_data)
    tracks_columns = ['track_id', 'genre_id', 'album_id', 'artist_id', 'date_created', 'favorites', 'interest', 'listens', 'title']
    # Store table to db, through the same writer the other exports use
    output_df(tracks, tracks_columns, tracks_path)

In [30]:
def setup_output_genres(genres, genres_path='/Users/mchuang/PycharmProjects/react-flask-app/flaskProject/data/genres_compressed.csv'):
    """Export the root genres (rows whose ``parent`` is 0) to CSV.

    Parameters
    ----------
    genres : pandas.DataFrame
        Genre table with ``genre_id``, ``title`` and ``parent`` columns.
    genres_path : str, optional
        Destination CSV. The default is the location this notebook used to
        hard-code; pass another path to run on a different machine.
    """
    genres = genres.reset_index()
    genres_df = genres[genres['parent'] == 0]
    genres_columns = ['genre_id', 'title']
    output_df(genres_df, genres_columns, genres_path)

In [31]:
# Read in the data
# tracks.csv has a two-row column header; echonest.csv and features.csv have three rows.
tracks_all = pd.read_csv(
    'data/fma_metadata/tracks.csv',
    index_col=0,
    header=[0, 1],
)
# Keep only the rows whose ('set', 'subset') value is 'small'
tracks_small = tracks_all.loc[tracks_all[('set', 'subset')] == 'small']
genres = pd.read_csv('data/fma_metadata/genres.csv')
echonest = pd.read_csv(
    'data/fma_metadata/echonest.csv',
    index_col=0,
    header=[0, 1, 2],
)
features = pd.read_csv(
    'data/fma_metadata/features.csv',
    index_col=0,
    header=[0, 1, 2],
)

In [32]:
# Export the compressed album, artist, genre and track tables for the 'small' subset.
# Each call writes one CSV. `genres` must already be loaded: it is passed to
# setup_output_genres here and is also read from the notebook namespace by
# setup_output_track.
setup_output_album(tracks_small)
setup_output_artist(tracks_small)
setup_output_genres(genres)
setup_output_track(tracks_small)

In [None]:
# For analysis
# NOTE(review): `albums_df_with_na` is not defined in the visible cells — confirm its source.
# Row-validity mask: id and counters are non-negative and no required field is missing
album_valid = (
    (albums_df_with_na['album_id'] >= 0)
    & albums_df_with_na['date_created'].notna()
    & (albums_df_with_na['listens'] >= 0)
    & (albums_df_with_na['favorites'] >= 0)
    & albums_df_with_na['type'].notna()
    & albums_df_with_na['tracks_included'].notna()
)
# Keep only rows with valid values; 6.2% of the data dropped
album_df_no_na = albums_df_with_na[album_valid]

In [None]:
# Transform dummy types for album['type']
# One indicator column per distinct album type, copied onto the cleaned frame
album_type_dummy = pd.get_dummies(album_df_no_na['type'])
for type_name, indicator in album_type_dummy.items():
    album_df_no_na[type_name] = indicator

In [None]:
# Transform timestamp for album['date_created']
# Each "%Y-%m-%d %H:%M:%S" string is parsed and stored as its POSIX timestamp
album_df_no_na['timestamp'] = [
    datetime.datetime.strptime(created, "%Y-%m-%d %H:%M:%S").timestamp()
    for created in album_df_no_na['date_created']
]

In [None]:
# Store table to db
# NOTE(review): `tracks_df` is not assigned at notebook level in the visible
# cells, and this path lacks the 'react-flask-app' segment used by
# setup_output_track — confirm both before running this cell.
tracks_df.to_csv(path_or_buf='/Users/mchuang/PycharmProjects/flaskProject/data/tracks_compressed.csv', index=False)

In [None]:
# Compress the artist df so that there are no duplicate entries
# for tracks by the same artist: one row per artist id, with that artist's
# track ids gathered into a list (compress_data does the same with a set).
# NOTE(review): `artists` is not defined in the visible cells — confirm its source.
artists_with_track_id = artists.reset_index()
grouped_artists = artists_with_track_id.groupby('id')
grouped_list_artists = grouped_artists['track_id'].apply(list)
# Both merge inputs carry 'track_id', hence the '_x' (per row) / '_y' (per artist) suffixes
artists_track_listed = pd.merge(artists_with_track_id, grouped_list_artists, on='id')
artists_track_listed = artists_track_listed.drop(columns=['track_id_x'])
artists_track_compressed = artists_track_listed.drop_duplicates(subset='id')
artists_track_compressed = artists_track_compressed.rename(columns={'track_id_y': 'tracks_included'})

In [None]:
# Display the compressed artist table for inspection
artists_track_compressed

In [None]:
# Dataframe to store in db
# Copy 'id' to the front as 'artist_id', drop the original 'id', then keep
# only the exported columns.
# NOTE: this cell cannot be run twice in a row — after the first run 'id' is
# gone from artists_track_compressed, so the lookup in the insert below fails.
artists_track_compressed.insert(0, 'artist_id', artists_track_compressed['id'])
artists_track_compressed = artists_track_compressed.drop(columns=['id'])
artist_columns = ['artist_id', 'favorites', 'name']
artist_final = pd.DataFrame(data=artists_track_compressed, columns=artist_columns)

In [None]:
# Write the artist table to CSV without the index column.
# NOTE(review): this path lacks the 'react-flask-app' segment used by
# setup_output_artist — confirm which location is intended.
artist_final.to_csv(path_or_buf='/Users/mchuang/PycharmProjects/flaskProject/data/artist_compressed.csv', index=False)

In [None]:
# Transforming the top genre column in tracks to an index-based one
# NOTE(review): `tracks_df` is not assigned at notebook level in the visible cells — confirm its source.
top_genres_sep = genres.loc[genres['parent'] == 0, ['genre_id', 'title']].to_dict('records')

# Root genre title -> genre id
top_genres_dict = {record['title']: record['genre_id'] for record in top_genres_sep}

# Titles that are not a root genre are encoded as -1
top_genres_col = [top_genres_dict.get(title, -1) for title in tracks_df['genre_top']]
tracks_df['genre_id'] = top_genres_col

In [None]:
# Reload the compressed exports together with the raw echonest / features tables.
# tracks_compressed.csv is written by setup_output_track with a single header
# row, so it is read with the default header; asking for a two-row header
# would turn the first track into part of the column labels.
# Note: this rebinds the notebook-level `genres` to the root-genre-only export.
tracks = pd.read_csv('/Users/mchuang/PycharmProjects/react-flask-app/flaskProject/data/tracks_compressed.csv', index_col=0)
genres = pd.read_csv('/Users/mchuang/PycharmProjects/react-flask-app/flaskProject/data/genres_compressed.csv')
echonest = pd.read_csv('data/fma_metadata/echonest.csv',index_col=0, header=[0, 1, 2])
features = pd.read_csv('data/fma_metadata/features.csv',index_col=0, header=[0, 1, 2])

In [34]:
def read_data(data_dir='/Users/mchuang/PycharmProjects/MOJO_Project/data/fma_metadata'):
    """Load the track, genre and echonest tables from ``data_dir``.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory holding ``tracks.csv``, ``genres.csv`` and
        ``echonest.csv``. The default is the location this notebook used to
        hard-code; pass another directory to run on a different machine.

    Returns
    -------
    tuple
        ``(small_track, genre, echonest)`` where ``small_track`` holds only
        the rows of ``tracks.csv`` whose ``('set', 'subset')`` is ``'small'``.
    """
    # tracks.csv carries a two-row column header, echonest.csv a three-row one.
    whole_track = pd.read_csv(f'{data_dir}/tracks.csv', index_col=0, header=[0, 1])
    genre = pd.read_csv(f'{data_dir}/genres.csv')
    echonest = pd.read_csv(f'{data_dir}/echonest.csv', index_col=0, header=[0, 1, 2])
    small_track = whole_track[whole_track['set']['subset'] == 'small']
    return small_track, genre, echonest


def split_track_levels(whole_track):
    """Split the track table into its track, album and artist column groups.

    Each group is returned as its own frame with the index moved into a
    regular leading column.

    Returns
    -------
    tuple
        ``(track, album, artist)``.
    """
    track = whole_track['track'].reset_index(level=0)
    album = whole_track['album'].reset_index(level=0)
    artist = whole_track['artist'].reset_index(level=0)
    return track, album, artist


def split_echonest_levels(echonest):
    """Split the echonest table into its five feature groups.

    Every group under the top-level ``'echonest'`` key is returned as its
    own frame with the index moved into a regular leading column.

    Returns
    -------
    tuple
        ``(audio_features, metadata, ranks, social_features,
        temporal_features)``.
    """
    group_names = (
        'audio_features',
        'metadata',
        'ranks',
        'social_features',
        'temporal_features',
    )
    nested = echonest['echonest']
    return tuple(nested[name].reset_index(level=0) for name in group_names)


In [35]:
def extract_tracks(tracks):
    """Pick the track-level columns used for similarity and give them flat names.

    Returns a frame sharing ``tracks``' index with the columns
    ``track_comments``, ``track_favorites``, ``track_genre``,
    ``track_interest`` and ``track_listen``.
    """
    # Output column name -> field inside the ('track', ...) column group
    wanted = {
        'track_comments': 'comments',
        'track_favorites': 'favorites',
        'track_genre': 'genres_all',
        'track_interest': 'interest',
        'track_listen': 'listens',
    }
    return pd.DataFrame({flat: tracks[('track', field)] for flat, field in wanted.items()})


def extract_echonest(echonest):
    """Flatten the echonest table and keep its audio and social columns.

    Parameters
    ----------
    echonest : pandas.DataFrame
        Frame whose columns have (at least) three levels; the two outer
        levels are dropped here. The final merge joins on ``track_id``, so
        the index is presumably named ``track_id`` — TODO confirm.

    Returns
    -------
    pandas.DataFrame
        The first 8 columns (prefixed ``audio_``) joined with columns 20-24
        (prefixed ``social_``).
    """
    # Set up subset of echonest dataset
    echonest_no_level = echonest.copy()
    # Drop the two outer column levels so only the innermost labels remain
    no_level_columns = echonest_no_level.columns.droplevel(0).droplevel(0)
    echonest_no_level.columns = no_level_columns
    # Only the first 25 columns are used below
    echonest_no_level = pd.DataFrame(echonest_no_level.iloc[:, :25])

    # Prefix the labels by positional group: 0-7 audio, 8-14 metadata,
    # 15-19 ranks, 20-24 social.
    # NOTE(review): these boundaries are purely positional — assumes the
    # source column order matches this layout; confirm against the file.
    audio_columns = 'audio_' + no_level_columns[:8]
    metadata_columns = 'metadata_' + no_level_columns[8:15]
    ranks_columns = 'ranks_' + no_level_columns[15:20]
    social_columns = 'social_' + no_level_columns[20:25]
    echonest_no_level.columns = audio_columns.append(metadata_columns).append(ranks_columns).append(social_columns)
    # Keep the audio block and the social block; metadata and ranks are left out
    echonest_simplified = pd.merge(echonest_no_level.iloc[:, :8], echonest_no_level.iloc[:, 20:], on='track_id')
    return echonest_simplified


def extract_genre_list(list_str):
    """Parse a bracketed, comma-separated string such as ``'[1, 2]'`` into ints.

    The first and last characters are discarded. Empty or blank pieces are
    skipped, so ``'[]'`` yields an empty list instead of failing on
    ``int('')``.
    """
    return [int(piece) for piece in list_str[1:-1].split(',') if piece.strip()]


def add_genre_columns(data, genres):
    """Add one 0/1 indicator column per genre to ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``track_genre`` column of bracketed id strings
        (parsed by ``extract_genre_list``). It is modified in place.
    genres : pandas.DataFrame
        Genre table with ``title`` and ``genre_id`` columns; each title
        becomes the name of a new float column.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with the indicator columns appended.
    """
    # Parse every row once up front instead of once per genre.
    genre_sets = [set(extract_genre_list(raw_list)) for raw_list in data['track_genre']]
    # Pair each title with its id row by row; no int() on a whole Series.
    for title, genre_id in zip(genres['title'], genres['genre_id']):
        code = int(genre_id)
        data[title] = np.array(
            [1.0 if code in ids else 0.0 for ids in genre_sets],
            dtype=float,
        )
    return data


def setup_data(tracks, genres, echonest):
    """Build the feature table used for the similarity computation.

    Track-level columns and echonest columns are joined on ``track_id``,
    duplicate rows are removed, one indicator column per genre is added and
    the raw ``track_genre`` string column is dropped.

    Parameters
    ----------
    tracks : pandas.DataFrame
        Track table passed to ``extract_tracks``.
    genres : pandas.DataFrame
        Genre table passed to ``add_genre_columns``.
    echonest : pandas.DataFrame
        Echonest table passed to ``extract_echonest``.

    Returns
    -------
    pandas.DataFrame
        One row per track that appears in both inputs.
    """
    track_simplified = extract_tracks(tracks)
    echonest_simplified = extract_echonest(echonest)
    data = pd.merge(track_simplified, echonest_simplified, on='track_id')
    # drop_duplicates returns a new frame; keep it, otherwise nothing is removed.
    data = data.drop_duplicates()
    data = add_genre_columns(data, genres)
    data = data.drop(labels=['track_genre'], axis=1)
    return data


def similarity_df(data):
    """Return the cosine similarity of every unordered pair of rows in ``data``.

    The result has one row per pair ``(a, b)`` with ``a`` before ``b`` in
    ``data``, labelled by ``data.index``, in the columns ``track_a``,
    ``track_b`` and ``score``.
    """
    scores = pw.cosine_similarity(data, data)
    n_rows = len(scores)
    labels = data.index
    # Upper triangle only: the matrix is symmetric and the diagonal is skipped.
    pairs = [
        (labels[a], labels[b], scores[a][b])
        for a in range(n_rows - 1)
        for b in range(a + 1, n_rows)
    ]
    return pd.DataFrame(pairs, columns=["track_a", "track_b", "score"])

def output_sim_table(tracks, genres, echonest, sim_path='/Users/mchuang/PycharmProjects/react-flask-app/flaskProject/data/similarity.csv'):
    """Compute the pairwise track similarity table and write it to CSV.

    Parameters
    ----------
    tracks, genres, echonest : pandas.DataFrame
        Inputs forwarded to ``setup_data``.
    sim_path : str, optional
        Destination CSV. The default is the location this notebook used to
        hard-code; pass another path to run on a different machine.
    """
    data = setup_data(tracks, genres, echonest)
    sim = similarity_df(data)
    sim.to_csv(path_or_buf=sim_path, index=False)


In [36]:
# Load the inputs and write the pairwise similarity table.
# Note: this rebinds the notebook-level name `echonest`.
track, genre, echonest = read_data()
output_sim_table(track, genre, echonest)