## Building data sets 

In this notebook, we assemble the `all_tracks.csv` and `all_tracks_with_labels.csv` data sets.

In [1]:
import pandas as pd
import numpy as np
import os, itertools
from analysis_utils import get_all_entries

### all tracks data set

In [2]:
# Load the unique-artist table and derive an artist_id -> gender lookup from it.
artists = pd.read_csv('../spotify_data/all_unique_artist.csv')
artist_genders = artists.set_index('artist_id')['gender'].to_dict()

In [3]:
def get_entries(curation, index_cols, artist_genders):
    '''Collect every playlist snapshot of one curation type into a single frame.

    Same as get_all_entries, only extended to include playlist name and artist gender.

    Parameters
    ----------
    curation : str
        Name of the sub-directory of ``../spotify_data/playlist_tracks/`` to walk.
        Each sub-directory of it is treated as one playlist, and each regular
        file inside a playlist directory is read as a CSV.
    index_cols : list-like of str
        Columns to keep from every CSV. Must contain 'artist_id', which is
        needed for the gender lookup.
    artist_genders : dict
        Maps artist_id -> gender.

    Returns
    -------
    pandas.DataFrame
        The selected columns plus 'playlist_name' (the playlist directory name)
        and 'gender'. Each CSV keeps its own row index (the index is not reset).
        If no CSV is found, an empty frame with those columns is returned.

    Raises
    ------
    KeyError
        If a CSV lacks one of ``index_cols`` or an artist_id is not a key of
        ``artist_genders``.
    '''
    rootdir = '../spotify_data/playlist_tracks/%s/' % curation
    wanted = list(index_cols)

    dfs_all = []
    # sorted(): os.listdir returns entries in arbitrary order, which would make
    # the row order of the result differ between machines / file systems.
    for playlist in sorted(os.listdir(rootdir)):
        playlist_dir = os.path.join(rootdir, playlist)
        if not os.path.isdir(playlist_dir):
            continue

        for file in sorted(os.listdir(playlist_dir)):
            filepath = os.path.join(playlist_dir, file)
            if os.path.isdir(filepath):
                continue
            # .copy() gives an independent frame, so adding columns below never
            # triggers SettingWithCopyWarning when index_cols is a subset.
            df = pd.read_csv(filepath)[wanted].copy()
            df['playlist_name'] = playlist
            # strict lookup on purpose: an unknown artist_id raises KeyError
            df['gender'] = [artist_genders[artist_id] for artist_id in df.artist_id]
            dfs_all.append(df)

    if not dfs_all:
        # pd.concat([]) would raise "No objects to concatenate"
        return pd.DataFrame(columns=wanted + ['playlist_name', 'gender'])
    return pd.concat(dfs_all)

In [4]:
# Only the column names of a playlist snapshot are needed here, so read just
# the header row (nrows=0) instead of parsing the whole file.
cols = pd.read_csv('../spotify_data/playlist_tracks/spotify_curated/Classic_Road_Trip_Songs/2019-06-16.csv', nrows=0).columns

In [5]:
# Every row of every snapshot under 'spotify_curated', keeping the columns in `cols`.
all_spotify_curated = get_entries('spotify_curated', artist_genders=artist_genders, index_cols=cols)

In [6]:
# Every row of every snapshot under 'user_curated', keeping the columns in `cols`.
all_user_curated = get_entries('user_curated', artist_genders=artist_genders, index_cols=cols)

In [7]:
# Row counts of the two frames: (spotify curated, user curated).
len(all_spotify_curated), len(all_user_curated)

(103189, 109010)

In [8]:
# Tag every row with the frame it came from: 's' for the spotify_curated
# frame, 'u' for the user_curated frame.
all_spotify_curated['playlist_type'] = 's'
all_user_curated['playlist_type'] = 'u'

In [9]:
# Stack both frames row-wise; the original row indices are kept (no ignore_index).
all_tracks = pd.concat([all_spotify_curated, all_user_curated])

In [10]:
# Sanity check: the extended reader must yield exactly as many rows as the
# reference helper get_all_entries does over the same two curation folders.
assert len(all_tracks) == sum(
    len(get_all_entries(curation, ['track_id']))
    for curation in ('spotify_curated', 'user_curated')
)

In [197]:
# Write the combined frame to disk; index=False leaves the row index out of the file.
all_tracks.to_csv('../spotify_data/all_tracks.csv', index=False)

### label data set

Since we count a track associated with multiple labels as multiple separate label appearances, we need to build a different data set for the record label analysis.

In [180]:
# Track table with an `album_label` column; a track_id can occur on several
# rows, which the following cells treat as one track carrying several labels.
tracks = pd.read_csv('../spotify_data/all_unique_track_with_labels.csv')

In [181]:
# Flag every row whose track_id already appeared on an earlier row.
is_repeat = tracks.duplicated(subset=['track_id'], keep='first')
# first occurrence of each track_id
unique_tracks = tracks[~is_repeat]
# tracks that are duplicated (second and later occurrences)
duplicates = tracks[is_repeat]

In [182]:
# track_id -> label taken from the first row of each track.
track_labels = {
    track_id: album_label
    for track_id, album_label in unique_tracks[['track_id', 'album_label']].itertuples(index=False, name=None)
}

In [183]:
# Fold the extra rows of multi-label tracks into track_labels: a track with
# several labels ends up mapped to a list of labels, every other track keeps
# a single scalar value.
for track_id, second_label in zip(duplicates.track_id, duplicates.album_label):
    cur_label = track_labels[track_id]
    if isinstance(cur_label, list):
        # third or later label: the list is already stored in the dict,
        # so appending in place is enough
        cur_label.append(second_label)
    elif isinstance(cur_label, str):
        # second label: promote the single label to a list
        track_labels[track_id] = [cur_label, second_label]
    elif pd.isna(cur_label):
        # the first row had no label (NaN): adopt this row's label instead
        # of silently discarding it
        track_labels[track_id] = second_label

In [184]:
# track with three labels
# Spot check: a track whose rows were merged into a list of three labels.
track_labels['7gl2cxHUqmZm1gbva9fLku']

['Resilience Records', 'Nothing Else Matters', 'RCA']

In [185]:
# Merging the duplicate rows must neither add nor drop a track_id.
assert len(track_labels) == len(unique_tracks)

In [162]:
def get_all_label_entries(curation, track_labels):
    '''Build one (track, playlist, label) row per label appearance.

    Walks ``../spotify_data/playlist_tracks/<curation>/``: every sub-directory
    is a playlist, and every regular file inside it is read as a CSV with a
    'track_id' column.

    Parameters
    ----------
    curation : str
        Name of the sub-directory of ``../spotify_data/playlist_tracks/`` to walk.
    track_labels : dict
        Maps track_id to either a single label or a list of labels.

    Returns
    -------
    pandas.DataFrame
        Columns 'track_id', 'playlist_name', 'label'. A track mapped to a list
        of labels contributes one row per label; a track missing from
        ``track_labels`` contributes one row with label NaN.
    '''
    rootdir = '../spotify_data/playlist_tracks/%s/' % curation

    rows = []
    # sorted(): os.listdir returns entries in arbitrary order, which would make
    # the row order of the result differ between machines / file systems.
    for playlist in sorted(os.listdir(rootdir)):
        playlist_dir = os.path.join(rootdir, playlist)
        if not os.path.isdir(playlist_dir):
            continue
        for file in sorted(os.listdir(playlist_dir)):
            filepath = os.path.join(playlist_dir, file)
            if os.path.isdir(filepath):
                continue

            for track in pd.read_csv(filepath)['track_id']:
                # unknown tracks still count as one appearance, with label NaN
                label = track_labels.get(track, np.nan)
                if isinstance(label, list):
                    # track with multiple labels => multiple appearances
                    rows.extend([track, playlist, name] for name in label)
                else:
                    rows.append([track, playlist, label])

    return pd.DataFrame(rows, columns=['track_id', 'playlist_name', 'label'])

In [163]:
# One row per (track, playlist, label) appearance under 'spotify_curated'.
spotify_labels = get_all_label_entries('spotify_curated', track_labels)

In [164]:
# One row per (track, playlist, label) appearance under 'user_curated'.
user_labels = get_all_label_entries('user_curated', track_labels)

In [165]:
# Tag every row with the frame it came from: 's' for the spotify_curated
# frame, 'u' for the user_curated frame.
spotify_labels['playlist_type'] = 's'
user_labels['playlist_type'] = 'u'

In [166]:
# Stack both frames row-wise; the original row indices are kept (no ignore_index).
all_labels = pd.concat([spotify_labels, user_labels])

In [168]:
# Display the combined label frame.
all_labels

Unnamed: 0,track_id,playlist_name,label,playlist_type
0,2zvot9pY2FNl1E94kc4K8M,Classic_Road_Trip_Songs,Mercury Records Limited,s
1,45yEy5WJywhJ3sDI28ajTm,Classic_Road_Trip_Songs,EMI Catalogue,s
2,0FeCO85RKW8fDRytwXof2x,Classic_Road_Trip_Songs,Now! Music,s
3,1XDgeeNIbwXobo7EkcLa2u,Classic_Road_Trip_Songs,Universal Music Spain S.L.,s
4,7Ar4G7Ci11gpt6sfH9Cgz5,Classic_Road_Trip_Songs,Rhino,s
...,...,...,...,...
122014,7gl2cxHUqmZm1gbva9fLku,Running_Workout_Goals,Resilience Records,u
122015,7gl2cxHUqmZm1gbva9fLku,Running_Workout_Goals,Nothing Else Matters,u
122016,7gl2cxHUqmZm1gbva9fLku,Running_Workout_Goals,RCA,u
122017,6oJ6le65B3SEqPwMRNXWjY,Running_Workout_Goals,Kygo,u


In [196]:
# Write the label data set to disk; index=False leaves the row index out of the file.
all_labels.to_csv('../spotify_data/all_tracks_with_labels.csv', index=False)