In [4]:
import pandas as pd
import numpy as np
import os, re
from datetime import date, timedelta

In [5]:
# Load the table of unique tracks; later cells use its track_name, track_id and album_label columns.
unique_tracks = pd.read_csv('../spotify_data/all_unique_track.csv')

In [6]:
# Preview the first rows to check that the columns loaded as expected.
unique_tracks.head()

Unnamed: 0,track_name,track_id,album_label
0,Billie Jean,5ChkMS8OtdzJeqyybCc9R5,Epic
1,Africa,2374M0fQpWi3dLnB54qaLX,UMC (Universal Music Catalogue)
2,Baba O'Riley - ConfidentialMX Remix,1DDEqSKrXxKkMNA1AfcJZa,Geffen
3,Sweet Dreams (Are Made of This) - Remastered,1TfqLAPs4K3s2rJMoCokcS,J Records
4,Don't Stop Believin',77NNZQSqzLNqh2A9JhLRkg,Sony Music Entertainment


In [10]:
# Keep only the label column and report what percentage of tracks carries no label at all.
label_list = unique_tracks[['album_label']]
missing_labels = label_list[label_list['album_label'].isna()]
print((len(missing_labels) / len(label_list)) * 100, '% labels missing')

2.2314297124600637 % labels missing


## Record Label Statistics
#### Quick Summary:
- Surprisingly large number of unique record labels... perhaps suspiciously so
- Around 3/5 of the unique record labels (3,151 of 5,021) appear on only one song
- Only 25 labels have more than 100 unique songs
- Spotify Curated lists have more one-track label appearances than top-10 label track appearances
- User Curated lists have more top-10 label track appearances than one-track label appearances

The label tags probably require a closer look and some cleaning

In [13]:
# NOTE(review): `labels_sorted` is only created further down (cell In [20]); on a fresh
# kernel this cell raises NameError unless that cell has been run first.
labels_sorted.tail()

Unnamed: 0,label,count
990,Futuristic Lingo,1
2770,Piano Novel,1
989,HACKED,1
988,908811 Records DK2,1
4923,Brushfire Records/Universal,1


In [19]:
# Count on how many unique tracks each record label appears.
# Collaborations are stored as 'label/label/label', so every '/'-separated part is
# credited separately. Dict insertion order (first appearance) is kept, so the row
# index of `labels` stays stable from run to run.
label_dict = {}

# dropna() skips every kind of missing value (NaN, None, pd.NA); the previous identity
# test `x is np.nan` only recognised the np.nan singleton and would crash on the rest.
for raw_label in label_list['album_label'].dropna():
    for label_name in str(raw_label).split('/'):
        label_dict[label_name] = label_dict.get(label_name, 0) + 1

# Building from (label, count) pairs with explicit column names also works when no
# label was found at all, yielding an empty two-column frame instead of an error.
labels = pd.DataFrame(list(label_dict.items()), columns=['label', 'count'])

In [20]:
# Rank the labels from most to fewest tracks and show the five largest.
labels_sorted = labels.sort_values('count', ascending=False)
labels_sorted.head()

Unnamed: 0,label,count
6,Columbia,759
4,Sony Music Entertainment,526
1938,Chillhop Records,320
338,Armada Music Bundles,268
50,RCA Records Label,251


In [21]:
# Summary statistics of the per-label track counts (the only numeric column).
labels_sorted.describe()

Unnamed: 0,count
count,5021.0
mean,4.282215
std,19.101371
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,759.0


In [22]:
# Headline numbers: how many distinct labels exist and which ones have the most tracks.
print('Amount of unique record labels: %s (%s unique songs)\n' % (len(labels_sorted), len(unique_tracks)))
print('Most popular labels: ')

# head(10) instead of indexing rows 0..9 by position: with fewer than ten labels the
# listing is simply shorter rather than failing with an IndexError.
top_labels = labels_sorted.head(10)
for label_name, track_count in zip(top_labels['label'], top_labels['count']):
    print('\t%s: %s tracks' % (label_name, track_count))

Amount of unique record labels: 5021 (20032 unique songs)

Most popular labels: 
	Columbia: 759 tracks
	Sony Music Entertainment: 526 tracks
	Chillhop Records: 320 tracks
	Armada Music Bundles: 268 tracks
	RCA Records Label: 251 tracks
	Epic: 234 tracks
	RCA Records: 219 tracks
	Lowly: 208 tracks
	Atlantic Records: 200 tracks
	Now! Music: 171 tracks


In [23]:
# Size distribution of the labels: singletons, small labels, and the few large ones.
track_counts = labels_sorted['count']

print('Amount of labels with one track: %s' % (track_counts == 1).sum())
print('\nAmount of labels with less than 5 tracks: %s' % (track_counts < 5).sum())
print('Amount of labels with less than 10 tracks: %s' % (track_counts < 10).sum())
print('Amount of labels with less than 25 tracks: %s' % (track_counts < 25).sum())
print('\nAmount of labels with more than 50 tracks: %s' % (track_counts > 50).sum())
print('Amount of labels with more than 100 tracks: %s' % (track_counts > 100).sum())

Amount of labels with one track: 3151

Amount of labels with less than 5 tracks: 4343
Amount of labels with less than 10 tracks: 4697
Amount of labels with less than 25 tracks: 4894

Amount of labels with more than 50 tracks: 61
Amount of labels with more than 100 tracks: 25


In [24]:
def _count_playlist_appearances(tracks, rootdir,
                                first_snapshot=date(2019, 5, 12),
                                last_snapshot=date(2019, 8, 25)):
    """Count how often `tracks` occur in the weekly playlist snapshots below `rootdir`.

    Every directory found by walking `rootdir` is treated as one playlist, except
    `rootdir` itself and any path ending in '/additions_and_removals'. For each
    playlist, the files '<YYYY-MM-DD>.csv' are read for consecutive weekly dates
    starting at `first_snapshot` and stopping after `last_snapshot`.

    Parameters
    ----------
    tracks : pandas.DataFrame
        Must contain a 'track_id' column.
    rootdir : str
        Directory holding one sub-directory per playlist.
    first_snapshot, last_snapshot : datetime.date
        First and last weekly snapshot dates to read.

    Returns
    -------
    tuple of (int, int)
        (number of track_id matches, total number of playlist rows read).

    Any exception raised by `pd.read_csv` (e.g. for a missing snapshot file)
    propagates to the caller.
    """
    # Loop-invariant: only the id column is ever needed for the join.
    track_ids = tracks[['track_id']]
    # Raw string so the pattern contains no invalid escape sequences.
    skip_dir = re.compile(r'.*/additions_and_removals')

    appearances = 0
    total_tracks = 0
    for subdir, _dirs, files in os.walk(rootdir):
        # The root only contains playlist folders; additions_and_removals folders
        # are not read.
        if subdir == rootdir or skip_dir.fullmatch(str(subdir)):
            continue

        file_date = first_snapshot
        # At most len(files) - 1 snapshots are read per playlist -- presumably each
        # folder holds one non-snapshot file; TODO confirm against the data layout.
        for _ in range(len(files) - 1):
            playlist = pd.read_csv(subdir + '/' + str(file_date) + '.csv')
            # The inner join yields one row per matching (track, playlist entry) pair.
            appearances += len(pd.merge(track_ids, playlist[['track_id']], how='inner', on='track_id'))
            total_tracks += len(playlist)

            if file_date == last_snapshot:
                break
            file_date = file_date + timedelta(days=7)

    return appearances, total_tracks


# function to count amount playlist appearances of a label or group of labels' songs
def get_amount_of_appearances(tracks):
    """Print how often `tracks` appear in the Spotify-curated and user-curated playlists.

    Parameters
    ----------
    tracks : pandas.DataFrame
        Tracks of a label or group of labels; must contain a 'track_id' column.

    Returns
    -------
    None
        Two lines are printed, one per playlist collection, each giving the number
        of appearances and the total number of playlist rows that were scanned.
    """
    spotify_count, spotify_tracks = _count_playlist_appearances(
        tracks, '../spotify_data/playlist_tracks/spotify_curated/')
    user_count, user_tracks = _count_playlist_appearances(
        tracks, '../spotify_data/playlist_tracks/user_curated/')

    print('\tSpotify Curated Appearances: %s of %s total tracks' % (spotify_count, spotify_tracks))
    print('\tUser Curated Appearances: %s of %s total tracks' % (user_count, user_tracks))

In [132]:
# one tracks as all the tracks with a one-song label
one_track_label = labels_sorted[labels_sorted['count'] == 1]
unique_tracks.columns = ['track_name','track_id','label']
one_tracks = pd.merge(one_track_label[['label']], unique_tracks[['track_id','label']], how='inner')

In [154]:
# top 10 labels list of tracks
top_10_labels = labels_sorted.iloc[0:10]
top_10_tracks = pd.merge(top_10_labels[['label']], unique_tracks[['track_id','label']], how='inner')
len(top_10_tracks)

2804

There seem to be collaborations between record labels, where the labels are written as label/label/label

In [191]:
# Count the labels that look like collaborations, i.e. 'label/label[/label...]'.
# Raw string so that \w, \s and \/ reach the regex engine without invalid-escape warnings.
# NOTE(review): the parts after a '/' may only contain word characters (no spaces), so
# something like 'A/B Records' is not counted -- confirm whether that is intended.
collab_pattern = re.compile(r'[\w\s]+(\/\w+)+')

collab_labels = [
    label_name
    for label_name in labels_sorted['label']
    if collab_pattern.fullmatch(str(label_name))
]
count = len(collab_labels)

# Show a label that actually matched (the last one found), not merely the last label
# iterated; prints None when there is no collaboration label at all.
print('Ex: ', collab_labels[-1] if collab_labels else None)
print(count)

Ex:  Brushfire Records/Universal
110


110 of them. I'll figure out a way to deal with this; I don't think they should count as unique labels.

**Update (Cheng, Nov 3, 2019):** these have been dealt with.

### Comparing the presence of 'one-track' label tracks vs. top-10 label tracks on Spotify-curated and user-curated lists

In [162]:
# Compare playlist presence of the two track groups, one report per group.
for heading, track_group in (
    ('One-Track Label Appearances: ', one_tracks),
    ('\nTop 10 Label Appearances: ', top_10_tracks),
):
    print(heading)
    get_amount_of_appearances(track_group)

One-Track Label Appearances: 
	Spotify Curated Appearances: 14601 of 96553 total tracks
	User Curated Appearances: 8270 of 102099 total tracks

Top 10 Label Appearances: 
	Spotify Curated Appearances: 9798 of 96553 total tracks
	User Curated Appearances: 20839 of 102099 total tracks
