# Music Popularity Analysis

<hr style="border:2px solid black"> </hr>

## Notebook 01 - Data Prep

---

### Import libraries

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time

In [2]:
def num_uniques(ser):
    try:
        return len(ser.unique())
    except:
        return "Not unique check-able"

In [3]:
def summarize_df(df):
    print("======DATA SUMMARY======")
    print("{} rows by {} columns".format(df.shape[0], df.shape[1]))
    print("\n======COLUMNS======")
    print(df.dtypes)
    print("\n======PREVIEW======")
    display(df.head())
    print("\n======NUMERICAL COL SUMMARY======")
    print(df.describe())
    print("\n")
    for col in df.columns:
        print("{}: {} unique values".format(col, num_uniques(df[col])))

---
### Import data

In [4]:
# Releases - album info
# Field names from MB website
rel_names = ['release_id', 'release_gid', 'release_name', 'artist_credit_id', 'barcode']
releases = pd.read_csv('../data/mbdump/mbdump/release', delimiter='\t', names=rel_names, usecols=[0, 1, 2, 3,  9])

In [5]:
summarize_df(releases)

3187458 rows by 5 columns

release_id           int64
release_gid         object
release_name        object
artist_credit_id     int64
barcode             object
dtype: object



Unnamed: 0,release_id,release_gid,release_name,artist_credit_id,barcode
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,60,\N
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,60,\N
2,3257193,6072a02d-e3cb-4f6a-b29c-526e8a0c4873,Kriminaltango et al,1,\N
3,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,60,\N
4,26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,20211,\N



         release_id  artist_credit_id
count  3.187458e+06      3.187458e+06
mean   1.747545e+06      9.400387e+05
std    9.706914e+05      1.005470e+06
min    1.000000e+00      0.000000e+00
25%    9.272332e+05      5.019100e+04
50%    1.770142e+06      5.247780e+05
75%    2.584914e+06      1.630287e+06
max    3.395452e+06      3.256662e+06


release_id: 3187458 unique values
release_gid: 3187458 unique values
release_name: 2010149 unique values
artist_credit_id: 823692 unique values
barcode: 1207777 unique values


In [6]:
# Artist credit
# Field names from MB website
ac_names = ['artist_credit_id', 'artist_credit_name']
artist_credit = pd.read_csv('../data/mbdump/mbdump/artist_credit', delimiter='\t', names=ac_names, usecols=[0, 1])

In [7]:
summarize_df(artist_credit)

2268032 rows by 2 columns

artist_credit_id       int64
artist_credit_name    object
dtype: object



Unnamed: 0,artist_credit_id,artist_credit_name
0,202094,Ani & Beau
1,485926,The Ascient Rebels
2,1009634,Charlie Parker feat. Dizzy Gillespie
3,548770,Dead Blood
4,322256,Dr. Kreator



       artist_credit_id
count      2.268032e+06
mean       1.809475e+06
std        9.221180e+05
min        1.000000e+00
25%        1.076352e+06
50%        1.899412e+06
75%        2.625079e+06
max        3.256662e+06


artist_credit_id: 2268032 unique values
artist_credit_name: 2155495 unique values


In [8]:
rel_credit = releases.merge(artist_credit, on='artist_credit_id')

In [9]:
summarize_df(rel_credit)

3185994 rows by 6 columns

release_id             int64
release_gid           object
release_name          object
artist_credit_id       int64
barcode               object
artist_credit_name    object
dtype: object



Unnamed: 0,release_id,release_gid,release_name,artist_credit_id,barcode,artist_credit_name
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,60,\N,Tori Amos
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,60,\N,Tori Amos
2,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,60,\N,Tori Amos
3,3,4c31aef1-177e-4bba-9a56-180e21a0d043,Boys for Pele,60,\N,Tori Amos
4,234,6b0386ee-e412-44a9-8d3c-ad54fe5cbdce,Ultra Rare Tori,60,\N,Tori Amos



         release_id  artist_credit_id
count  3.185994e+06      3.185994e+06
mean   1.747442e+06      9.399492e+05
std    9.706978e+05      1.005442e+06
min    1.000000e+00      1.000000e+00
25%    9.271302e+05      5.015500e+04
50%    1.769956e+06      5.247410e+05
75%    2.584805e+06      1.630081e+06
max    3.395452e+06      3.256662e+06


release_id: 3185994 unique values
release_gid: Not unique check-able unique values
release_name: 2009308 unique values
artist_credit_id: 823349 unique values
barcode: 1207286 unique values
artist_credit_name: 780027 unique values


In [10]:
# Medium

# Field names from MB website
medium_names = ['medium_id', 'release_id']
medium = pd.read_csv('../data/mbdump/mbdump/medium', delimiter='\t', names=medium_names, usecols=[0, 1]) 

In [11]:
summarize_df(medium)

3539822 rows by 2 columns

medium_id     int64
release_id    int64
dtype: object



Unnamed: 0,medium_id,release_id
0,288902,288902
1,600623,600623
2,600626,600626
3,600627,600627
4,7716,7716



          medium_id    release_id
count  3.539822e+06  3.539822e+06
mean   1.864473e+06  1.739269e+06
std    1.068518e+06  9.662437e+05
min    1.000000e+00  1.000000e+00
25%    9.371882e+05  9.257842e+05
50%    1.875542e+06  1.759694e+06
75%    2.791744e+06  2.568767e+06
max    3.700246e+06  3.395452e+06


medium_id: 3539822 unique values
release_id: 3159201 unique values


In [12]:
rel_credit_medium = rel_credit.merge(medium, on='release_id', how='left')

KeyboardInterrupt: 

In [None]:
summarize_df(rel_upc_credit_medium)

In [None]:
# Track

# Field names from MB website
track_names = ['track_id', 'track_gid', 'recording_id', 'medium_id', 'track_name']
track = pd.read_csv('../data/mbdump/mbdump/track', delimiter='\t', names=track_names,
                   usecols=[0, 1, 2, 3, 6])

In [None]:
summarize_df(track)

In [None]:
# Recording

# Field names from MB website
recording_names = ['recording_id']
recording = pd.read_csv('../data/mbdump/mbdump/recording', delimiter='\t', names=recording_names,
                   usecols=[0])

In [None]:
summarize_df(recording)

In [None]:
# ISRC

# Field names from MB website
isrc_names = ['recording_id', 'isrc']
isrc = pd.read_csv('../data/mbdump/mbdump/isrc', delimiter='\t', names=isrc_names, usecols=[0, 2])

In [None]:
summarize_df(isrc)

In [None]:
recording_isrc = recording.merge(isrc, on='recording_id')
mb_db_songs = track.merge(recording_isrc, on='recording_id')
mb_db_songs = mb_db_songs.merge(rel_credit_medium, on='medium_id')

In [None]:
summarize_df(mb_db_songs)

In [None]:
# Write out barcodes & full file
mb_db_isrcs = mb_db_songs['isrc'].unique()
np.savetxt('mb_db_isrcs.csv', mb_db_isrcs, delimiter=',')