# Music Popularity Analysis

<hr style="border:2px solid black"> </hr>

## Notebook 01 - Data Prep

---

### Import libraries

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time

In [2]:
def num_uniques(ser):
    try:
        return len(ser.unique())
    except:
        return "Not unique check-able"

In [3]:
def summarize_df(df):
    print("======DATA SUMMARY======")
    print("{} rows by {} columns".format(df.shape[0], df.shape[1]))
    print("\n======COLUMNS======")
    print(df.dtypes)
    print("\n======PREVIEW======")
    display(df.head())
    print("\n======NUMERICAL COL SUMMARY======")
    print(df.describe())
    print("\n")
    for col in df.columns:
        print("{}: {} unique values".format(col, num_uniques(df[col])))

---
### Import data

In [4]:
# Releases - album info
# Field names from MB website
rel_names = ['release_id', 'release_gid', 'release_name', 'artist_credit_id', 'barcode']
releases = pd.read_csv('../data/mbdump/mbdump/release', delimiter='\t', names=rel_names, usecols=[0, 1, 2, 3,  9])

In [5]:
summarize_df(releases)

3187458 rows by 5 columns

release_id           int64
release_gid         object
release_name        object
artist_credit_id     int64
barcode             object
dtype: object



Unnamed: 0,release_id,release_gid,release_name,artist_credit_id,barcode
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,60,\N
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,60,\N
2,3257193,6072a02d-e3cb-4f6a-b29c-526e8a0c4873,Kriminaltango et al,1,\N
3,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,60,\N
4,26,dd245091-b21e-48a3-b59a-f9b8ed8a0469,Demons,20211,\N



         release_id  artist_credit_id
count  3.187458e+06      3.187458e+06
mean   1.747545e+06      9.400387e+05
std    9.706914e+05      1.005470e+06
min    1.000000e+00      0.000000e+00
25%    9.272332e+05      5.019100e+04
50%    1.770142e+06      5.247780e+05
75%    2.584914e+06      1.630287e+06
max    3.395452e+06      3.256662e+06


release_id: 3187458 unique values
release_gid: 3187458 unique values
release_name: 2010149 unique values
artist_credit_id: 823692 unique values
barcode: 1207777 unique values


In [6]:
# Artist credit
# Field names from MB website
ac_names = ['artist_credit_id', 'artist_credit_name']
artist_credit = pd.read_csv('../data/mbdump/mbdump/artist_credit', delimiter='\t', names=ac_names, usecols=[0, 1])

In [7]:
summarize_df(artist_credit)

2268032 rows by 2 columns

artist_credit_id       int64
artist_credit_name    object
dtype: object



Unnamed: 0,artist_credit_id,artist_credit_name
0,202094,Ani & Beau
1,485926,The Ascient Rebels
2,1009634,Charlie Parker feat. Dizzy Gillespie
3,548770,Dead Blood
4,322256,Dr. Kreator



       artist_credit_id
count      2.268032e+06
mean       1.809475e+06
std        9.221180e+05
min        1.000000e+00
25%        1.076352e+06
50%        1.899412e+06
75%        2.625079e+06
max        3.256662e+06


artist_credit_id: 2268032 unique values
artist_credit_name: 2155495 unique values


In [8]:
rel_credit = releases.merge(artist_credit, on='artist_credit_id')

In [9]:
summarize_df(rel_credit)

3185994 rows by 6 columns

release_id             int64
release_gid           object
release_name          object
artist_credit_id       int64
barcode               object
artist_credit_name    object
dtype: object



Unnamed: 0,release_id,release_gid,release_name,artist_credit_id,barcode,artist_credit_name
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,60,\N,Tori Amos
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,60,\N,Tori Amos
2,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,60,\N,Tori Amos
3,3,4c31aef1-177e-4bba-9a56-180e21a0d043,Boys for Pele,60,\N,Tori Amos
4,234,6b0386ee-e412-44a9-8d3c-ad54fe5cbdce,Ultra Rare Tori,60,\N,Tori Amos



         release_id  artist_credit_id
count  3.185994e+06      3.185994e+06
mean   1.747442e+06      9.399492e+05
std    9.706978e+05      1.005442e+06
min    1.000000e+00      1.000000e+00
25%    9.271302e+05      5.015500e+04
50%    1.769956e+06      5.247410e+05
75%    2.584805e+06      1.630081e+06
max    3.395452e+06      3.256662e+06


release_id: 3185994 unique values
release_gid: 3185994 unique values
release_name: 2009308 unique values
artist_credit_id: 823349 unique values
barcode: 1207286 unique values
artist_credit_name: 780027 unique values


In [10]:
# Release country
# Field names from MB website
rel_country_names = ['release_id', 'date_year']
release_country = pd.read_csv('../data/mbdump/mbdump/release_country', delimiter='\t', names=rel_country_names, usecols=[0, 2])

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
summarize_df(release_country)

8002924 rows by 2 columns

release_id     int64
date_year     object
dtype: object



Unnamed: 0,release_id,date_year
0,3,1997
1,1427792,2014
2,9,2002
3,10,2002
4,11,1999



         release_id
count  8.002924e+06
mean   2.503444e+06
std    8.494224e+05
min    1.000000e+00
25%    2.296257e+06
50%    2.800541e+06
75%    3.067161e+06
max    3.395452e+06


release_id: 2811760 unique values
date_year: 215 unique values


In [12]:
# Artist credit
# Field names from MB website
relu_country_names = ['release_id', 'date_year']
release_unkcountry = pd.read_csv('../data/mbdump/mbdump/release_unknown_country', delimiter='\t', names=relu_country_names, usecols=[0, 1])

In [13]:
summarize_df(release_unkcountry)

222660 rows by 2 columns

release_id     int64
date_year     object
dtype: object



Unnamed: 0,release_id,date_year
0,1372866,1998
1,1089598,2010
2,1147748,2011
3,1068236,2006
4,1148229,2008



         release_id
count  2.226600e+05
mean   2.121672e+06
std    7.946010e+05
min    5.652000e+03
25%    1.441166e+06
50%    2.174603e+06
75%    2.819434e+06
max    3.395442e+06


release_id: 222660 unique values
date_year: 143 unique values


In [14]:
rel_credit_year = rel_credit.merge(release_country, on='release_id', how='left')

In [15]:
summarize_df(rel_credit_year)

8369303 rows by 7 columns

release_id             int64
release_gid           object
release_name          object
artist_credit_id       int64
barcode               object
artist_credit_name    object
date_year             object
dtype: object



Unnamed: 0,release_id,release_gid,release_name,artist_credit_id,barcode,artist_credit_name,date_year
0,9,425cf29a-1490-43ab-abfa-7b17a2cec351,A Sorta Fairytale,60,\N,Tori Amos,2002
1,10,a96e1d03-e685-3627-8cba-f5b96be7158f,A Sorta Fairytale,60,\N,Tori Amos,2002
2,12,9660928f-7cd8-4fef-852d-5599dc4ad3ec,Silent All These Years,60,\N,Tori Amos,1997
3,3,4c31aef1-177e-4bba-9a56-180e21a0d043,Boys for Pele,60,\N,Tori Amos,1997
4,234,6b0386ee-e412-44a9-8d3c-ad54fe5cbdce,Ultra Rare Tori,60,\N,Tori Amos,1994



         release_id  artist_credit_id
count  8.369303e+06      8.369303e+06
mean   2.471698e+06      1.254456e+06
std    8.619784e+05      1.140624e+06
min    1.000000e+00      1.000000e+00
25%    2.162442e+06      7.992600e+04
50%    2.790087e+06      1.021157e+06
75%    3.060198e+06      2.408974e+06
max    3.395452e+06      3.256662e+06


release_id: 3185994 unique values
release_gid: 3185994 unique values
release_name: 2009308 unique values
artist_credit_id: 823349 unique values
barcode: 1207286 unique values
artist_credit_name: 780027 unique values
date_year: 216 unique values


In [16]:
rel_credit_year_na = rel_credit_year[rel_credit_year['date_year'].isna()].drop('date_year', axis=1)

In [17]:
rel_credit_year_gapfill = rel_credit_year_na.merge(release_unkcountry, on='release_id', how='inner')

In [18]:
rel_credit_year_filled = pd.concat([rel_credit_year[~rel_credit_year['date_year'].isna()], rel_credit_year_gapfill])
rel_credit_year_filled = rel_credit_year_filled[rel_credit_year_filled['date_year'] != '\\N']
rel_credit_year_filled['date_year'] = rel_credit_year_filled['date_year'].astype(int)
rel_credit_year_filled.sort_values(by='date_year', inplace=True)
rel_credit_year_filled = rel_credit_year_filled.groupby(['release_id']).first()

In [19]:
summarize_df(rel_credit_year_filled)

2959368 rows by 6 columns

release_gid           object
release_name          object
artist_credit_id       int64
barcode               object
artist_credit_name    object
date_year              int64
dtype: object



Unnamed: 0_level_0,release_gid,release_name,artist_credit_id,barcode,artist_credit_name,date_year
release_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,02232360-337e-4a3f-ad20-6cdd4c34288c,Little Earthquakes,60,075678235825,Tori Amos,1992
2,290e10c5-7efc-4f60-ba2c-0dfc0208fbf5,Under the Pink,60,075678256721,Tori Amos,1994
3,4c31aef1-177e-4bba-9a56-180e21a0d043,Boys for Pele,60,\N,Tori Amos,1997
5,8f468f36-8c7e-4fc1-9166-50664d267127,Dummy,65,042282855329,Portishead,1994
6,9cbf7040-dbdc-403c-940f-7562d9712514,To Venus and Back,60,075678324222,Tori Amos,1999



       artist_credit_id     date_year
count      2.959368e+06  2.959368e+06
mean       9.650916e+05  2.005660e+03
std        1.012027e+06  1.363515e+01
min        1.000000e+00  1.000000e+00
25%        5.387000e+04  1.999000e+03
50%        5.642290e+05  2.009000e+03
75%        1.673075e+06  2.016000e+03
max        3.256662e+06  3.006000e+03


release_gid: 2959368 unique values
release_name: 1864625 unique values
artist_credit_id: 771228 unique values
barcode: 1179505 unique values
artist_credit_name: 731222 unique values
date_year: 172 unique values


In [20]:
rel_credit_year_filter = rel_credit_year_filled[rel_credit_year_filled['date_year'] > 0]

In [21]:
summarize_df(rel_credit_year_filter)

2959368 rows by 6 columns

release_gid           object
release_name          object
artist_credit_id       int64
barcode               object
artist_credit_name    object
date_year              int64
dtype: object



Unnamed: 0_level_0,release_gid,release_name,artist_credit_id,barcode,artist_credit_name,date_year
release_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,02232360-337e-4a3f-ad20-6cdd4c34288c,Little Earthquakes,60,075678235825,Tori Amos,1992
2,290e10c5-7efc-4f60-ba2c-0dfc0208fbf5,Under the Pink,60,075678256721,Tori Amos,1994
3,4c31aef1-177e-4bba-9a56-180e21a0d043,Boys for Pele,60,\N,Tori Amos,1997
5,8f468f36-8c7e-4fc1-9166-50664d267127,Dummy,65,042282855329,Portishead,1994
6,9cbf7040-dbdc-403c-940f-7562d9712514,To Venus and Back,60,075678324222,Tori Amos,1999



       artist_credit_id     date_year
count      2.959368e+06  2.959368e+06
mean       9.650916e+05  2.005660e+03
std        1.012027e+06  1.363515e+01
min        1.000000e+00  1.000000e+00
25%        5.387000e+04  1.999000e+03
50%        5.642290e+05  2.009000e+03
75%        1.673075e+06  2.016000e+03
max        3.256662e+06  3.006000e+03


release_gid: 2959368 unique values
release_name: 1864625 unique values
artist_credit_id: 771228 unique values
barcode: 1179505 unique values
artist_credit_name: 731222 unique values
date_year: 172 unique values


In [22]:
# Medium

# Field names from MB website
medium_names = ['medium_id', 'release_id']
medium = pd.read_csv('../data/mbdump/mbdump/medium', delimiter='\t', names=medium_names, usecols=[0, 1]) 

In [23]:
summarize_df(medium)

3539822 rows by 2 columns

medium_id     int64
release_id    int64
dtype: object



Unnamed: 0,medium_id,release_id
0,288902,288902
1,600623,600623
2,600626,600626
3,600627,600627
4,7716,7716



          medium_id    release_id
count  3.539822e+06  3.539822e+06
mean   1.864473e+06  1.739269e+06
std    1.068518e+06  9.662437e+05
min    1.000000e+00  1.000000e+00
25%    9.371882e+05  9.257842e+05
50%    1.875542e+06  1.759694e+06
75%    2.791744e+06  2.568767e+06
max    3.700246e+06  3.395452e+06


medium_id: 3539822 unique values
release_id: 3159201 unique values


In [24]:
rel_credit_medium = rel_credit_year_filter.merge(medium, on='release_id', how='left')

In [25]:
summarize_df(rel_credit_medium)

3312578 rows by 8 columns

release_id              int64
release_gid            object
release_name           object
artist_credit_id        int64
barcode                object
artist_credit_name     object
date_year               int64
medium_id             float64
dtype: object



Unnamed: 0,release_id,release_gid,release_name,artist_credit_id,barcode,artist_credit_name,date_year,medium_id
0,1,02232360-337e-4a3f-ad20-6cdd4c34288c,Little Earthquakes,60,075678235825,Tori Amos,1992,1.0
1,2,290e10c5-7efc-4f60-ba2c-0dfc0208fbf5,Under the Pink,60,075678256721,Tori Amos,1994,2.0
2,3,4c31aef1-177e-4bba-9a56-180e21a0d043,Boys for Pele,60,\N,Tori Amos,1997,3.0
3,5,8f468f36-8c7e-4fc1-9166-50664d267127,Dummy,65,042282855329,Portishead,1994,5.0
4,6,9cbf7040-dbdc-403c-940f-7562d9712514,To Venus and Back,60,075678324222,Tori Amos,1999,6.0



         release_id  artist_credit_id     date_year     medium_id
count  3.312578e+06      3.312578e+06  3.312578e+06  3.282980e+06
mean   1.759020e+06      9.184492e+05  2.005826e+03  1.885176e+06
std    9.731627e+05      1.003783e+06  1.327776e+01  1.076825e+06
min    1.000000e+00      1.000000e+00  1.000000e+00  1.000000e+00
25%    9.460270e+05      3.859500e+04  2.000000e+03  9.577688e+05
50%    1.804590e+06      4.849755e+05  2.009000e+03  1.924062e+06
75%    2.589202e+06      1.601912e+06  2.015000e+03  2.816324e+06
max    3.395452e+06      3.256662e+06  3.006000e+03  3.700246e+06


release_id: 2959368 unique values
release_gid: 2959368 unique values
release_name: 1864625 unique values
artist_credit_id: 771228 unique values
barcode: 1179505 unique values
artist_credit_name: 731222 unique values
date_year: 172 unique values
medium_id: 3282981 unique values


In [26]:
# Track

# Field names from MB website
track_names = ['track_id', 'track_gid', 'recording_id', 'medium_id', 'track_name']
track = pd.read_csv('../data/mbdump/mbdump/track', delimiter='\t', names=track_names,
                   usecols=[0, 1, 2, 3, 6])

In [27]:
summarize_df(track)

36870587 rows by 5 columns

track_id         int64
track_gid       object
recording_id     int64
medium_id        int64
track_name      object
dtype: object



Unnamed: 0,track_id,track_gid,recording_id,medium_id,track_name
0,34228823,9b02977e-a03b-4a6b-a9a9-06e722bdcd7a,428644,3254461,The Ghost of Tom Joad
1,81,43da7544-6283-3159-84f9-537fe823a1a7,11,600623,Five Man Army
2,35831997,0b6b6283-a5a8-4560-9fa8-f68a430d86ea,25849634,3434937,Wonder Girl
3,99,fa124f9a-d8ea-36a3-bed3-c817fdbe13e2,11,600626,Five Man Army
4,108,e56c6d3c-09cf-33a0-81c5-ceade77c35dc,11,600627,Five Man Army



           track_id  recording_id     medium_id
count  3.687059e+07  3.687059e+07  3.687059e+07
mean   1.902471e+07  1.501142e+07  1.764107e+06
std    1.102906e+07  9.452138e+06  1.053328e+06
min    1.000000e+00  5.000000e+00  1.000000e+00
25%    9.425090e+06  6.786869e+06  8.477570e+05
50%    1.903365e+07  1.484385e+07  1.722711e+06
75%    2.859518e+07  2.313009e+07  2.657356e+06
max    3.808780e+07  3.184561e+07  3.700246e+06


track_id: 36870587 unique values
track_gid: 36870587 unique values
recording_id: 26556854 unique values
medium_id: 3567385 unique values
track_name: 14126401 unique values


In [28]:
# Recording

# Field names from MB website
recording_names = ['recording_id', 'recording_gid', 'comment']
recording = pd.read_csv('../data/mbdump/mbdump/recording', delimiter='\t', names=recording_names,
                   usecols=[0, 1, 5])

In [29]:
summarize_df(recording)

26641864 rows by 3 columns

recording_id      int64
recording_gid    object
comment          object
dtype: object



Unnamed: 0,recording_id,recording_gid,comment
0,20937085,0f42ab32-22cd-4dcf-927b-a8d9a183d68b,
1,20937086,4dce8f93-45ee-4573-8558-8cd321256233,
2,20937087,48fabe3f-0fbd-4145-a917-83d164d6386f,
3,11,b30b9943-9100-4d84-9ad2-69859ea88fbb,
4,20937088,b55f1db3-c6d2-4645-b908-03e1017a99c2,



       recording_id
count  2.664186e+07
mean   1.693315e+07
std    9.000384e+06
min    5.000000e+00
25%    9.408009e+06
50%    1.729830e+07
75%    2.475202e+07
max    3.184561e+07


recording_id: 26641864 unique values
recording_gid: 26641864 unique values
comment: 89511 unique values


In [30]:
# ISRC

# Field names from MB website
isrc_names = ['recording_id', 'isrc']
isrc = pd.read_csv('../data/mbdump/mbdump/isrc', delimiter='\t', names=isrc_names, usecols=[1, 2])

In [31]:
summarize_df(isrc)

2236883 rows by 2 columns

recording_id     int64
isrc            object
dtype: object



Unnamed: 0,recording_id,isrc
0,10,GBAAA9000038
1,11,GBAAA9100082
2,14,GBAAA9100044
3,15,GBAAA9100081
4,16,GBAAA9100069



       recording_id
count  2.236883e+06
mean   1.667817e+07
std    9.791918e+06
min    1.000000e+01
25%    8.154790e+06
50%    1.706842e+07
75%    2.575915e+07
max    3.184535e+07


recording_id: 2170274 unique values
isrc: 2173524 unique values


In [32]:
recording_isrc = recording.merge(isrc, on='recording_id')

In [33]:
summarize_df(recording_isrc)

2234677 rows by 4 columns

recording_id      int64
recording_gid    object
comment          object
isrc             object
dtype: object



Unnamed: 0,recording_id,recording_gid,comment,isrc
0,20937085,0f42ab32-22cd-4dcf-927b-a8d9a183d68b,,SE3OH1730478
1,11,b30b9943-9100-4d84-9ad2-69859ea88fbb,,GBAAA9100082
2,11,b30b9943-9100-4d84-9ad2-69859ea88fbb,,GBAAA1200648
3,17,c5355127-7a0c-428a-bd39-e5b3e83250f7,,GBAAA9100083
4,17,c5355127-7a0c-428a-bd39-e5b3e83250f7,,GBAAA1200651



       recording_id
count  2.234677e+06
mean   1.667452e+07
std    9.791008e+06
min    1.000000e+01
25%    8.153281e+06
50%    1.706320e+07
75%    2.575500e+07
max    3.184535e+07


recording_id: 2168122 unique values
recording_gid: 2168122 unique values
comment: 16745 unique values
isrc: 2171453 unique values


In [43]:
mb_db_songs = track.merge(recording_isrc, on='recording_id')
mb_db_songs = mb_db_songs.merge(rel_credit_medium, on='medium_id', how='left')

In [44]:
summarize_df(mb_db_songs)

8369534 rows by 15 columns

track_id                int64
track_gid              object
recording_id            int64
medium_id               int64
track_name             object
recording_gid          object
comment                object
isrc                   object
release_id            float64
release_gid            object
release_name           object
artist_credit_id      float64
barcode                object
artist_credit_name     object
date_year             float64
dtype: object



Unnamed: 0,track_id,track_gid,recording_id,medium_id,track_name,recording_gid,comment,isrc,release_id,release_gid,release_name,artist_credit_id,barcode,artist_credit_name,date_year
0,34228823,9b02977e-a03b-4a6b-a9a9-06e722bdcd7a,428644,3254461,The Ghost of Tom Joad,b5a99727-7943-4207-a125-ff04a49a345c,version 1,USSM19501784,2990206.0,2a05dd91-6dab-4ba6-857b-cd4f39f0c418,The Ghost of Tom Joad,813.0,5099748165046.0,Bruce Springsteen,1995.0
1,34229049,46f7bfe6-face-48fd-9197-dd83c09da310,428644,3254474,The Ghost of Tom Joad,b5a99727-7943-4207-a125-ff04a49a345c,version 1,USSM19501784,2990216.0,0fead4a4-fbf9-447e-b62f-3ba099df4154,The Ghost of Tom Joad,813.0,5464237628.0,Bruce Springsteen,1995.0
2,34229209,71755d1b-02b2-410e-9171-c55dc8c20ab5,428644,3254483,The Ghost of Tom Joad,b5a99727-7943-4207-a125-ff04a49a345c,version 1,USSM19501784,2990225.0,de0af5e8-51e2-4a5c-aa3a-ee62a8ddbc19,The Ghost of Tom Joad,813.0,,Bruce Springsteen,1995.0
3,34229302,c5b1b539-407c-4cb6-b33d-896f60363809,428644,3254490,The Ghost of Tom Joad,b5a99727-7943-4207-a125-ff04a49a345c,version 1,USSM19501784,2990228.0,52ddc0cb-9556-42fa-a074-799b0905803f,The Ghost of Tom Joad,813.0,74646748442.0,Bruce Springsteen,1995.0
4,34229329,c36f13f7-7b6c-4fe3-b835-5fadc169a91d,428644,3254494,The Ghost of Tom Joad,b5a99727-7943-4207-a125-ff04a49a345c,version 1,USSM19501784,2990232.0,e89a816a-a377-47b7-bc36-b2eaaaf4049b,The Ghost of Tom Joad,813.0,,Bruce Springsteen,1995.0



           track_id  recording_id     medium_id    release_id  \
count  8.369534e+06  8.369534e+06  8.369534e+06  7.859258e+06   
mean   1.912210e+07  1.029665e+07  1.765118e+06  1.636912e+06   
std    1.129683e+07  9.865615e+06  1.084838e+06  9.898241e+05   
min    1.800000e+01  1.000000e+01  1.000000e+00  1.000000e+00   
25%    1.005981e+07  8.367530e+05  7.830400e+05  7.310040e+05   
50%    1.918438e+07  7.493556e+06  1.737875e+06  1.633712e+06   
75%    2.925082e+07  1.779992e+07  2.727043e+06  2.504288e+06   
max    3.808768e+07  3.184535e+07  3.700239e+06  3.395447e+06   

       artist_credit_id     date_year  
count      7.859258e+06  7.859258e+06  
mean       4.708365e+05  2.006253e+03  
std        8.086966e+05  1.105813e+01  
min        1.000000e+00  1.902000e+03  
25%        3.030000e+02  2.000000e+03  
50%        2.172400e+04  2.008000e+03  
75%        6.276610e+05  2.015000e+03  
max        3.256603e+06  2.222000e+03  


track_id: 6849501 unique values
track_gid: 6849501 

In [45]:
mb_db_songs.to_parquet('mb_db_songs.parquet')
mb_db_songs[mb_db_songs['date_year'] >= 2019].to_parquet('mb_db_songs_2019.parquet')

In [37]:
# mb_db_song_sample = mb_db_songs[mb_db_songs['date_year'] >= 2019].sample(frac=.10)

In [38]:
# mb_db_song_sample.to_parquet('mb_db_songs_sample.parquet')

In [39]:
# summarize_df(mb_db_song_sample)

In [40]:
# Write out isrcs
# mb_db_isrcs = mb_db_song_sample['isrc'].unique()
# np.savetxt('mb_db_isrcs.csv', mb_db_isrcs, delimiter=',', fmt="%s")