In [1]:
import pandas as pd
from data_cleaning_helpers import *

In [22]:
file_path = 'electronic_df.pkl'
electronic_df = pd.read_pickle(file_path)
df = electronic_df.copy()

In [23]:
# this dataframe has the metadata features removed
metadata_feat_names = [feat_name for feat_name in df.keys() if feat_name.startswith('metadata')]
df1 = df.drop(columns=metadata_feat_names)
# dropping duplicate mbid
df2 = df1.drop_duplicates(subset=['mbdata.id'])
# removing any potential audiobooks
df3 = remove_phrase(df2, 'audiobook')

In [24]:
len(set(electronic_df['mbdata.id'].to_list()))

39749

In [25]:
# creating new columns for artist name and id
df3['mbdata.artist-name'] = df3['mbdata.artist-credit'].apply(lambda x: x[0]['artist']['name'])
df3['mbdata.artist-id'] = df3['mbdata.artist-credit'].apply(lambda x: x[0]['artist']['id'])
# normalizing capitalization
df3['mbdata.title'] = df3['mbdata.title'].apply(lambda x: x.lower())

In [26]:
# dropping duplicates by title + artist-id
df4 = df3.drop_duplicates(subset=['mbdata.title', 'mbdata.artist-id'], keep='first')
# spreading out tags into a single set
df4['mbdata.all-tags'] = df4['mbdata.tags'].apply(lambda x: genre_extractor(x))
# replacing '&' with 'and' in mbdata.all-tags, mainly to account for genre 'drum & bass'
df4['mbdata.all-tags'] = df4['mbdata.all-tags'].apply(lambda x: setstring_replace(x, '&', 'and'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['mbdata.all-tags'] = df4['mbdata.tags'].apply(lambda x: genre_extractor(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['mbdata.all-tags'] = df4['mbdata.all-tags'].apply(lambda x: setstring_replace(x, '&', 'and'))


In [27]:
# Creating the final genre feature.  Based on the original tags,
# this attempts to label which genre(s) the recording falls into
# among house, drum and bass, techno, and trance

df4['mbdata.genre'] = df4['mbdata.all-tags'].apply(genre_labeler)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['mbdata.genre'] = df4['mbdata.all-tags'].apply(genre_labeler)


In [28]:
df4.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 37037 entries, 0 to 40126
Data columns (total 632 columns):
 #    Column                                           Non-Null Count  Dtype  
---   ------                                           --------------  -----  
 0    lowlevel.average_loudness                        37037 non-null  float64
 1    lowlevel.barkbands.dmean                         37037 non-null  object 
 2    lowlevel.barkbands.dmean2                        37037 non-null  object 
 3    lowlevel.barkbands.dvar                          37037 non-null  object 
 4    lowlevel.barkbands.dvar2                         37037 non-null  object 
 5    lowlevel.barkbands.max                           37037 non-null  object 
 6    lowlevel.barkbands.mean                          37037 non-null  object 
 7    lowlevel.barkbands.median                        37037 non-null  object 
 8    lowlevel.barkbands.min                           37037 non-null  object 
 9    lowlevel.barkbands.v

In [30]:
df4['mbdata.genre'].value_counts()

mbdata.genre
{house}                                   8776
{drum and bass}                           7121
{techno}                                  7092
{trance}                                  6547
{house, trance}                           2259
{techno, house}                           2227
{drum and bass, house}                     890
{techno, trance}                           742
{techno, house, trance}                    400
{drum and bass, techno}                    379
{drum and bass, house, trance}             199
{drum and bass, trance}                    166
{drum and bass, techno, house}             134
{drum and bass, techno, house, trance}      70
{drum and bass, techno, trance}             35
Name: count, dtype: int64

In [11]:
df4['mbdata.id']

0    000d5835-c0b5-4379-9026-0299f861a1a9
0    00202f40-1765-4cef-bd6a-69cd20424f06
0    002453bb-78f0-44df-aaa6-2f909183f092
0    0024d72c-136f-49f2-9078-ce4b39b94d3f
0    00286661-c7fd-45ca-9628-32a3fbae146c
0    002b2228-05c0-4dc4-aff6-c13730b33b09
0    00300812-2770-44e3-bb63-28933fdd3ee6
0    003304eb-5bb9-498e-8185-c6b220e75692
0    0035f291-c8e3-45fc-8847-b57b76998e05
0    0036b402-dcbb-4fc7-8ca5-682599049ee9
0    00382140-1d20-47d8-975a-96d0366e76da
0    00382198-a64f-48e1-86ae-b712cb3b15a8
0    003c9192-b22b-4a2c-b49a-50daae61673b
0    004158ee-d016-48db-95dc-d55b9bbfb7ca
0    00421814-de53-49a3-8339-490cd69df292
0    00468335-01ae-49b0-972c-584ae2f9b3ca
0    00041ff8-c3dc-41d9-adbe-0ff3c7a810c7
0    00066fb5-ac5c-4742-b12b-8052d300424a
0    0017c178-7ee3-49c2-969d-d4f333af6d44
0    001c344c-0a77-493e-b330-188227ce125e
0    00298280-e555-49a0-94b2-4a26bd7355e5
0    0038c26e-6572-4985-8979-ad0da4afdf3f
0    0049e030-3051-463f-abab-e39e473d9f46
0    004d559a-1670-4f28-90bc-aa185

In [8]:
df4.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 60 entries, 0 to 0
Data columns (total 2666 columns):
 #     Column                                           Non-Null Count  Dtype  
---    ------                                           --------------  -----  
 0     tonal.tuning_nontempered_energy_ratio            60 non-null     float64
 1     tonal.tuning_frequency                           60 non-null     float64
 2     tonal.tuning_equal_tempered_deviation            60 non-null     float64
 3     tonal.tuning_diatonic_strength                   60 non-null     float64
 4     tonal.thpcp.35                                   60 non-null     float64
 5     tonal.thpcp.34                                   60 non-null     float64
 6     tonal.thpcp.33                                   60 non-null     float64
 7     tonal.thpcp.32                                   60 non-null     float64
 8     tonal.thpcp.31                                   60 non-null     float64
 9     tonal.thpcp.30

In [None]:
# Removing uncontentious features
remove_cols = ['mbdata.video', 'mbdata.isrcs', ]