In [70]:
import pandas as pd
from data_cleaning_helpers import *

In [71]:
file_path = 'electronic_df.pkl'
electronic_df = pd.read_pickle(file_path)
df = electronic_df.copy()

In [72]:
# this dataframe has the metadata features removed
metadata_feat_names = [feat_name for feat_name in df.keys() if feat_name.startswith('metadata')]
df1 = df.drop(columns=metadata_feat_names)
# dropping duplicate mbid
df2 = df1.drop_duplicates(subset=['mbdata.id'])
# removing any potential audiobooks
df3 = remove_phrase(df2, 'audiobook')

In [None]:
# checking the number of unique mbids as a sanity check
len(set(electronic_df['mbdata.id'].to_list()))

In [73]:
# creating new columns for artist name and id
df3['mbdata.artist-name'] = df3['mbdata.artist-credit'].apply(lambda x: x[0]['artist']['name'])
df3['mbdata.artist-id'] = df3['mbdata.artist-credit'].apply(lambda x: x[0]['artist']['id'])
# normalizing capitalization on the titles
df3['mbdata.title'] = df3['mbdata.title'].apply(lambda x: x.lower())

In [74]:
# dropping duplicates by title + artist-id
df4 = df3.drop_duplicates(subset=['mbdata.title', 'mbdata.artist-id'], keep='first')
# spreading out tags into a single set
df4['mbdata.all-tags'] = df4['mbdata.tags'].apply(lambda x: genre_extractor(x))
# replacing '&' with 'and' in mbdata.all-tags, mainly to account for genre 'drum & bass'
df4['mbdata.all-tags'] = df4['mbdata.all-tags'].apply(lambda x: setstring_replace(x, '&', 'and'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['mbdata.all-tags'] = df4['mbdata.tags'].apply(lambda x: genre_extractor(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['mbdata.all-tags'] = df4['mbdata.all-tags'].apply(lambda x: setstring_replace(x, '&', 'and'))


In [75]:
# Creating the final genre feature.  Based on the original tags,
# this attempts to label which genre(s) the recording falls into
# among house, drum and bass, techno, and trance

df4['mbdata.genre'] = df4['mbdata.all-tags'].apply(genre_labeler)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['mbdata.genre'] = df4['mbdata.all-tags'].apply(genre_labeler)


In [76]:
df4['mbdata.genre'].value_counts()

mbdata.genre
{house}                                   8720
{drum and bass}                           7112
{techno}                                  7072
{trance}                                  6608
{house, trance}                           2304
{house, techno}                           2213
{house, drum and bass}                     885
{techno, trance}                           726
{house, techno, trance}                    411
{drum and bass, techno}                    378
{house, drum and bass, trance}             188
{drum and bass, trance}                    163
{house, techno, drum and bass}             132
{house, techno, drum and bass, trance}      67
{drum and bass, techno, trance}             35
Name: count, dtype: int64

In [77]:
# finding the columns with null entries
null_columns = df4.columns[df4.isnull().any()]

In [78]:
df4[null_columns].info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 37014 entries, 0 to 9411
Data columns (total 1848 columns):
 #     Column                                          Non-Null Count  Dtype  
---    ------                                          --------------  -----  
 0     tonal.key_strength                              36953 non-null  float64
 1     tonal.key_scale                                 36953 non-null  object 
 2     tonal.key_key                                   36953 non-null  object 
 3     rhythm.bpm_histogram_second_peak_weight.var     36953 non-null  float64
 4     rhythm.bpm_histogram_second_peak_weight.min     36953 non-null  float64
 5     rhythm.bpm_histogram_second_peak_weight.median  36953 non-null  float64
 6     rhythm.bpm_histogram_second_peak_weight.mean    36953 non-null  float64
 7     rhythm.bpm_histogram_second_peak_weight.max     36953 non-null  float64
 8     rhythm.bpm_histogram_second_peak_weight.dvar2   36953 non-null  float64
 9     rhythm.bpm_histogra

In [79]:
# dropping rows with nan values in 'tonal.key_strength'
# upon inspection, this is only 61 rows or 0.17% of the total data

df5 = df4.dropna(subset=['tonal.key_strength'])

In [80]:
# this results in empty columns, so we will also drop those
df6 = df5.dropna(axis=1, how='all')

In [81]:
# Removing uncontentious features manually
remove_cols = ['mbdata.video', 'mbdata.isrcs', 'mbdata.disambiguation', 'mbdata.score',
               'mbdata.length', # this feature has a few missing values
               'mbdata.first-release-date',
               'mbdata.tags' # all info here is now stored in mbdata.all-tags
               ]
df7 = df6.drop(columns=remove_cols)

In [82]:
# these are the non-numeric features.  To remove duplicates columns via transpose + drop_duplicates,
# we need to temporarily drop these since they are nonhashable

object_cols = ['tonal.key_scale', 'tonal.key_key', 'tonal.chords_scale', 'tonal.chords_key',
                'mbdata.id', 'mbdata.title', 'mbdata.artist-credit', 'mbdata.releases',
                'mbdata.artist-name', 'mbdata.artist-id',
                'mbdata.all-tags', 'mbdata.genre',
                ]

# removing duplicate columns (occurs for cov and icov features)
df8 = df7.drop(columns=object_cols).T.drop_duplicates().T
# removing features which have all the same value for all data points (e.g. silence_rate20dB.min)
df8 = df8.loc[:, df8.nunique() > 1]

In [83]:
# restoring the object columns
df8 = pd.concat([df8, df7[object_cols]], axis=1)

In [84]:
# removing data with null entries.  At this point, that's just one recording.
df8 = df8.dropna(axis=0)

In [86]:
# removing songs which have all four tags, as these are likely inaccurate
df9 = df8[df8['mbdata.genre'] != {'house','techno','trance', 'drum and bass'}]

In [92]:
df9.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36829 entries, 0 to 9411
Columns: 2310 entries, tonal.tuning_nontempered_energy_ratio to mbdata.dnb
dtypes: bool(4), float64(2294), object(12)
memory usage: 648.4+ MB


In [87]:
# one-hot encoding the four genres
df9['mbdata.techno'] = df9['mbdata.genre'].apply(lambda x: 'techno' in x)
df9['mbdata.house'] = df9['mbdata.genre'].apply(lambda x: 'house' in x)
df9['mbdata.trance'] = df9['mbdata.genre'].apply(lambda x: 'trance' in x)
df9['mbdata.dnb'] = df9['mbdata.genre'].apply(lambda x: 'drum and bass' in x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df9['mbdata.techno'] = df9['mbdata.genre'].apply(lambda x: 'techno' in x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df9['mbdata.house'] = df9['mbdata.genre'].apply(lambda x: 'house' in x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df9['mbdata.trance'] = df9['mbdata.genre'].apply(lambda x: 

In [88]:
# resetting the index 
df10 = df9.reset_index()

In [89]:
# lastly, we remove other features which are non-numeric or not relevant at this point
drop_cols = ['tonal.key_scale', 'tonal.key_key', 'tonal.chords_scale', 'tonal.chords_key',
             # the above features are non-numeric
             'mbdata.artist-credit', # use mbdata.artist-name and mbdata.artist-id instead
             'mbdata.releases', # use mbdata.title instead
             'index' # leftover index
             ]

cleaned_electronic_df = df10.drop(columns=drop_cols)

In [90]:
cleaned_electronic_df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36829 entries, 0 to 36828
Data columns (total 2304 columns):
 #     Column                                           Non-Null Count  Dtype  
---    ------                                           --------------  -----  
 0     tonal.tuning_nontempered_energy_ratio            36829 non-null  float64
 1     tonal.tuning_frequency                           36829 non-null  float64
 2     tonal.tuning_equal_tempered_deviation            36829 non-null  float64
 3     tonal.tuning_diatonic_strength                   36829 non-null  float64
 4     tonal.thpcp.35                                   36829 non-null  float64
 5     tonal.thpcp.34                                   36829 non-null  float64
 6     tonal.thpcp.33                                   36829 non-null  float64
 7     tonal.thpcp.32                                   36829 non-null  float64
 8     tonal.thpcp.31                                   36829 non-null  float64
 9     to

In [93]:
cleaned_electronic_df.to_pickle('cleaned_electronic_df.pkl')

In [94]:
cleaned_electronic_df.to_csv('cleaned_electronic_df.csv')