In [1]:
import pandas as pd
import json
import os
import pickle
from dataset_build_helpers import *
from data_cleaning_helpers import *

In [9]:
## This creates separate Dataframes for each folder.
# NOTE: Do NOT run this unless you want to recreate the DataFrames.  
# Instead, just load the electronc_df.pkl file.  It's quicker.
%%script false --no-raise-error
# Setting folder paths
trance_folder_path = '../TestData/electronic/trance'
techno_folder_path = '../TestData/electronic/techno'
dnb_folder_path = '../TestData/electronic/drum_and_bass'
house_folder_path = '../TestData/electronic/house'

# obtaining list of DataFrames from each folder
trance_dfs = json_to_df_list(trance_folder_path)
techno_dfs = json_to_df_list(techno_folder_path)
dnb_dfs = json_to_df_list(dnb_folder_path)
house_dfs = json_to_df_list(house_folder_path)

UsageError: Line magic function `%%script` not found.


In [None]:
# Concatenating the DataFrames.  Again, you do not need to run this.
%%script false --no-raise-error

trance_df = pd.concat(trance_dfs, ignore_index=True)
techno_df = pd.concat(techno_dfs, ignore_index=True)
dnb_df = pd.concat(dnb_dfs, ignore_index=True)
house_df = pd.concat(house_dfs, ignore_index=True)
electronic_df = pd.concat([techno_df, dnb_df, house_df, trance_df], ignore_index=True)

electronic_df.to_pickle('electronic_df.pkl')

In [2]:
# electronic_df is a DataFrame containing all the original data from the electronic folder
# NOTE: This takes about ~30seconds to load running on 16GB of RAM

with open('electronic_df.pkl', 'rb') as file:
    electronic_df = pickle.load(file)

In [3]:
# Creating a copy of electronic_df to clean.
df = electronic_df.copy(deep=True)

In [4]:
# this dataframe has the metadata features removed
metadata_feat_names = [feat_name for feat_name in electronic_df.keys() if feat_name.startswith('metadata')]
df1 = df.drop(columns=metadata_feat_names)
# dropping duplicate mbid
df2 = df1.drop_duplicates(subset=['mbdata.id'])
# removing any potential audiobooks
df3 = remove_phrase(df2, 'audiobook')

In [5]:
# creating new columns for artist name and id
df3['mbdata.artist-name'] = df3['mbdata.artist-credit'].apply(lambda x: x[0]['artist']['name'])
df3['mbdata.artist-id'] = df3['mbdata.artist-credit'].apply(lambda x: x[0]['artist']['id'])
# normalizing capitalization
df3['mbdata.title'] = df3['mbdata.title'].apply(lambda x: x.lower())

In [16]:
# dropping duplicates by title + artist-id
df4 = df3.drop_duplicates(subset=['mbdata.title', 'mbdata.artist-id'], keep='first')
# spreading out tags into a single set
df4['mbdata.all-tags'] = df4['mbdata.tags'].apply(lambda x: genre_extractor(x))
# replacing '&' with 'and' in mbdata.all-tags, mainly to account for genre 'drum & bass'
df4['mbdata.all-tags'] = df4['mbdata.all-tags'].apply(lambda x: setstring_replace(x, '&', 'and'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['mbdata.all-tags'] = df4['mbdata.tags'].apply(lambda x: genre_extractor(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['mbdata.all-tags'] = df4['mbdata.all-tags'].apply(lambda x: setstring_replace(x, '&', 'and'))


In [8]:
# this gives you a set of all tags which appear
# this is for exploratory purposes
%%script false --no-raise-error

all_tags = set()
X = df4['mbdata.all-tags'].values

N = len(X)
all_tags = X[0]
for i in range(N):
    all_tags = all_tags.union(X[i])

#all_tags

UsageError: Line magic function `%%script` not found.


In [17]:
# Creating the final genre feature.  Based on the original tags,
# this attempts to label which genre(s) the recording falls into
# among house, drum and bass, techno, and trance

df4['mbdata.genre'] = df4['mbdata.all-tags'].apply(genre_labeler)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df4['mbdata.genre'] = df4['mbdata.all-tags'].apply(genre_labeler)


In [20]:
df4['mbdata.genre'].value_counts()

mbdata.genre
{house}                                   8776
{drum and bass}                           7121
{techno}                                  7092
{trance}                                  6547
{trance, house}                           2259
{house, techno}                           2227
{drum and bass, house}                     890
{trance, techno}                           742
{trance, house, techno}                    400
{drum and bass, techno}                    379
{drum and bass, trance, house}             199
{drum and bass, trance}                    166
{drum and bass, house, techno}             134
{drum and bass, trance, house, techno}      70
{drum and bass, trance, techno}             35
Name: count, dtype: int64