# exploring genre and sub-genre data from the Free Music Archive

In [1]:
import ast
import os
from pathlib import Path

import eyed3
import pandas as pd

from config import *

# raise eyed3's log threshold to "ERROR" — presumably so lower-severity messages
# emitted while reading tags do not flood the cell output (confirm intent)
eyed3.log.setLevel("ERROR")

### notebook functions

In [2]:
def make_file_list(root_dir):
    '''walk root_dir recursively and return the full path of every file found

    paths are listed in os.walk order; folders themselves are not included, and
    a root_dir that does not exist simply yields an empty list'''
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(root_dir)
        for name in names
    ]

def get_genre_id(genre_dict_list):
    '''return the 'genre_id' value of the first genre dict in the list

    the first entry is the one the notebook treats as the primary genre'''
    primary_genre = genre_dict_list[0]
    return primary_genre['genre_id']

def id_to_int(val):
    '''convert an ID value (e.g. a numeric string) to an int using int()'''
    as_int = int(val)
    return as_int

### import data from targeted directory

In [3]:
# folder that holds the FMA metadata CSVs
# NOTE(review): z_path is not defined in this notebook — presumably supplied by
# `from config import *`; the "\\" separator is Windows-specific
metadata = z_path + "\\fma_metadata"
# switch the working directory so the CSVs below can be opened by bare file
# name; this also changes how every later relative path resolves
os.chdir(metadata)

# genre table and raw per-track table, read relative to the new working directory
genres_df = pd.read_csv('genres.csv')
tracks_df = pd.read_csv('raw_tracks.csv')

### make dataframes and adjustments

In [4]:
# drop rows missing the 'track_genres' value; .copy() makes the result an
# independent frame so the column assignments below do not trigger
# SettingWithCopyWarning
tracks_df = tracks_df[tracks_df['track_genres'].notna()].copy()

# string conversion to list of genre dictionaries
# ast.literal_eval only accepts Python literals, so text coming from the CSV can
# never execute code the way eval() could; values that are no longer strings are
# left alone so this cell can be re-run safely
tracks_df['track_genres'] = tracks_df['track_genres'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# primary genre classification id (first genre listed), as an int for comparisons
tracks_df['genre_ids'] = (
    tracks_df['track_genres'].apply(get_genre_id).apply(id_to_int)
)

# track IDs as ints for comparisons too
tracks_df['track_id'] = tracks_df['track_id'].apply(id_to_int)

### dictionaries for lookups

In [5]:
# genre ID -> genre title, and the reverse (title -> genre ID)
genre_lookup = genres_df.set_index('genre_id')['title'].to_dict()
id_lookup = genres_df.set_index('title')['genre_id'].to_dict()

# if we ever want everything under only a few genres
# key = genre ID, value = the 'top_level' genre ID recorded for that genre
parent_lookup = genres_df.set_index('genre_id')['top_level'].to_dict()

In [6]:
# every genre ID and every genre title, in genres_df row order
genre_ids = genres_df['genre_id'].tolist()
all_genres = genres_df['title'].tolist()

# a genre whose 'parent' is 0 has no parent, i.e. it is top-level
top_genres = genres_df.loc[genres_df['parent'] == 0, 'title'].tolist()

In [7]:
# start every top-level genre with an empty list of children
genre_children = {genre: [] for genre in top_genres}

# file each genre title under the title of its top-level genre
for genre_id in genre_ids:
    parent = genre_lookup[parent_lookup[genre_id]]
    genre = genre_lookup[genre_id]
    genre_children[parent].append(genre)

In [8]:
# one entry per top-level genre: its title followed by the genres grouped under it
for genre in top_genres:
    children = genre_children[genre]
    print(genre, ": ", children, "\n")

International :  ['International', 'Latin America', 'French', 'Reggae - Dub', 'Afrobeat', 'Indian', 'African', 'Middle East', 'Polka', 'Balkan', 'Europe', 'Brazilian', 'Asia-Far East', 'N. Indian Traditional', 'South Indian Traditional', 'Bollywood', 'Pacific', 'Celtic', 'North African', 'Flamenco', 'Cumbia', 'Latin', 'Romany (Gypsy)', 'Reggae - Dancehall', 'Spanish', 'Klezmer', 'Salsa', 'Turkish', 'Tango', 'Fado'] 

Blues :  ['Blues', 'Gospel'] 

Jazz :  ['Jazz', 'Jazz: Vocal', 'Free-Jazz', 'Jazz: Out', 'Be-Bop', 'Big Band/Swing', 'Modern Jazz'] 

Classical :  ['Classical', '20th Century Classical', 'Composed Music', 'Opera', 'Chamber Music', 'Choral Music', 'Symphony', 'Contemporary Classical'] 

Old-Time / Historic :  ['Old-Time / Historic'] 

Country :  ['Country', 'Bluegrass', 'Americana', 'Rockabilly', 'Western Swing', 'Country & Western'] 

Pop :  ['Pop', 'Experimental Pop', 'Synth Pop'] 

Rock :  ['Rock', 'Punk', 'Post-Rock', 'Lo-Fi', 'Metal', 'Krautrock', 'Loud-Rock', 'Noise-R

In [9]:
# get songs to count genres
# NOTE(review): z_path is not defined in this notebook — presumably supplied by
# `from config import *`; the "\\" separator is Windows-specific
root_dir = z_path + "\\genres"

# full path of every file found anywhere under root_dir
song_paths = make_file_list(root_dir)

In [10]:
def _strip_genre_id(genre_text):
    '''return genre_text without a leading "(<number>)" ID prefix, if it has one

    only a purely numeric prefix is removed, so titles that merely contain
    parentheses (e.g. "Romany (Gypsy)") come back unchanged'''
    if genre_text.startswith('('):
        prefix, closing, title = genre_text.partition(')')
        if closing and prefix[1:].isdigit():
            return title
    return genre_text


# dicts for number of songs under each genre / top-level genre from sample set
genre_counts = {genre: 0 for genre in all_genres}
top_counts = {genre: 0 for genre in top_genres}

# files that could not be counted, kept so they can be inspected afterwards
skipped_paths = []

# start counting genres
for song_path in song_paths:
    # load audio file; eyed3.load gives None for files it does not recognise,
    # and .tag / .genre are None when the file carries no tag or no genre
    audiofile = eyed3.load(song_path)
    tag = audiofile.tag if audiofile is not None else None
    if tag is None or tag.genre is None:
        skipped_paths.append(song_path)
        continue

    # the genre prints either as "(id)title" or as just "title"
    song_genre = _strip_genre_id(str(tag.genre))
    if song_genre not in genre_counts:
        # title is not one of the known genres, so it cannot be counted
        skipped_paths.append(song_path)
        continue

    # track number of songs for each genre
    genre_counts[song_genre] += 1
    # and for the top-level genre it belongs to
    top_genre = genre_lookup[parent_lookup[id_lookup[song_genre]]]
    top_counts[top_genre] += 1

# stay silent when every file was counted, so clean runs print nothing new
if skipped_paths:
    print(len(skipped_paths), "files skipped (unreadable, untagged, or unknown genre)")

In [12]:
# keep only the genres that actually occur in the sample
non_zero_genres = [
    (title, count) for title, count in genre_counts.items() if int(count) > 0
]

# total number of songs counted across those genres
total = sum(count for _, count in non_zero_genres)
print(total)

# most common genres first
non_zero_genres.sort(key=lambda pair: pair[1], reverse=True)
print(non_zero_genres)

# song count per top-level genre
for genre, count in top_counts.items():
    print(genre,":", count)

25000
[('Electronic', 3855), ('Rock', 2406), ('Hip-Hop', 1957), ('Punk', 1401), ('Folk', 1155), ('Soundtrack', 773), ('Pop', 653), ('Experimental', 647), ('Avant-Garde', 559), ('Indie-Rock', 556), ('Ambient Electronic', 539), ('Old-Time / Historic', 510), ('Classical', 448), ('Experimental Pop', 446), ('Psych-Rock', 414), ('Chiptune', 398), ('Noise', 379), ('Lo-Fi', 335), ('International', 328), ('Trip-Hop', 323), ('Jazz', 286), ('Metal', 279), ('Post-Rock', 266), ('Ambient', 249), ('Techno', 236), ('Garage', 234), ('Reggae - Dub', 208), ('Instrumental', 208), ('Post-Punk', 183), ('Chip Music', 182), ('Singer-Songwriter', 178), ('Glitch', 146), ('Field Recordings', 144), ('IDM', 137), ('Hardcore', 135), ('House', 133), ('Industrial', 130), ('Noise-Rock', 124), ('Psych-Folk', 113), ('Audio Collage', 104), ('New Wave', 101), ('Hip-Hop Beats', 90), ('Country', 89), ('Synth Pop', 87), ('Disco', 86), ('Latin America', 86), ('Balkan', 83), ('Sound Collage', 83), ('Electro-Punk', 75), ('Blues