In [1]:
import ast
import os
from pathlib import Path

import eyed3
import pandas as pd

from config import *

# only let eyed3 log messages of level ERROR through
eyed3.log.setLevel("ERROR")

# absolute path of the working directory at the moment this cell runs;
# later cells build their data paths from it.
# NOTE(review): a later cell calls os.chdir, so re-running this cell after
# that point would capture a different directory.
notebook_path = os.path.abspath("./")


In [2]:
def make_file_list(root_dir, extensions=('.mp3', '.wav')):
    '''Recursively collect the paths of audio files under a directory.

    Walks ``root_dir`` and every nested folder with ``os.walk`` and gathers
    each file whose name ends with one of ``extensions``.

    Args:
        root_dir: directory to search.
        extensions: filename suffixes to keep (case-sensitive match);
            defaults to ('.mp3', '.wav').

    Returns:
        List of paths (the containing folder joined with the file name) for
        every matching file found in any folder, in ``os.walk`` order. The
        list is empty when nothing matches or ``root_dir`` does not exist.
    '''
    suffixes = tuple(extensions)
    path_list = []
    for subdir, _dirs, files in os.walk(root_dir):
        # extend instead of reassigning: reassigning kept only the files of
        # the last folder visited and dropped everything found before it
        path_list.extend(
            os.path.join(subdir, file) for file in files if file.endswith(suffixes)
        )

    return path_list

def get_genre_id(genre_dict_list):
    '''Return the 'genre_id' value of the first genre dictionary in the list.'''
    primary_genre = genre_dict_list[0]
    return primary_genre['genre_id']

def id_to_int(val):
    '''Convert an ID value to an int using the built-in int().'''
    as_int = int(val)
    return as_int

In [17]:
# metadata folder, located relative to the directory captured in notebook_path
metadata = notebook_path + "/Katie/fma_metadata"
# NOTE(review): chdir changes the working directory for every later cell;
# the relative read_csv paths below rely on it
os.chdir(metadata)
# IPython shell escape: show the working directory after the chdir
!pwd

# both CSVs are read relative to the directory switched to above
genres_df = pd.read_csv('genres.csv')
tracks_df = pd.read_csv('raw_tracks.csv')

/Users/katie/code/capstone/Music_Genre_Classification/Katie/fma_metadata


In [18]:
# drop rows missing the 'genre' value; .copy() gives an independent frame so
# the column assignments below do not write into a slice of the original
tracks_df = tracks_df[tracks_df['track_genres'].notna()].copy()

# string conversion to list of genre dictionaries.
# ast.literal_eval only accepts Python literals, so text coming from the CSV
# cannot execute code the way eval() could. Values that are already parsed
# (cell re-run) are passed through unchanged.
tracks_df['track_genres'] = tracks_df['track_genres'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

# assign primary genre classification id (first entry of each genre list)
tracks_df['genre_ids'] = tracks_df['track_genres'].apply(get_genre_id)

# convert IDs to ints for comparisons
tracks_df['genre_ids'] = tracks_df['genre_ids'].apply(id_to_int)
tracks_df['track_id'] = tracks_df['track_id'].apply(id_to_int)

In [19]:
# lookup tables built from the genre table:
#   genre_lookup: genre ID -> title
#   id_lookup:    title    -> genre ID
genre_lookup = genres_df.set_index('genre_id')['title'].to_dict()
id_lookup = genres_df.set_index('title')['genre_id'].to_dict()

# lets every genre be collapsed into a handful of top-level genres
# key = current genre ID, value = its 'top_level' genre ID
parent_lookup = genres_df.set_index('genre_id')['top_level'].to_dict()

In [20]:
# every genre ID and title in the genre table
genre_ids = genres_df['genre_id'].tolist()
all_genres = genres_df['title'].tolist()

# a genre whose 'parent' is 0 has no parent, i.e. it is top-level
top_genres = genres_df.loc[genres_df['parent'] == 0, 'title'].tolist()

In [21]:
# start every top-level genre with an empty list of children
genre_children = {top_genre: [] for top_genre in top_genres}

# file each genre title under the title of its top-level genre
for genre_id in genre_ids:
    top_level_id = parent_lookup[genre_id]
    parent = genre_lookup[top_level_id]
    genre = genre_lookup[genre_id]
    genre_children[parent].append(genre)

# print(genre_children)

{'International': ['International', 'Latin America', 'French', 'Reggae - Dub', 'Afrobeat', 'Indian', 'African', 'Middle East', 'Polka', 'Balkan', 'Europe', 'Brazilian', 'Asia-Far East', 'N. Indian Traditional', 'South Indian Traditional', 'Bollywood', 'Pacific', 'Celtic', 'North African', 'Flamenco', 'Cumbia', 'Latin', 'Romany (Gypsy)', 'Reggae - Dancehall', 'Spanish', 'Klezmer', 'Salsa', 'Turkish', 'Tango', 'Fado'], 'Blues': ['Blues', 'Gospel'], 'Jazz': ['Jazz', 'Jazz: Vocal', 'Free-Jazz', 'Jazz: Out', 'Be-Bop', 'Big Band/Swing', 'Modern Jazz'], 'Classical': ['Classical', '20th Century Classical', 'Composed Music', 'Opera', 'Chamber Music', 'Choral Music', 'Symphony', 'Contemporary Classical'], 'Old-Time / Historic': ['Old-Time / Historic'], 'Country': ['Country', 'Bluegrass', 'Americana', 'Rockabilly', 'Western Swing', 'Country & Western'], 'Pop': ['Pop', 'Experimental Pop', 'Synth Pop'], 'Rock': ['Rock', 'Punk', 'Post-Rock', 'Lo-Fi', 'Metal', 'Krautrock', 'Loud-Rock', 'Noise-Rock', 

In [8]:
# one line per top-level genre, followed by the titles grouped under it
for genre in top_genres:
    child_titles = genre_children[genre]
    print(genre, ": ", child_titles, "\n")

International :  ['International', 'Latin America', 'French', 'Reggae - Dub', 'Afrobeat', 'Indian', 'African', 'Middle East', 'Polka', 'Balkan', 'Europe', 'Brazilian', 'Asia-Far East', 'N. Indian Traditional', 'South Indian Traditional', 'Bollywood', 'Pacific', 'Celtic', 'North African', 'Flamenco', 'Cumbia', 'Latin', 'Romany (Gypsy)', 'Reggae - Dancehall', 'Spanish', 'Klezmer', 'Salsa', 'Turkish', 'Tango', 'Fado'] 

Blues :  ['Blues', 'Gospel'] 

Jazz :  ['Jazz', 'Jazz: Vocal', 'Free-Jazz', 'Jazz: Out', 'Be-Bop', 'Big Band/Swing', 'Modern Jazz'] 

Classical :  ['Classical', '20th Century Classical', 'Composed Music', 'Opera', 'Chamber Music', 'Choral Music', 'Symphony', 'Contemporary Classical'] 

Old-Time / Historic :  ['Old-Time / Historic'] 

Country :  ['Country', 'Bluegrass', 'Americana', 'Rockabilly', 'Western Swing', 'Country & Western'] 

Pop :  ['Pop', 'Experimental Pop', 'Synth Pop'] 

Rock :  ['Rock', 'Punk', 'Post-Rock', 'Lo-Fi', 'Metal', 'Krautrock', 'Loud-Rock', 'Noise-R

In [24]:
# locate the sample songs whose genres will be counted
root_dir = f"{notebook_path}/genres"
print(root_dir)
song_paths = make_file_list(root_dir)
print(song_paths)

/Users/katie/code/capstone/Music_Genre_Classification\genres
[]


In [15]:
# number of sample songs under each genre, and under each top-level genre
genre_counts = {genre: 0 for genre in all_genres}
top_counts = {genre: 0 for genre in top_genres}

# files that could not be counted (unreadable by eyed3, untagged, or tagged
# with a genre that is not in the genre table)
skipped_songs = []

# start counting genres
for song_path in song_paths:
    # load audio file; eyed3.load returns None for files it cannot parse,
    # and .tag / .genre are None when the file carries no such metadata
    audiofile = eyed3.load(song_path)
    tag = audiofile.tag if audiofile is not None else None
    if tag is None or tag.genre is None:
        skipped_songs.append(song_path)
        continue

    # get genre and strip the leading "(id)" prefix if there is one
    song_genre = str(tag.genre)
    # will either be 0 (no ')' found) or the index just after "(id)"
    end = song_genre.find(')') + 1
    # a ')' at the very end belongs to the title itself, e.g. "Romany (Gypsy)"
    if end == len(song_genre):
        end = 0
    song_genre = song_genre[end:]

    # a title missing from the genre table cannot be counted or mapped
    if song_genre not in genre_counts:
        skipped_songs.append(song_path)
        continue

    # track number of songs for each genre
    genre_counts[song_genre] += 1
    # and for the top-level genre it belongs to
    top_genre = genre_lookup[parent_lookup[id_lookup[song_genre]]]
    top_counts[top_genre] += 1

if skipped_songs:
    print("skipped", len(skipped_songs), "files without a usable genre tag")
print(genre_counts)

{'Avant-Garde': 0, 'International': 0, 'Blues': 0, 'Jazz': 0, 'Classical': 0, 'Novelty': 0, 'Comedy': 0, 'Old-Time / Historic': 0, 'Country': 0, 'Pop': 0, 'Disco': 0, 'Rock': 0, 'Easy Listening': 0, 'Soul-RnB': 0, 'Electronic': 0, 'Sound Effects': 0, 'Folk': 0, 'Soundtrack': 0, 'Funk': 0, 'Spoken': 0, 'Hip-Hop': 0, 'Audio Collage': 0, 'Punk': 0, 'Post-Rock': 0, 'Lo-Fi': 0, 'Field Recordings': 0, 'Metal': 0, 'Noise': 0, 'Psych-Folk': 0, 'Krautrock': 0, 'Jazz: Vocal': 0, 'Experimental': 0, 'Electroacoustic': 0, 'Ambient Electronic': 0, 'Radio Art': 0, 'Loud-Rock': 0, 'Latin America': 0, 'Drone': 0, 'Free-Folk': 0, 'Noise-Rock': 0, 'Psych-Rock': 0, 'Bluegrass': 0, 'Electro-Punk': 0, 'Radio': 0, 'Indie-Rock': 0, 'Industrial': 0, 'No Wave': 0, 'Free-Jazz': 0, 'Experimental Pop': 0, 'French': 0, 'Reggae - Dub': 0, 'Afrobeat': 0, 'Nerdcore': 0, 'Garage': 0, 'Indian': 0, 'New Wave': 0, 'Post-Punk': 0, 'Sludge': 0, 'African': 0, 'Freak-Folk': 0, 'Jazz: Out': 0, 'Progressive': 0, 'Alternative Hi

In [14]:
# keep only the (genre, count) pairs that actually occur in the sample
non_zero_genres = [pair for pair in genre_counts.items() if int(pair[1]) > 0]

# total number of songs counted across those genres
total = sum(pair[1] for pair in non_zero_genres)
print(total)
# largest counts first
non_zero_genres = sorted(non_zero_genres, key=lambda pair: pair[1], reverse=True)
print(non_zero_genres)

# per top-level genre totals
for genre, count in top_counts.items():
    print(genre, ":", count)

0
[]
International : 0
Blues : 0
Jazz : 0
Classical : 0
Old-Time / Historic : 0
Country : 0
Pop : 0
Rock : 0
Easy Listening : 0
Soul-RnB : 0
Electronic : 0
Folk : 0
Spoken : 0
Hip-Hop : 0
Experimental : 0
Instrumental : 0
