In [2]:
import pandas as pd

import glob 
import mutagen.mp3
from mutagen.easyid3 import EasyID3
from mutagen.id3 import ID3

## Mutagen ID3 tags

[ID3](http://id3.org/id3v2.3.0) tags are a data container that standardizes the format the data is stored as.  This typically includes Title, Artist, Genre, etc.  ID3 tags can be difficult to see or understand at first glance but the Mutagen package has a convenient module called EasyID3 which outputs a more human readable format, eg "genre" instead of "TCON."

Note about EDA: After some initial exploration of the ID3 tags, I realized the genre tags were missing or incorrect.  This led to me finding that a lot of the songs that were categorized into a genre were not in any way related to that genre.  Because of this, most of the data cleaning was done in the folder on the computer because I had to listen to samples of as many songs as I could to ensure that they all were correct.  As a result I had to delete almost two-thirds of my songs and search the internet for more royalty-free music to download (and also make sure those were properly sorted).   

In [None]:
# # Changing all the genre tags to match their specific genres

# def id3_change(folder_path, ID3='TCON', text=None, file_type=".mp3"):
    
#     path = (folder_path + '/*' + file_type)
    
#     for track in glob.glob(path):
#         track = mutagen.File(track)
#         if track.tags == None:
#             add_details(track, text)
#         else:
#             track.tags.add(TCON(text=[text]))
#             track.save()

        
# #     return "All tags updated!"

In [None]:
# # Changing all the genre tags to match their specific genres

# def tag_change(folder_path, easyID3_tag='genre', text=None, file_type=".mp3"):
    
#     path = (folder_path + '/*' + file_type) 
    
#     for track in glob.glob(path):
#         track = EasyID3(track)
#         track[easyID3_tag] = text
#         audio.save()
        
#     return "All tags updated!"

In [None]:
def add_details(file_name, text, tag='genre'):
    """
    Adds metadata to an .mp3 file using mutagen's EasyID3.
    """

    tags = mutagen.mp3.EasyMP3(file_name)
    tags[tag] = text
    tags.save()

    tags = ID3(file_name)

    tags.save(file_name)

In [None]:
# Making sure all the labels for tracks are set

path1 = "D:/DnB/*.mp3"
for track in glob.glob(path1):
    add_details(track, 'DnB')
    
path2 = "D:/Dubstep/*.mp3"
for track in glob.glob(path2):
    add_details(track, 'Dubstep')
    
path3 = "D:/Garage/*.mp3"
for track in glob.glob(path3):
    add_details(track, 'Garage Rock')
    
path4 = "D:/House/*.mp3"
for track in glob.glob(path4):
    add_details(track, 'House')
    
path5 = "D:/Indie-Rock/*.mp3"
for track in glob.glob(path5):
    add_details(track, 'Indie Rock')
    
path6 = "D:/Techno/*.mp3"
for track in glob.glob(path6):
    add_details(track, 'Techno')

1389

In [140]:
tracks = [tracks for tracks in glob.glob("D:\Music\*\*.mp3")]     # Get list of all the tracks
song_list = [dict(EasyID3(tracks).items()) for tracks in tracks]  # Create a list of dictionaries for ID3 tags to create a data frame from

In [232]:
track_df = pd.DataFrame(song_list)
track_df = track_df[['title', 'genre']]  # Only need Title and Genre 

In [233]:
track_df.fillna('', inplace=True)                     # Title not necessary since I'll be adding file paths as well
track_df = track_df.astype(str)                       # All columns were lists of length 1 so converting to strings
track_df['title'] = track_df['title'].str.strip("[]") 
track_df['genre'] = track_df['genre'].str.strip("[]")
track_df.head()

Unnamed: 0,title,genre
0,'002 Donald Trump','DnB'
1,'003 Traffic Jam','DnB'
2,'004 Circles Squares','DnB'
3,'005 I Want To Lick The Moon','DnB'
4,'006 Lentil Breakdown','DnB'


In [234]:
track_df['file_path'] = tracks   # Adding column for file paths 
track_df.head()

Unnamed: 0,title,genre,file_path
0,'002 Donald Trump','DnB',D:\Music\DnB\002_Donald_Trump.mp3
1,'003 Traffic Jam','DnB',D:\Music\DnB\003_Traffic_Jam.mp3
2,'004 Circles Squares','DnB',D:\Music\DnB\004_Circles_Squares.mp3
3,'005 I Want To Lick The Moon','DnB',D:\Music\DnB\005_I_Want_To_Lick_The_Moon.mp3
4,'006 Lentil Breakdown','DnB',D:\Music\DnB\006_Lentil_Breakdown.mp3


In [236]:
track_df.isna().sum()  # Checking to make sure all nulls are dealt with

title        0
genre        0
file_path    0
dtype: int64

In [237]:
track_df.to_csv('track_df.csv', index=False)  # Only need to add MFCCs for each track

In [22]:
df = pd.read_csv('track_df.csv')
df.head()

Unnamed: 0,title,genre,file_path
0,'002 Donald Trump','DnB',D:\Music\DnB\002_Donald_Trump.mp3
1,'003 Traffic Jam','DnB',D:\Music\DnB\003_Traffic_Jam.mp3
2,'004 Circles Squares','DnB',D:\Music\DnB\004_Circles_Squares.mp3
3,'005 I Want To Lick The Moon','DnB',D:\Music\DnB\005_I_Want_To_Lick_The_Moon.mp3
4,'006 Lentil Breakdown','DnB',D:\Music\DnB\006_Lentil_Breakdown.mp3


In [23]:
df.isnull().sum()

title        64
genre         0
file_path     0
dtype: int64

In [15]:
df[df.title.isnull()]

Unnamed: 0,title,genre,file_path
251,,'Dubstep',D:\Music\Dubstep\bigctv+dubstepageofextinction...
293,,'Dubstep',D:\Music\Dubstep\djbassassin+edmman.mp3
299,,'Dubstep',D:\Music\Dubstep\dragonfruitcoughmedicine+simp...
307,,'Dubstep',D:\Music\Dubstep\flashpointe+hippiescurseii.mp3
334,,'Dubstep',D:\Music\Dubstep\lemmino+lemminofearfreedownlo...
336,,'Dubstep',D:\Music\Dubstep\lilluigi+ifnluvdubstep.mp3
364,,'Dubstep',D:\Music\Dubstep\psychometal+cry.mp3
380,,'Dubstep',D:\Music\Dubstep\roswellgrey+roswellgrey.mp3
429,,'Dubstep',D:\Music\Dubstep\sydneeb+sydneebxnoisestormsup...
434,,'Dubstep',D:\Music\Dubstep\thethirdproductions+allforces...


In [1]:
df.fillna(' ', inplace=True) # Again the title isn't as important since the file name is also there

NameError: name 'df' is not defined

In [25]:
df.isna().sum()

title        0
genre        0
file_path    0
dtype: int64

In [26]:
df.to_csv('track_df.csv', index=False)

In [27]:
test = pd.read_csv('track_df.csv')

In [30]:
test.isna().sum()  # Make sure null values were removed

title        0
genre        0
file_path    0
dtype: int64