In [49]:
import sys, os, re, gzip

from pydub.utils import mediainfo

In [2]:
data_path = '../data'
mp3s_path = '%s/mp3s' % data_path
song_data_file = '%s/mp3com_html_analysis_output.txt' % data_path

#create this cleaned metadata file
metadata_path = '%s/metadata.txt.gz' % data_path

In [3]:
#
# Parse song file - extract and clean select fields
#

In [24]:
def parse_metadata_file(file_path, out_path):
    #Headers and record example:
    #File Name	Artist Name	Song Name	Genre	Comment	CD	Label	Credits	Download Link	Mp3 Filename
    #.\artist_song\0\249.html	 Beefchow	Keep on Dancin (original)	Club	Dance (Pretty upbeat and catchy)	Tekknotrancemissions		DJ Beefchow	http://play.mp3.com/cgi-bin/play/play.cgi/AAIAQvkAAADABG5vcm1QGAAAAFL5AAAAUQEAAABDwienPtGPy4rQmW9N6pQj_gKeqQs-/Keep_on_Dancin_origina.mp3	Keep_on_Dancin_origina.mp3
    lines = []
    ttl=0
    field_cts = {'genre':0, 'artist_name':0, 'album_name':0, 'song_name':0, 'comment':0, 'mp3_filename':0}

    with open(file_path, encoding='ISO-8859-1') as f:
        headers = f.readline().replace('\n','')
        
        #clean up headers
        headers = headers.replace('CD', 'album_name')
        headers = [h.lower().replace(' ','_') for h in headers.split('\t')]
        
        #select only the headers we're interested in
        col_idx = [headers.index(field) for field in field_cts.keys()]
        
        headers = [headers[i] for i in col_idx]
        
        lines.append('ID\t%s' % '\t'.join(headers))
        print(headers)

        for i,line in enumerate(f):
            items = line.replace('\n','').split('\t')

            if not len(items)==10:
                continue
                
            #no mp3 name
            if not items[-1]:
                continue

            ttl+=1

            items = [items[j].strip() for j in col_idx]

            #'mp3_filename', 'genre', 'artist_name', 'album_name', 'song_name', 'comment'
            if items[0]:
                field_cts['mp3_filename']+=1
            if items[1]:
                field_cts['genre']+=1
            if items[2]:
                field_cts['artist_name']+=1
            if items[3]:
                field_cts['album_name']+=1
            if items[4]:
                field_cts['song_name']+=1
            if items[5]:
                field_cts['comment']+=1

            lines.append('%d\t%s' % (i, '\t'.join(items)))
            
            if i and i%50000==0:
                print(i, field_cts, '\n', items)
               
    print(ttl, field_cts, [(k,v/ttl) for k,v in field_cts.items()])
    
    with gzip.open(out_path, 'wt', encoding='utf-8') as oz:
        for i, line in enumerate(lines):
            oz.write('%s\n' % (line))

In [25]:
parse_metadata_file(song_data_file, metadata_path)

['genre', 'artist_name', 'album_name', 'song_name', 'comment', 'mp3_filename']
50000 {'genre': 43205, 'artist_name': 25840, 'album_name': 43204, 'song_name': 31307, 'comment': 43205, 'mp3_filename': 43205} 
 ['Experimental/Post Rock', 'WarpBoy', '', 'Techether', 'Please read the lyrics and then relisten with headphones to get the full experience. And yes the background music sounds crappy. The quality on the music was intentionnaly degraded with a low pass filter and copied to tape and back to digital for artistic purposes.', 'Techether.mp3']
100000 {'genre': 85523, 'artist_name': 50085, 'album_name': 85523, 'song_name': 62086, 'comment': 85524, 'mp3_filename': 85524} 
 ['Experimental', 'DJ  Tokkin1', '', 'trippin out the beat', 'another song made after tokin a mighty tighty whitey', 'trippin_out_the_beat.mp3']
150000 {'genre': 126950, 'artist_name': 74983, 'album_name': 126955, 'song_name': 91458, 'comment': 126956, 'mp3_filename': 126956} 
 ['Satire', 'Mitch &amp; Max Productions', '

In [None]:
#
# Try matching unique file names, extract metadata from unmatched files
#

In [31]:
def get_file_names(metadata_path):
    mp3s_metadata = {}
    field_idx = {}
    file_names = set()
    dup_file_names = set()
    
    with gzip.open(metadata_path, 'rt', encoding='utf-8') as fz:
        for i, line in enumerate(fz):
            line=line.replace('\n','')
            
            if i==0:
                headers = line.split('\t')
                field_idx = {h:j for j,h in enumerate(headers)}
                continue
                
            items = line.split('\t')
            if not len(items)==len(field_idx):
                print(i, items)
                continue
                
            mp3_metadata = {}
            for h,j in field_idx.items():
                mp3_metadata[h] = items[j]
                
            file_name = mp3_metadata['mp3_filename']
            if file_name in file_names:
                dup_file_names.add(file_name)
            else:
                file_names.add(file_name)
                
            mp3s_metadata[mp3_metadata['ID']] = mp3_metadata
            
    mp3s_metadata_dup_names = {k:v for k,v in mp3s_metadata.items() if v['mp3_filename'] in dup_file_names}
    mp3s_metadata_uniq_names = {k:v for k,v in mp3s_metadata.items() if v['mp3_filename'] not in dup_file_names}
    
    return mp3s_metadata_dup_names, mp3s_metadata_uniq_names

In [32]:
mp3s_metadata_dup_names, mp3s_metadata_uniq_names = get_file_names(metadata_path)
len(uniq_names), len(dup_names)

(397801, 40334)

In [41]:
def find_ambiguous_files(mp3s_path, mp3s_metadata_uniq_names, mp3s_metadata_dup_names):
    uniq_names = set([v['mp3_filename'] for v in mp3s_metadata_uniq_names.values()])
    dup_names = set([v['mp3_filename'] for v in mp3s_metadata_dup_names.values()])

    found, dups, unknown = set(), set(), set()

    for fn in os.listdir(mp3s_path):
        if not fn.endswith('.mp3'):
            continue
        if fn in uniq_names:
            found.add(fn)
        elif fn in dup_names:
            dups.add(fn)
        else:
            unknown.add(fn)

    print('Found: %d, Known Dup: %d, Unknown: %d' % (len(found), len(dups), len(unknown)))
    
    return found, dups, unknown

In [42]:
found, dups, unknown = find_ambiguous_files(mp3s_path, mp3s_metadata_uniq_names, mp3s_metadata_dup_names)

8 7 1


In [53]:
def try_disambiguate(dups, unknown, mp3s_path, mp3s_metadata_dup_names):
    metadata_mp3_name_to_idx = {}
    dup_mp3_and_artist = set()
    i=0
    for fn in os.listdir(mp3s_path):
        if not fn.endswith('.mp3'):
            continue
        if fn not in dups and fn not in unknown:
            continue
            
        i+=1
        if i%1000==0:
            print(i, len(metadata_mp3_name_to_idx))
            
        fp = '%s/%s' % (mp3s_path, fn)
        
        file_name = fn
        if fn in unknown and re.match('.+ \([0-9]+\)\.mp3', fn):
            file_name = re.sub(' \([0-9]+\)\.mp3', '.mp3', fn)
        
        metadata = mediainfo(fp).get('TAG', None)
        #metadata = {'encoder': 'LAME3.92 ', 'title': 'Believe', 'artist': 'DREAMTRONIX', 
        #            'comment': 'http://www.mp3.com/DREAMTRONIX','genre': 'Blues'}
        
        artist = metadata['artist'].lower() if 'artist' in metadata else ''
        #genre often differs between the metadata and song list
        #genre = metadata['genre'].lower() if 'genre' in metadata else ''
        
        for idx, rec in mp3s_metadata_dup_names.items():
            if rec['mp3_filename']==file_name:
                if artist == rec['artist_name'].lower():
                    if fn in metadata_mp3_name_to_idx:
                        dup_mp3_and_artist.add((fn, file_name, artist))
                    else:
                        metadata_mp3_name_to_idx[fn] = idx
                    
    print(len(metadata_mp3_name_to_idx), len(dup_mp3_and_artist))
    
    return metadata_mp3_name_to_idx, dup_mp3_and_artist

In [54]:
metadata_mp3_name_to_idx, dup_mp3_and_artist = try_disambiguate(dups, unknown, mp3s_path, mp3s_metadata_dup_names)

8 1


In [55]:
metadata_mp3_name_to_idx, dup_mp3_and_artist

({'Beautiful_Sky.mp3': '383823',
  'Beethoven_Sonata_No_8_.mp3': '4568',
  'BEINGS_FROM_A_STRANGE_.mp3': '548756',
  'Believe (4).mp3': '518192',
  'Best_Friend.mp3': '609387',
  'Better_Life.mp3': '233945',
  'Be_Glorified.mp3': '538610',
  'Be_With_Me.mp3': '457550'},
 {('BEINGS_FROM_A_STRANGE_.mp3',
   'BEINGS_FROM_A_STRANGE_.mp3',
   'the serenity project')})