In [29]:
import xml.etree.ElementTree as ET
import json
import gzip
from tqdm import tqdm
import hashlib

def hash_track(artist, title):
    return hashlib.md5(f"{artist.lower()} - {title.lower()}".encode()).hexdigest()

def match_discogs_id(xml_gz_file, json_data):

    track_dict = {}
    for show in json_data:
        for track in json_data[show]['tracks']:
            track_hash = hash_track(track['artist'], track['title'])
            track_dict[track_hash] = track


    with gzip.open(xml_gz_file, 'rt') as f:
        for event, elem in tqdm(ET.iterparse(f, events=('end',)), desc="Matching Discogs IDs"):
            if elem.tag == 'release':
                xml_artist = elem.find('artists/artist/name').text
                discogs_id = elem.get('id')

                # Get the list of track titles in the XML file
                xml_track_titles = [track.find('title').text for track in elem.findall('tracklist/track')]
                
                # Iterate over the xml_track_titles and check if the hash is in track_dict
                for title in xml_track_titles: 
                    if xml_artist is None or title is None:
                        continue
                    track_hash = hash_track(xml_artist, title)
                    if track_hash in track_dict:
                        track_dict[track_hash]['discogs_id'] = discogs_id

                elem.clear()  # free up memory

    # return updated json_data
    return track_dict

# Load json data
with open('../data/nts_29_7_23_tracklists.json', 'r') as f:
    data = json.load(f)

# Update json data with discogs_id
track_dict = match_discogs_id('/Users/barneyh/Downloads/discogs_20230601_releases.xml.gz', data)

Matching Discogs IDs: 2729598337it [33:26, 1360226.13it/s]


In [33]:
for show in data.values():
    for track in show['tracks']:
        track_id = track['uid']
        if track_id in track_dict and 'discogs_id' in track_dict[track_id]:
            track['discogs_id'] = track_dict[track_id]['discogs_id']


with open('../data/nts_29_7_23_w_discogs_tracklists.json', 'w') as outfile:
    json.dump(data, outfile)

In [42]:
total=0
for track in track_dict.values():
    if track.get('discogs_id') != None and track['linked']:
        total+=1

print(total)

32805


In [12]:
get_release_count('/Users/barneyh/Downloads/discogs_20230601_releases.xml.gz')

5240500it [00:03, 1341695.83it/s]


KeyboardInterrupt: 

In [8]:
data

{'emotional-16th-july-2023': {'url': '/shows/emotional/episodes/emotional-16th-july-2023',
  'name': 'All Trades w/ Chuggy',
  'loc': 'London',
  'date': '16.07.23',
  'tracks': [{'uid': '439aa1df54c747e7c16038f8844b8fe0',
    'artist': 'Fredde Viadukt',
    'title': 'Marvel Of Miracles',
    'linked': False},
   {'uid': 'cbb1b70abc83b534d46fc3dae3c2ba5c',
    'artist': 'Kosmoksen Errako',
    'title': 'Jolkottelevat',
    'linked': False},
   {'uid': '55788bfa9fcf9b2f47f4f47885f23eb3',
    'artist': 'Nyati Mayi & The Astral Synth Transmitters',
    'title': 'Lolokele',
    'linked': False},
   {'uid': '967395b7ef8f0989dfb30a445bb2399e',
    'artist': 'Big Daddy',
    'title': "Cupid's Itch ",
    'linked': False},
   {'uid': '5700ec1977fef581671b5c8ffa677aa6',
    'artist': "Jade, Lil' Rachett, VAZ",
    'title': "Keep On Risin' (No Rap Version)",
    'linked': False},
   {'uid': '26408a41e0a3fd869cc6aed18c298b63',
    'artist': 'By Chance',
    'title': 'Revenge',
    'linked': False