## TODO
- create main tracklists file
   - url
   - episode
   - host
   - location
   - date
   - tracklist
- create tracks file


In [30]:
import json
import tqdm
from collections import Counter
import hashlib

tracklists = {}
track_counter = Counter()


def process(tracklist):
    """Convert raw track entries into simplified ``uid``/``artist``/``title`` dicts.

    Each entry of *tracklist* must provide ``mainArtists``, ``featuringArtists``
    and ``remixArtists`` (lists of dicts with a ``name``) plus a ``title``.

    The ``uid`` is the MD5 hex digest of the lowercased
    ``"<main artists> - <title>"`` string; featuring and remix artists only
    appear in the display ``artist`` string and do not influence the uid.

    Side effect: every processed track increments ``track_counter[uid]``.

    Returns the list of simplified track dicts, in input order.
    """

    def names_of(entry, role):
        # Comma-separated, whitespace-trimmed artist names for one role.
        return ', '.join(person['name'].strip() for person in entry[role])

    result = []
    for entry in tracklist:
        main = names_of(entry, 'mainArtists')
        featuring = names_of(entry, 'featuringArtists')
        remixers = names_of(entry, 'remixArtists')

        # Display string: main artists, then optional feat./remix suffixes.
        pieces = [main]
        if featuring:
            pieces.append(f" feat. {featuring}")
        if remixers:
            pieces.append(f" (Remix by {remixers})")
        display_artist = ''.join(pieces)

        # Identity key is case-insensitive and based on main artists + title only.
        identity = main.lower().strip() + " - " + entry['title'].lower().strip()
        uid = hashlib.md5(identity.encode()).hexdigest()

        simplified = {
            "uid": uid,
            "artist": display_artist.strip(),
            "title": entry["title"].strip(),
        }

        # Count occurrences across all processed tracklists.
        track_counter[uid] += 1
        result.append(simplified)

    return result


# Read the scraped episodes (one JSON document per line) and store every
# episode that has at least one track in `tracklists`, keyed by episode alias.
# NOTE: process() increments track_counter track by track, so an episode that
# is rejected partway through still contributes to the counts.
with open('../data/tracklists_json.txt') as links_file:
    for line in tqdm.tqdm(links_file):
        try:
            episode = json.loads(line)["episode"]
            processed_tracklist = process(episode["tracklist"])
            if not processed_tracklist:
                continue
            tracklists[episode["episode_alias"]] = {
                "url": episode["path"],
                "name": episode["name"],
                "loc": episode["location_long"],
                "date": episode["broadcast_formatted"],
                "tracks": processed_tracklist,
            }
        except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
            # Best-effort import: skip lines that are not valid JSON or that
            # lack the expected fields, but let KeyboardInterrupt and other
            # unrelated errors propagate instead of hiding them.
            continue

# Force the count of placeholder uids to zero so they are never flagged as
# linked, no matter how often they occur.
track_counter[None] = 0
for placeholder_uid in (
    "c43ed6d45f71901bc427779cfb9a0e04",  # This is the "unknown" track
    "1e5c266b1b0067ca5318232558877d95",  # This is the "unknown" track
    "36015401c28957cd3e0b2f7b81c14ef7",  # This is the "unknown" track
    "3d082f8f1a32b953f8280734faee68ef",  # This is the "unknown" track
    "8d01989131b1a2b1b0cbef7fe6806b83",  # This is the "unknown" track
    "f2862a0f3882db4e3d47f38ed6b1559f",  # (no reason recorded)
    "54be74ea7f96362d909e899c72d1cf45",  # (no reason recorded)
):
    track_counter[placeholder_uid] = 0


# Now go through the processed tracklists and add the "linked" flag:
# a track is linked when its uid was counted more than once.
for tracklist in tracklists.values():
    for track in tracklist["tracks"]:
        track["linked"] = (
            track["uid"] is not None and track_counter[track["uid"]] > 1
        )

# Persist the processed episodes (including the "linked" flags) as one JSON object.
serialized_tracklists = json.dumps(tracklists)
with open('../data/nts_29_7_23_tracklists.json', 'w') as outfile:
    outfile.write(serialized_tracklists)

28918it [00:02, 10169.90it/s]


In [31]:
# Spot check: print every stored track whose display artist is exactly
# "Portishead", to eyeball that identical titles share the same uid.
i = 0  # not used by this cell
for tracklist in tracklists.values():
    for track in tracklist["tracks"]:
        if track["artist"] != "Portishead":
            continue
        print(track["uid"], track["artist"], track["title"])


d3048c7d07f6b93a29564a54bd5d4d38 Portishead The Rip
d3048c7d07f6b93a29564a54bd5d4d38 Portishead The Rip
672afd3aeaadb39bb260e1842de7d3db Portishead Only You
f947c13055641c8cc8599ef98f96d330 Portishead It Could Be Sweet
61dbb000c19cf2871957fd4717083491 Portishead Machine Gun (Chopped & Screwed)
eaf4bac8ef0e572aa2b37b4a394fdfe8 Portishead Deep Water
47e8ea6309d3cce5b7d611800de56426 Portishead Roads
f947c13055641c8cc8599ef98f96d330 Portishead It Could Be Sweet
44851cdbff49c7380e729ab6507cee08 Portishead Sour Times
f947c13055641c8cc8599ef98f96d330 Portishead It Could Be Sweet
ac439000878bd30ca3f8f7a33db1566b Portishead Silence
47e8ea6309d3cce5b7d611800de56426 Portishead Roads
0f97097dbd7c2c68f0d6a84a79e43d6e Portishead Glory Box
47e8ea6309d3cce5b7d611800de56426 Portishead Roads
672afd3aeaadb39bb260e1842de7d3db Portishead Only You
0f97097dbd7c2c68f0d6a84a79e43d6e Portishead Glory Box
a46f5d21dc34f2c232975aa5f33ae851 Portishead Machine Gun
f947c13055641c8cc8599ef98f96d330 Portishead It Could

## create tracks file

In [30]:
# Build a uid -> track lookup over all shows. When a uid occurs several
# times, the last occurrence wins.
unique_tracks_dict = {}

for show in tracklists.values():
    for track in show['tracks']:
        # Store a copy without the redundant 'uid' field. Deleting the key
        # from `track` itself would also strip it from `tracklists` (same
        # dict object), which breaks re-running this cell and any later
        # code that reads track['uid'].
        unique_tracks_dict[track['uid']] = {
            field: value for field, value in track.items() if field != 'uid'
        }

# if you want to convert it back to a list of track dictionaries
unique_tracks_list = list(unique_tracks_dict.values())

In [31]:
# Persist the uid -> track lookup as one JSON object.
serialized_tracks = json.dumps(unique_tracks_dict)
with open('../data/tracks.json', 'w') as f:
    f.write(serialized_tracks)

In [32]:
unique_tracks_dict

{'1d2a1a45-1e0f-4366-b1c1-240259ca16bc': {'artist': 'Fredde Viadukt',
  'title': 'Marvel Of Miracles'},
 '032551a3-1296-40e6-a66c-dd15184e8f87': {'artist': 'Kosmoksen Errako',
  'title': 'Jolkottelevat'},
 '47f13dea-2b51-4668-b668-85535a765ba5': {'artist': 'Nyati Mayi & The Astral Synth Transmitters',
  'title': 'Lolokele'},
 'ff277537-6c3e-45f3-8cef-a89709e58044': {'artist': 'Big Daddy',
  'title': "Cupid's Itch "},
 '5f8e84eb-6552-4f1d-96ea-5fc5e9670196': {'artist': "Jade, Lil' Rachett, VAZ",
  'title': "Keep On Risin' (No Rap Version)"},
 'a90c2a93-0af7-4ed9-9205-dbac7748fecc': {'artist': 'By Chance',
  'title': 'Revenge'},
 'cc78f0d8-db19-4e14-82ff-f820c36db752': {'artist': 'Die Radierer',
  'title': 'Batman'},
 '6065bae1-c942-4d1a-b278-d99bcb0919c8': {'artist': 'Sagat',
  'title': 'Money Magic'},
 '6671453d-aa9f-406d-b0da-0238a56585fe': {'artist': 'Tapes',
  'title': 'Parrot Samba'},
 '7f31ce03-e9c0-4a94-b4bb-3d00556031ac': {'artist': 'Mike Brooks',
  'title': 'On The Ice (Version