In [1]:
#
# user settings: which file to read and which file to write
#
output_csv_file = 'tracks.csv'    # destination passed to DataFrame.to_csv
input_xml_file = 'test.xml.gz'    # gzip-compressed XML file that gets parsed

In [59]:
#
# load useful libraries
#
import json
import xmltodict
import pandas as pd
import numpy as np
import gzip
import re
#from neo4j import GraphDatabase

In [7]:
#
# parse the gzip-compressed XML file into nested dictionaries
#
with gzip.open(input_xml_file, 'rb') as f:
    # xmltodict accepts a file-like object, so hand it the open file directly
    # instead of first reading the whole decompressed document into memory.
    data_dict = xmltodict.parse(f)

In [10]:
#
# iterate through the tracks
#
def _track_attribute(track, attribute):
    """Return the whitespace-stripped value of ``attribute``, or '' when it is absent."""
    return (track.get(attribute) or '').strip()


def _parse_energy(comments):
    """
    Return the energy level tagged in ``comments`` as np.int64, or None.

    Only a number that follows the word "energy" (case-insensitive) counts,
    e.g. '8A - Energy 7' -> 7. A comment without that tag yields None rather
    than having its first word interpreted as a number. When several tagged
    numbers are present, the last one is used.
    """
    levels = re.findall(r'energy\s+(\d+)', comments.lower())
    return np.int64(levels[-1]) if levels else None


def _parse_track(track):
    """
    Convert one parsed <TRACK> element into a flat record.

    Empty or absent values are stored as None so that they can be removed
    later with dropna().
    """
    # a tonality is kept only when it starts with a digit; anything else is missing
    tonality = _track_attribute(track, '@Tonality')
    if not tonality[:1].isdigit():
        tonality = None

    # an empty or non-numeric BPM becomes a missing value instead of an exception
    try:
        bpm_average = np.float64(_track_attribute(track, '@AverageBpm'))
    except ValueError:
        bpm_average = None

    return {
        'name' : _track_attribute(track, '@Name') or None,
        'artist' : _track_attribute(track, '@Artist') or None,
        'album' : _track_attribute(track, '@Album') or None,
        'year' : _track_attribute(track, '@Year') or None,
        'genre' : _track_attribute(track, '@Genre').lower() or None,
        'tonality' : tonality,
        'bpm_average' : bpm_average,
        'energy' : _parse_energy(_track_attribute(track, '@Comments')),
    }


tracks = data_dict['DJ_PLAYLISTS']['COLLECTION']['TRACK']
if isinstance(tracks, dict):
    # xmltodict yields a single dict (not a list) when there is exactly one <TRACK>
    tracks = [tracks]

track_item_list = [_parse_track(track) for track in tracks]


In [13]:
#
# assemble data frame: drop incomplete rows, renumber, add ids, cast integer columns
#
df = (
    pd.DataFrame(track_item_list)
    .dropna()
    .reset_index(drop = True)
)
df['song_id'] = df.index

for column in ('year', 'energy'):
    df[column] = [int(value) for value in df[column]]



In [16]:
# preview the first rows of the assembled track table
df.head()

Unnamed: 0,name,artist,album,year,genre,tonality,bpm_average,energy,song_id
0,"Sine From Above (Chester Lockhart, Mood Killer...",Lady Gaga,Dawn Of Chromatica,2021,edm,5A,174.92,7,0
1,Let the Bass Kick In Miami Bitch,Chuckie & LMFAO,100% Clubland EDM Bangers,0,edm,8A,128.11,7,1
2,Sorry For Party Rocking,LMFAO,Clubland 21 (CD1),2012,dance,6A,134.0,7,2
3,On My Way to Hell,Połoz & Tinnitus,"Cyberpunk 2077: Radio, Vol. 2 (Original Soundt...",2020,soundtracks,9A,137.03,5,3
4,Kooler Than Jesus (Electric Messiah Mix),My Life With The Thrill Kill Kult,Confessions Of A Knife,1990,electronica,11A,124.67,7,4


In [23]:
# one row per distinct artist; the positional index becomes the artist_id column
# (artist strings were already stripped of leading/trailing whitespace upstream)
df_artist = (
    pd.DataFrame({'artist_name' : df['artist'].unique()})
    .reset_index()
    .rename(columns = {'index' : 'artist_id'})
)

In [25]:
# display the artist lookup table (artist_id, artist_name)
df_artist

Unnamed: 0,artist_id,artist_name
0,0,Lady Gaga
1,1,Chuckie & LMFAO
2,2,LMFAO
3,3,Połoz & Tinnitus
4,4,My Life With The Thrill Kill Kult
...,...,...
1822,1822,KXD-LvL
1823,1823,Alex Farell
1824,1824,Divina Blasfemia
1825,1825,Lethargy


In [33]:
# Sanity check: no artist name should be blank or missing.
# This comes up empty by construction: the track loop stores '' as None and the
# track table is built with dropna(), so such rows never reach df_artist.
# Both conditions are tested in one expression because a cell only displays its
# last expression; a separate first filter would be evaluated and discarded.
df_artist[(df_artist['artist_name'] == '') | df_artist['artist_name'].isna()]

Unnamed: 0,artist_id,artist_name


In [27]:
# one row per distinct album name, in order of first appearance
df_album = pd.DataFrame(df['album'].unique(), columns = ['album_name'])
df_album

Unnamed: 0,album_name
0,Dawn Of Chromatica
1,100% Clubland EDM Bangers
2,Clubland 21 (CD1)
3,"Cyberpunk 2077: Radio, Vol. 2 (Original Soundt..."
4,Confessions Of A Knife
...,...
1134,The Eternal
1135,Meliora (Deluxe Edition)
1136,Alchemical: Volume 1 [Explicit]
1137,Mary On A Cross (slowed + reverb)


In [120]:
def extract_number(s):
    """Return the first run of decimal digits in ``s`` as an int, or 0 if there is none."""
    digits = re.search(r'\d+', s)
    return int(digits.group()) if digits else 0

# distinct tonalities ordered by their number first, then alphabetically;
# the positional index becomes the tonality_id column
df_tonality = (
    pd.DataFrame(
        {
            'tonality' : sorted(
                df['tonality'].unique(),
                key = lambda code: (extract_number(code), code),
            )
        }
    )
    .reset_index()
    .rename(columns = {'index' : 'tonality_id'})
)

# split each code into its trailing letter and its leading number
df_tonality = df_tonality.assign(
    scale_indicator = [code[-1] for code in df_tonality['tonality']],
    tonic_indicator = [extract_number(code) for code in df_tonality['tonality']],
)

# a trailing 'B' is labelled major; every other letter is labelled minor
df_tonality['mood'] = [
    'minor' if letter != 'B' else 'major'
    for letter in df_tonality['scale_indicator']
]

df_tonality

Unnamed: 0,tonality_id,tonality,scale_indicator,tonic_indicator,mood
0,0,1A,A,1,minor
1,1,1B,B,1,major
2,2,2A,A,2,minor
3,3,2B,B,2,major
4,4,3A,A,3,minor
5,5,3B,B,3,major
6,6,4A,A,4,minor
7,7,4B,B,4,major
8,8,5A,A,5,minor
9,9,5B,B,5,major


In [126]:
from badass_music.theory.western.CircleOfFifths import CircleOfFifths
from badass_music.DJ.CamelotWheel import CamelotWheel

# twelve pitch-class names and their numeric indices 0..11
chromatic_scale_pitch_classes = ['C', 'Db', 'D', 'Eb', 'E', 'F', 'F#', 'G', 'Ab', 'A', 'Bb', 'B']
scale_enumeration_indices = np.arange(12, dtype = np.uint8)

# build the circle of fifths from that scale, then the Camelot wheel on top of it
cf = CircleOfFifths(
    chromatic_scale_numeric = scale_enumeration_indices,
    chromatic_scale_pitch_classes = chromatic_scale_pitch_classes,
)
cw = CamelotWheel(cf)

# flatten the wheel into one record per entry;
# each entry is read as [0] -> key and [1] -> tonality
temp = [
    {'key' : entry[0], 'tonality' : entry[1]}
    for group in cw.camelot_combined_minor_and_major
    for entry in group
]

# left join: every row of df_tonality is kept and gains a 'key' column
df_tonality2 = df_tonality.merge(
    pd.DataFrame(temp),
    how = 'left',
    on = 'tonality',
)

df_tonality2

Unnamed: 0,tonality_id,tonality,scale_indicator,tonic_indicator,mood,key
0,0,1A,A,1,minor,Ab
1,1,1B,B,1,major,B
2,2,2A,A,2,minor,Eb
3,3,2B,B,2,major,F#
4,4,3A,A,3,minor,Bb
5,5,3B,B,3,major,Db
6,6,4A,A,4,minor,F
7,7,4B,B,4,major,Ab
8,8,5A,A,5,minor,C
9,9,5B,B,5,major,Eb


In [None]:
#
# write the track table to the configured CSV file, leaving out the row index
#
df.to_csv(path_or_buf = output_csv_file, index = False)