In [295]:
import os
import xmltodict
import pandas as pd
import glob
import numpy as np

In [2]:
def load_xml_dict(path):
    """
    Parse an XML file and return its contents as a nested dict.

    :param path: path to the XML file (read as UTF-8 text)
    :return: plain ``dict`` produced by ``xmltodict.parse`` with
             ``process_namespaces=True``
    """
    # The context manager closes the handle even if reading or parsing raises;
    # the previous version left the file open until garbage collection.
    with open(path, encoding="utf-8", mode="r") as xml_file:
        return xmltodict.parse(xml_file.read(), process_namespaces=True, dict_constructor=dict)
    

# Loading all XMLs

In [18]:
# Collect every entry directly inside the XML-Projects directory.
# NOTE(review): absolute, machine-specific path - adjust it for your environment.
# NOTE(review): glob does not promise a stable ordering, so song_paths[0] may
# refer to a different file on another machine; consider sorted(...) - confirm.
song_paths = glob.glob('/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/XML-Projects/*')

# CSV paths


In [19]:
# Collect every path exactly three levels below Mix-Packs; the output displayed
# below shows entries of the form <pack>/<category>/<loop name>.csv.
# NOTE(review): absolute, machine-specific path - adjust it for your environment.
csvs = glob.glob('/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/*/*/*')

In [20]:
csvs

['/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/RockinDck 3.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/HouseGlitch 4.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/HouseGlitch 5.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/RockinDck 2.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/HouseGlitch 7.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/HouseGlitch 6.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/RockinDck 1.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/HouseGlitch 2.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/

# Extracting data from the XMLs
Loading all the data into one DataFrame up front is one way to do this, but it is not computationally feasible.
Hence, I will write a class that parses a single XML file and extracts any of its fields on demand, thereby avoiding loading all the files into memory at once.


# Task 1. Parse the XML files and extract information

In [21]:
# Parse the first matched project file so its structure can be inspected.
data = load_xml_dict(song_paths[0])

In [22]:
data['project'].keys()

dict_keys(['@volume', '@bpm', '@cut_mode', 'used_chords', 'user_loops', 'parts'])

In [360]:
class extract():
    """
    On-demand accessor for the contents of one song-project XML file.

    The file is parsed once in ``__init__``; every other method derives its
    result from the parsed dict stored on the instance, so no method depends on
    notebook-level variables.
    """

    # Default root directory that cleaned loop names are relative to.
    # Machine-specific; pass ``mix_packs_dir`` to ``get_song_data`` to override.
    MIX_PACKS_DIR = '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/'

    def __init__(self, path):
        """
        :param path: path to the project XML file
        """
        self.path = path
        self.dict_xml = self.load_xml_dict()
        # File name without directory and without anything after the first dot.
        self.filename = self.path.split('/')[-1].split('.')[0]

    def load_xml_dict(self):
        """
        Returns the XML at ``self.path`` as a dict
        :return dict object
        """
        # ``with`` guarantees the handle is closed even if parsing raises.
        with open(self.path, encoding="utf-8", mode="r") as xml_file:
            return xmltodict.parse(xml_file.read(), process_namespaces=True, dict_constructor=dict)

    @staticmethod
    def _as_list(node):
        """Normalise an xmltodict child node to a list.

        xmltodict yields a bare dict (not a one-element list) when an element
        occurs only once, and ``None`` for an empty element.
        """
        if node is None:
            return []
        return node if isinstance(node, list) else [node]

    def get_basic_features(self):
        """Returns the project-level features of one song - volume, bpm, cut_mode, used_chords and user_loops"""
        project = self.dict_xml['project']
        # Local name deliberately avoids shadowing the builtin ``dict``.
        return {'vol': project['@volume'],
                'bpm': project['@bpm'],
                'cut_mode': project['@cut_mode'],
                'used_chords': project['used_chords'],
                'user_loops': project['user_loops']}

    def all_parts(self):
        """Internal function: one DataFrame row per ``<part>`` of the project."""
        # Read from the instance, not from a notebook global named ``dict_xml``.
        return pd.DataFrame(self._as_list(self.dict_xml['project']['parts']['part']))

    def chord_sequence(self):
        """Returns the chord sequence of every part of one song. Use this for analysis of the type of music."""
        return self.all_parts()['chord_sequence']

    def part_name(self):
        """Returns the name of every part found under the XML root"""
        return self.all_parts()['@name']

    def length_in_beats(self):
        """Returns the length_in_beats of every part of a song - could be used in analysis"""
        return self.all_parts()['@length_in_beats']

    def pitch_sequence(self):
        """Returns the pitch of every part of a song - could be used in analysis"""
        return self.all_parts()['@pitch']

    def return_full_song_sequence(self):
        """
        Returns the complete channel sequence of the song, flattened over all parts,
        so callers no longer need to walk the nested XML themselves.

        :return DataFrame with columns ``loops``, ``volume`` and ``is_active``, one row
                per channel in part order (the sample data has 25 parts x 8 channels = 200 rows)
        """
        parts = self.all_parts()
        # A project without any part produces a frame without a 'channels' column.
        channel_groups = parts['channels'] if 'channels' in parts.columns else []

        rows = []
        for channels in channel_groups:
            if not isinstance(channels, dict):
                # Part without a <channels> element: nothing to add.
                continue
            for channel in self._as_list(channels['channel']):
                rows.append({'loops': channel.get('@loop'),
                             'volume': channel.get('@volume'),
                             'is_active': channel.get('@is_active')})
        return pd.DataFrame(rows, columns=['loops', 'volume', 'is_active'])

    def clean_loops(self):
        """
        Returns every loop name of the song with its first four '/'-separated
        segments removed (for "mmj://styles/id/<rest>" this leaves "<rest>").
        """
        return ['/'.join(loop.split('/')[4:])
                for loop in self.return_full_song_sequence()['loops']]

    def get_song_data(self, csvs, mix_packs_dir=None):
        """
        Returns the song data from csv files for one song

        For every loop of the song the matching CSV is looked up in ``csvs`` as
        ``<root><loop>.csv``, then ``<root><loop> 1.csv``; if neither is listed,
        ``<root><loop>  1.csv`` is read without a membership check.

        :param csvs: iterable of known CSV paths (e.g. the result of ``glob.glob``)
        :param mix_packs_dir: root directory the loop names are relative to;
                              defaults to ``MIX_PACKS_DIR``
        :return DataFrame with columns label, d1..d8 - one row per loop, taken from
                the first data row of the CSV starting at its third column
        :raises FileNotFoundError: (from pandas) if the chosen CSV does not exist
        """
        root = self.MIX_PACKS_DIR if mix_packs_dir is None else mix_packs_dir
        if not root.endswith('/'):
            root += '/'

        known_paths = set(csvs)   # O(1) membership instead of scanning a list
        features_by_path = {}     # loops repeat a lot; read each CSV only once
        rows = []
        for loop in self.clean_loops():
            for suffix in ('', ' 1', '  1'):
                candidate = root + loop + suffix + '.csv'
                if candidate in known_paths:
                    break
            # If no candidate matched, ``candidate`` is the last one tried.
            if candidate not in features_by_path:
                features_by_path[candidate] = np.array(pd.read_csv(candidate))[0][2:]
            rows.append(features_by_path[candidate])

        return pd.DataFrame(rows, columns=['label', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8'])
    


In [361]:
extractor = extract(song_paths[0])

In [314]:
extractor.get_basic_features()

{'vol': '50',
 'bpm': '120',
 'cut_mode': 'Beat',
 'used_chords': 'aa#CdeFG',
 'user_loops': None}

In [129]:
extractor.return_full_song_sequence()

Unnamed: 0,loops,volume,is_active
0,mmj://styles/id/ST_Electro_Pop/Drums/Circus_Set A,0,true
1,mmj://styles/id/ST_Electro_Pop/Bass/80s_Bass,0,true
2,mmj://styles/id/ST_Electro_Pop/Fx/Bender_Fx,0,true
3,mmj://styles/id/ST_Electro_Pop/Pad/Chill_Out,0,true
4,mmj://styles/id/ST_Electro_Pop/Sequence/ARP_Seq,0,true
...,...,...,...
195,mmj://styles/id/ST_Electro_Pop/Pad/Gated_Pad,80,true
196,mmj://styles/id/ST_Electro_Pop/Sequence/ARP_Seq,0,true
197,mmj://styles/id/ST_Electro_Pop/Strings/Big_Str...,0,true
198,mmj://styles/id/ST_Electro_Pop/Synth/Big_Chord,0,true


In [130]:
extractor.pitch_sequence()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
Name: @pitch, dtype: object

In [131]:
extractor.length_in_beats()

0     32
1     32
2     32
3     32
4     32
5     32
6     32
7     32
8     32
9     32
10    32
11    32
12    32
13    32
14    32
15    32
16    32
17    32
18    32
19    32
20    32
21    32
22    32
23    32
24    32
Name: @length_in_beats, dtype: object

In [132]:
extractor.part_name()

0                Intro
1            Intro (1)
2              Intro 2
3          Intro 2 (1)
4         Intermission
5     Intermission (1)
6                Hook 
7            Hook  (1)
8               Hook 2
9           Hook 2 (1)
10             Verse 1
11         Verse 1 (1)
12         Verse 1 (2)
13         Verse 1 (3)
14              Chorus
15          Chorus (1)
16          Chorus (2)
17          Chorus (3)
18          Chorus (4)
19            Chorus 2
20        Chorus 2 (1)
21        Chorus 2 (2)
22        Chorus 2 (3)
23        Chorus 2 (4)
24        Chorus 2 (5)
Name: @name, dtype: object

In [133]:
extractor.chord_sequence()

0     aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
1     aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
2     aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
3     aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
4     ddddddddaaaaaaaaFFFFFFFFCCCCCCCC
5     ddddddddaaaaaaaaFFFFFFFFGGGGGGGG
6     FFFFFFFFGGGGGGGGFFFFFFFFFFFFFFFF
7     FFFFFFFFGGGGGGGGFFFFFFFFFFFFFFFF
8     FFFFFFFFGGGGGGGGFFFFFFFFCCCCCCCC
9     FFFFFFFFGGGGGGGGFFFFFFFFFFFFGGGG
10    aaaaaaaaaaaaaaaaFFFFFFFFFFFFFFFF
11    aaaaaaaaaaaaaaaaFFFFFFFFFFFFFFFF
12    aaaaaaaaaaaaaaaaFFFFFFFFFFFFFFFF
13    aaaaaaaaaaaaaaaaFFFFFFFFFFFFFFFF
14    aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
15    aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
16    aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
17    aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
18    aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
19    aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
20    aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
21    aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
22    aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
23    aaaaaaaaFFFFFFFFCCCCCCCCGGGGeeee
24    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
Name: chord_sequence, dty

# Removing the “mmj://styles/id/” prefix from all the loops

In [270]:
# Build an extractor for the second matched project file.
extractor = extract(song_paths[1])

# Loop names of that song as returned by clean_loops(); displayed below.
d = extractor.clean_loops()
d

['ST_Electro_Pop/Drums/Circus_Set A',
 'ST_Electro_Pop/Bass/80s_Bass',
 'ST_Electro_Pop/Fx/Bender_Fx',
 'ST_Electro_Pop/Pad/Chill_Out',
 'ST_Electro_Pop/Sequence/ARP_Seq',
 'ST_Electro_Pop/Strings/Big_Strings',
 'ST_Electro_Pop/Synth/Areal_Arp',
 'ST_Electro_Pop/Vocals/Listen Hook A',
 'ST_Electro_Pop/Drums/Funky_Elec D',
 'ST_Electro_Pop/Bass/80s_Bass',
 'ST_Electro_Pop/Fx/Bender_Fx',
 'ST_Electro_Pop/Pad/Chill_Out',
 'ST_Electro_Pop/Sequence/ARP_Seq',
 'ST_Electro_Pop/Strings/Big_Strings',
 'ST_Electro_Pop/Synth/Areal_Arp',
 'ST_Electro_Pop/Vocals/Listen Hook A',
 'ST_Electro_Pop/Drums/Funky_Elec A',
 'ST_Electro_Pop/Bass/TranceBass',
 'ST_Electro_Pop/Fx/Bender_Fx',
 'ST_Electro_Pop/Pad/Gated_Pad',
 'ST_Electro_Pop/Sequence/ARP_Seq',
 'ST_Electro_Pop/Strings/Big_Strings',
 'ST_Electro_Pop/Synth/Big_Chord',
 'ST_Electro_Pop/Vocals/Listen Hook A',
 'ST_Electro_Pop/Drums/Funky_Elec A',
 'ST_Electro_Pop/Bass/TranceBass',
 'ST_Electro_Pop/Fx/Bender_Fx',
 'ST_Electro_Pop/Pad/Gated_Pad',
 '

In [271]:
csvs

['/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/RockinDck 3.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/HouseGlitch 4.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/HouseGlitch 5.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/RockinDck 2.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/HouseGlitch 7.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/HouseGlitch 6.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/RockinDck 1.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/Glitches/HouseGlitch 2.csv',
 '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/ST_Electro_Lite/

# Task 2.  Use the data in the CSV files to replace the loop filename with the corresponding features (label + d1-d8)

In [158]:
# For each loop: read its filename, open the matching CSV and take the label + d1-d8 features.
# Let's add a function that retrieves this directly for each song file.


In [272]:
# Re-create the extractor for the second matched project file and fetch its
# loop names via clean_loops(), so this section does not depend on earlier cells.
extractor = extract(song_paths[1])
d = extractor.clean_loops()

In [325]:
# Root directory the cleaned loop names are relative to.
# NOTE(review): absolute, machine-specific path - change it for your machine.
path_str = '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/'
# s = path_str + d[1] + 'ss 1' + '.csv'   # earlier experiment, kept for reference
# Candidate CSV path for the first loop of the song: root + loop name + '.csv'.
s = path_str + d[0]  + '.csv'

In [344]:
np.array(pd.read_csv(s))[0][2:]

array([0.1, 0.2483508015937223, 0.3900664737343702, 0.5256792190928795,
       0.5829058375636835, 0.2413369488123649, 0.36666384433735655,
       0.5439687316249178, 0.6849865814458234], dtype=object)

### If True, the loop name was found; if False, it was not. The task asks us to add a whitespace if we don't find a match

In [343]:
s in csvs


True

In [328]:
# If the plain name is not found, retry with ' 1' (a space followed by 1)
# appended to the loop name, as mentioned in the task description.
s = path_str + d[1] + ' 1' + '.csv'   # path is machine-specific
s in csvs
# If this is still False, retry with two spaces before the 1.

True

In [188]:
# The final function to get the labelled data from the CSVs can now be written. Change the path string to match your machine.

In [345]:
def get_song_data(self, csvs, mix_packs_dir='/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/'):
    """
    Returns the song data from csv files for one song.

    For every loop name from ``self.clean_loops()`` the matching CSV is looked
    up in ``csvs`` as ``<root><loop>.csv``, then ``<root><loop> 1.csv``; if
    neither is listed, ``<root><loop>  1.csv`` is read without a membership check.

    :param self: object providing ``clean_loops()`` (an ``extract`` instance)
    :param csvs: iterable of known CSV paths (e.g. the result of ``glob.glob``)
    :param mix_packs_dir: root directory the loop names are relative to
                          (the default is machine-specific)
    :return DataFrame with columns label, d1..d8 - one row per loop, taken from
            the first data row of the CSV starting at its third column
    :raises FileNotFoundError: (from pandas) if the chosen CSV does not exist
    """
    root = mix_packs_dir if mix_packs_dir.endswith('/') else mix_packs_dir + '/'
    known_paths = set(csvs)   # O(1) membership instead of scanning a list
    features_by_path = {}     # loops repeat a lot; read each CSV only once
    rows = []
    # Iterate over ALL loop names; iterating ``d[0]`` would walk the
    # characters of the first loop name instead.
    for loop in self.clean_loops():
        for suffix in ('', ' 1', '  1'):
            candidate = root + loop + suffix + '.csv'
            if candidate in known_paths:
                break
        # If no candidate matched, ``candidate`` is the last one tried.
        if candidate not in features_by_path:
            # ``[2:]`` skips the two leading columns in every branch; the
            # fallback branches previously used ``[:2]`` and returned those
            # two columns instead of the features.
            features_by_path[candidate] = np.array(pd.read_csv(candidate))[0][2:]
        rows.append(features_by_path[candidate])

    return pd.DataFrame(rows, columns=['label', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8'])

In [None]:
extractor.get_song_data(csvs)   # for one song

In [None]:
# Repeat this for all songs and run any analysis you want.