In [None]:
import os
import xmltodict
import pandas as pd
import glob
import numpy as np

In [None]:
def load_xml_dict(path):
    """
    Parse an XML file into a plain nested ``dict``.

    :param path: path to the XML file (read as UTF-8 text)
    :return: the parsed document, as produced by ``xmltodict.parse`` with
             namespace processing enabled and ``dict`` as the container type
    :raises OSError: if the file cannot be opened (e.g. FileNotFoundError)
    """
    # Use a context manager so the handle is closed even if parsing raises.
    with open(path, encoding="utf-8", mode="r") as xml:
        return xmltodict.parse(xml.read(), process_namespaces=True, dict_constructor=dict)
    

# Loading all XML files

In [None]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sort them so
# that song_paths[0], song_paths[1], ... refer to the same songs on every run.
song_paths = sorted(glob.glob('/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/XML-Projects/*'))

# CSV paths


In [None]:
# Collect every path three directory levels below Mix-Packs (pattern */*/*).
# Later cells test candidate CSV paths for membership in this list.
csvs = glob.glob('/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/*/*/*')

In [None]:
csvs

# Extracting data from the XMLs
Loading all the data into a single DataFrame is one way to do this, but it is not computationally feasible.
Instead, I will write a class that handles one XML file at a time and extracts the available fields on demand, thereby avoiding loading all the files into memory at once.


# Task 1. Parse the XML files and extract information

In [None]:
# Parse the first project XML into a nested dict so its structure can be inspected.
data = load_xml_dict(song_paths[0])

In [None]:
data['project'].keys()

In [None]:
class extract():
    """Parse one song-project XML file and expose its contents on demand.

    The XML is parsed once in ``__init__`` and stored in ``self.dict_xml``;
    every accessor derives its result from that dict, so only one song is
    held in memory per instance.
    """

    # Default root of the Mix-Packs CSV tree. Override per call with
    # ``get_song_data(csvs, mix_pack_dir=...)`` when running on another machine.
    MIX_PACK_DIR = '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/'

    # Column names of the frame returned by ``get_song_data``.
    FEATURE_COLUMNS = ['label', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8']

    def __init__(self, path):
        """
        :param path: path to the song-project XML file
        """
        self.path = path
        self.dict_xml = self.load_xml_dict()
        # File name without directory and without anything after the first dot.
        self.filename = self.path.split('/')[-1].split('.')[0]

    def load_xml_dict(self):
        """
        Returns the XML at ``self.path`` as a dict
        :return dict object
        """
        # Context manager guarantees the handle is closed, even if parsing fails.
        with open(self.path, encoding="utf-8", mode="r") as xml:
            return xmltodict.parse(xml.read(), process_namespaces=True, dict_constructor=dict)

    @staticmethod
    def _as_list(node):
        """Normalise an xmltodict child node to a list.

        xmltodict yields a single dict (not a one-element list) when an element
        has exactly one child of a given tag, and ``None`` for an empty element.
        """
        if node is None:
            return []
        return node if isinstance(node, list) else [node]

    def get_basic_features(self):
        """Returns features of one song - volume, bpm, cut_mode, used_chords and user_loops"""
        project = self.dict_xml['project']
        return {'vol': project['@volume'],
                'bpm': project['@bpm'],
                'cut_mode': project['@cut_mode'],
                'used_chords': project['used_chords'],
                'user_loops': project['user_loops']}

    def all_parts(self):
        """Internal function: one DataFrame row per <part> of the project."""
        # Read from the instance's parsed XML (not a notebook-level global).
        return pd.DataFrame(self._as_list(self.dict_xml['project']['parts']['part']))

    def chord_sequence(self):
        """Returns the chord sequence of one song (one entry per part)."""
        return self.all_parts()['chord_sequence']

    def part_name(self):
        """Returns the name of every part in the XML."""
        return self.all_parts()['@name']

    def length_in_beats(self):
        """Returns the length_in_beats of every part of the song."""
        return self.all_parts()['@length_in_beats']

    def pitch_sequence(self):
        """Returns the pitch of every part of the song."""
        return self.all_parts()['@pitch']

    def return_full_song_sequence(self):
        """Returns the complete, flattened channel sequence of the song.

        One row per channel of every part, in document order, with columns
        ``loops``, ``volume`` and ``is_active``. (The original author expected
        25 parts x 8 channels = 200 rows for the sample data.)
        """
        loops = []
        volume = []
        is_active = []

        for _, part in self.all_parts().iterrows():
            # Build the per-part channel frame once instead of once per cell read.
            channels = pd.DataFrame(self._as_list(part['channels']['channel']))
            loops.extend(channels['@loop'].tolist())
            volume.extend(channels['@volume'].tolist())
            is_active.extend(channels['@is_active'].tolist())

        return pd.DataFrame({'loops': loops, 'volume': volume, 'is_active': is_active})

    def clean_loops(self):
        """Returns every loop of the song with its first four '/'-separated
        segments removed (i.e. without the "mmj://styles/id/" prefix)."""
        sequence = self.return_full_song_sequence()
        return ['/'.join(loop.split('/')[4:]) for loop in sequence['loops']]

    def get_song_data(self, csvs, mix_pack_dir=None):
        """Returns the song data from csv files for one song.

        For each cleaned loop name the CSV is looked up as ``<loop>.csv``, then
        ``<loop> 1.csv``, then ``<loop>  1.csv`` (two spaces). The first
        candidate present in ``csvs`` is read; if none is listed, the last
        candidate is read anyway.

        :param csvs: iterable of known CSV paths (e.g. from ``glob.glob``)
        :param mix_pack_dir: root directory of the CSV tree; defaults to
                             ``extract.MIX_PACK_DIR``
        :return: DataFrame with columns label, d1..d8, one row per loop
        :raises FileNotFoundError: if the chosen CSV does not exist on disk
        """
        base_dir = self.MIX_PACK_DIR if mix_pack_dir is None else mix_pack_dir
        if not base_dir.endswith(('/', os.sep)):
            base_dir += '/'

        # Set membership is O(1); normalise so equivalent spellings compare equal.
        known = {os.path.normpath(p) for p in csvs}
        cache = {}  # the same loop recurs across parts; read each CSV only once
        rows = []
        for loop in self.clean_loops():
            candidates = [os.path.normpath(base_dir + loop + suffix + '.csv')
                          for suffix in ('', ' 1', '  1')]
            chosen = next((c for c in candidates if c in known), candidates[-1])
            if chosen not in cache:
                # First data row, dropping the two leading columns.
                cache[chosen] = np.array(pd.read_csv(chosen))[0][2:]
            rows.append(cache[chosen])

        return pd.DataFrame(rows, columns=self.FEATURE_COLUMNS)
    


In [None]:
extractor = extract(song_paths[0])

In [None]:
extractor.get_basic_features()

In [None]:
extractor.return_full_song_sequence()

In [None]:
extractor.pitch_sequence()

In [None]:
extractor.length_in_beats()

In [None]:
extractor.part_name()

In [None]:
extractor.chord_sequence()

# Removing the “mmj://styles/id/” prefix from all the loops

In [None]:
extractor = extract(song_paths[1])

d = extractor.clean_loops()
d

In [None]:
csvs

# Task 2. Use the data in the CSV files to replace each loop filename with its corresponding features (label + d1-d8)

In [None]:
# Basically: read the filename from each loop, open the matching CSV and get the label + d1-d8 features.
# Let's add a function that retrieves this for each song file directly.


In [None]:
extractor = extract(song_paths[1])
d = extractor.clean_loops()

In [None]:
# Root of the Mix-Packs CSV tree on the author's machine; change it to match your computer.
path_str = '/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/'
# s = path_str + d[1] + 'ss 1' + '.csv'   # earlier experiment with a suffixed name, kept for reference
s = path_str + d[0]  + '.csv'   # candidate CSV path for the first cleaned loop name

In [None]:
np.array(pd.read_csv(s))[0][2:]

### If True, the loop name was found; if False, it was not. The task description says to add whitespace to the name when no match is found.

In [None]:
s in csvs


In [None]:
# If no match was found, retry with ' 1' (a space and a 1) appended to the loop name,
# as described in the task. Note that this cell tests d[1], not d[0].
s = path_str + d[1] + ' 1' + '.csv'   # candidate CSV path with the ' 1' suffix
s in csvs
# If this is still False, retry with two spaces before the 1 ('  1').

In [None]:
# The final function to get the data with labels from the CSVs can now be written. Change the path string to match your computer.

In [None]:
def get_song_data(self, csvs, mix_pack_dir='/Users/ankit.singh/e2e/kva_exploratory/Scientist_Testdata/Mix-Packs/'):
    """Return the CSV features (label + d1-d8) for every loop of one song.

    For each name returned by ``self.clean_loops()`` the CSV is looked up as
    ``<loop>.csv``, then ``<loop> 1.csv``, then ``<loop>  1.csv`` (two spaces).
    The first candidate present in ``csvs`` is read; if none is listed, the
    last candidate is read anyway.

    :param self: any object providing ``clean_loops()`` (e.g. an ``extract`` instance)
    :param csvs: iterable of known CSV paths (e.g. from ``glob.glob``)
    :param mix_pack_dir: root directory of the CSV tree; change it to match your computer
    :return: DataFrame with columns label, d1..d8, one row per loop
    :raises FileNotFoundError: if the chosen CSV does not exist on disk
    """
    base_dir = mix_pack_dir
    if not base_dir.endswith(('/', os.sep)):
        base_dir += '/'

    # Set membership is O(1); normalise so equivalent spellings compare equal.
    known = {os.path.normpath(p) for p in csvs}
    cache = {}  # the same loop can recur within a song; read each CSV only once
    rows = []
    # Iterate over all loops (not over the characters of the first loop name).
    for loop in self.clean_loops():
        candidates = [os.path.normpath(base_dir + loop + suffix + '.csv')
                      for suffix in ('', ' 1', '  1')]
        chosen = next((c for c in candidates if c in known), candidates[-1])
        if chosen not in cache:
            # First data row, dropping the two leading columns -> label + d1..d8
            # in every branch (the fallbacks previously sliced [:2] by mistake).
            cache[chosen] = np.array(pd.read_csv(chosen))[0][2:]
        rows.append(cache[chosen])

    return pd.DataFrame(rows, columns=['label', 'd1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8'])

In [None]:
extractor.get_song_data(csvs)   # for one song

In [None]:
# Repeat for all songs and run any analysis you want.