## XML Data Processing ##

In [1]:
from lxml import etree as ET
import os.path
from os import path
import shutil
import zipfile
import pandas as pd

# Workspace layout: all directories live under one folder inside the current
# working directory.
dir_workspace = os.path.join(os.getcwd(), 'workspace_Xml_Data_Processing')
dir_input, dir_staging, dir_output = [
    os.path.join(dir_workspace, sub_dir)
    for sub_dir in ('input', 'staging', 'output')
]
print(dir_workspace)

D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing


#### Function : replace_error  ####
* Removes the attributes (the `xmlns...` declarations) from the `<Document xmlns...>` opening tag of the XML string before parsing

In [2]:
def replace_error(str_input):
    """Strip the attributes from the '<Document xmlns...>' opening tag.

    Everything between '<Document' and the first '>' that follows it is
    dropped, so '<Document xmlns="...">' becomes '<Document>'. Text before
    and after the tag is returned unchanged.

    Raises:
        ValueError: if '<Document xmlns' is absent, or no '>' follows it.
    """
    tag_start = str_input.index('<Document xmlns')
    # First '>' at or after the tag start, i.e. the end of the opening tag.
    tag_end = str_input.index('>', tag_start)
    # Keep everything up to and including '<Document', then resume at '>'.
    return str_input[:tag_start + len('<Document')] + str_input[tag_end:]

#### Function : get_event_type  ####
* To get the Event Type by parsing the xml string 
* If the tag 'CorpActnGnlInf/EvtTp' is not found, then return None

In [3]:
# Xml Parsing Code #
def get_event_type(str_xml):
    """Return the event-type code found in one XML document string.

    The string is only parsed when it contains '<Document xmlns'. The
    opening tag is first cleaned with replace_error() and the result is
    parsed with ET.fromstring(). The code is the text of the first child
    of a 'CorpActnGnlInf/EvtTp' element.

    Args:
        str_xml: text of a single XML document.

    Returns:
        The text of the first child of the matching 'EvtTp' element, or
        None when the string has no '<Document xmlns' marker or no
        matching element with a child exists.
    """
    event_type = None
    if '<Document xmlns' not in str_xml:
        return event_type

    tree = ET.fromstring(replace_error(str_xml))
    for elm_CorpActnGnlInf in tree.findall('.//CorpActnGnlInf'):
        for elm_EvtTp in elm_CorpActnGnlInf.findall('EvtTp'):
            # Iterate the element directly; Element.getchildren() is
            # deprecated in lxml in favour of iteration / list(element).
            for elm_Cd in elm_EvtTp:
                event_type = elm_Cd.text
                # Only the first child is read. The outer loops keep going,
                # so if several 'EvtTp' elements match, the last one wins.
                break
    return event_type

In [4]:
def read_file(filename):
    """Read a text file and split it into its individual XML documents.

    The file content is split on the literal declaration
    '<?xml version="1.0"?>'. Whatever precedes the first declaration is
    discarded; the remaining pieces (without the declaration itself) are
    returned in file order.
    """
    with open(filename, 'r') as handle:
        raw_text = handle.read()
    # split() always yields at least one piece; drop the leading one.
    _, *documents = raw_text.split('<?xml version="1.0"?>')
    return documents

#### Function : get_file_info  ####
* To get the counts of all event types for given file 
* Documents whose event type is not found are counted under the label 'NOT_FOUND'

In [5]:
# File Reading Code #
def get_file_info(filename):
    """Count the event types of all XML documents contained in one file.

    Args:
        filename: path of a staged text file readable by read_file().

    Returns:
        A dict with, in this order:
          * 'FILE_NAME': the file's base name with 'dtc_cano.xml.' and
            '.txt' removed;
          * one key per event type found, mapped to its number of
            occurrences (in order of first appearance);
          * 'NOT_FOUND': number of documents for which get_event_type()
            returned None.
    """
    list_xml = read_file(filename)
    # Parse every document exactly once and reuse the result for both the
    # per-type counts and the NOT_FOUND count.
    list_event_type = [get_event_type(x) for x in list_xml]

    # Work on the base name so the label does not depend on the platform's
    # path separator (and no backslash escape is needed in the literal).
    file_label = os.path.basename(filename).replace('dtc_cano.xml.', '').replace('.txt', '')

    dict_file_info = {'FILE_NAME': file_label}
    count_event_type_not_found = 0
    for ev_type in list_event_type:
        if ev_type is None:
            count_event_type_not_found += 1
        else:
            dict_file_info[ev_type] = dict_file_info.get(ev_type, 0) + 1
    dict_file_info['NOT_FOUND'] = count_event_type_not_found
    return dict_file_info

In [6]:
def group_by_event_type(filename,dict_event_type):
    """Append each XML document of a file to the list for its event type.

    dict_event_type is updated in place: it maps an event type to the list
    of document strings that carry it. Documents for which
    get_event_type() returns None are skipped. Nothing is returned.
    """
    for document in read_file(filename):
        code = get_event_type(document)
        if code is None:
            continue
        # Create the bucket on first sight of this event type.
        dict_event_type.setdefault(code, []).append(document)

#### File Preparation  ####
* Unzip all input files from input directory to staging directory 

In [7]:
# Creating Staging Directory
# Start from an empty staging directory so nothing extracted by a previous
# run is still present.
if (path.exists(dir_staging)):
    shutil.rmtree(dir_staging)
os.mkdir(dir_staging)

for filename in sorted(os.listdir(dir_input)):
    filepath = os.path.join(dir_input, filename)
    # Anything in the input directory that is not a zip archive (stray
    # files, sub-directories) would make ZipFile raise and abort the cell.
    if not zipfile.is_zipfile(filepath):
        print('Skipped (not a zip archive) ... ' + filepath)
        continue
    with zipfile.ZipFile(filepath, 'r') as zip_ref:
        zip_ref.extractall(dir_staging)
    print('Extracted ... '+ filepath)


Extracted ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\input\dtc_cano.xml.20200701_185007.txt.zip
Extracted ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\input\dtc_cano.xml.20200702_185003.txt.zip
Extracted ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\input\dtc_cano.xml.20200703_185003.txt.zip
Extracted ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\input\dtc_cano.xml.20200705_185003.txt.zip
Extracted ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\input\dtc_cano.xml.20200706_185006.txt.zip
Extracted ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\input\dtc_cano.xml.20200707_185001.txt.zip
Extracted ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\input\dtc_cano.xml.20200708_185008.txt.zip
Extracted ... D:\GIT_Repositories\

#### File Processing ####
* Process each file by calling the function get_file_info and collect the results in a list

In [8]:
## Xml Processing Main Program
# Build one summary dict per staged file, visiting the files in name order.
list_file_info = []
for filename in sorted(os.listdir(dir_staging)):
    filepath = os.path.join(dir_staging, filename)
    file_info = get_file_info(filepath)
    list_file_info.append(file_info)
    print('Processed ... ', filepath, sep='')

print('Successfully processed ', len(list_file_info), ' files.', sep='')

Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200701_185007.txt
Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200702_185003.txt
Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200703_185003.txt
Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200705_185003.txt
Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200706_185006.txt
Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200707_185001.txt
Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200708_185008.txt
Processed ... D:\GIT_Repositories\DevFactory\Jup

#### Final Dashboard for Analysis ####

In [9]:
# One row per processed file; event types that do not occur in a file are
# left as missing values in that row.
df = pd.DataFrame(list_file_info).reset_index(drop=True)
summary_path = os.path.join(dir_workspace,'summary.csv')
df.to_csv(summary_path, index=False)
df

Unnamed: 0,BIDS,BPUT,BRUP,CHAN,CONS,CONV,DETI,DFLT,DRAW,DTCH,...,NOOF,NOT_FOUND,OTHR,PDEF,PLAC,REDM,SPLF,SPLR,TEND,WRTH
0,35.0,531.0,2.0,15.0,33.0,122.0,,4.0,420.0,2.0,...,,0,208.0,135.0,2.0,378.0,,26,3.0,9.0
1,32.0,5.0,2.0,20.0,15.0,51.0,1.0,,853.0,,...,,0,57.0,190.0,,397.0,,30,6.0,10.0
2,2.0,86.0,,2.0,11.0,33.0,,,929.0,,...,,0,66.0,1639.0,,3946.0,,10,,7.0
3,,,,1.0,,,,,,,...,,0,9.0,,,,,1,,2.0
4,8.0,22.0,12.0,12.0,6.0,54.0,,,760.0,,...,2.0,0,10.0,81.0,,345.0,,23,6.0,5.0
5,20.0,16.0,26.0,5.0,12.0,71.0,,,459.0,,...,,0,14.0,75.0,2.0,390.0,,27,10.0,11.0
6,60.0,4.0,,10.0,1.0,37.0,,,243.0,,...,1.0,0,45.0,93.0,1.0,362.0,,29,6.0,16.0
7,17.0,17.0,2.0,10.0,10.0,46.0,,1.0,289.0,1.0,...,,0,45.0,78.0,1.0,337.0,,23,2.0,17.0
8,27.0,20.0,,17.0,75.0,41.0,,,406.0,,...,3.0,0,33.0,82.0,,261.0,,27,9.0,17.0
9,,,,1.0,,,,,,,...,,0,10.0,,,,,1,,2.0


In [10]:
## Xml Processing to Group based on Event Type
# Maps each event type to the list of XML documents carrying it, collected
# across all staged files in name order.
dict_event_type = {}
staged_paths = [
    os.path.join(dir_staging, staged_name)
    for staged_name in sorted(os.listdir(dir_staging))
]
for filepath in staged_paths:
    group_by_event_type(filepath,dict_event_type)
    print('Processed ... ', filepath, sep='')


Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200701_185007.txt
Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200702_185003.txt
Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200703_185003.txt
Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200705_185003.txt
Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200706_185006.txt
Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200707_185001.txt
Processed ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\staging\dtc_cano.xml.20200708_185008.txt
Processed ... D:\GIT_Repositories\DevFactory\Jup

In [11]:
# Creating Output Directory
# Start from an empty output directory so files from a previous run are
# not mixed with this run's results.
if (path.exists(dir_output)):
    shutil.rmtree(dir_output)
os.mkdir(dir_output)

# Write one text file per event type containing all of its documents,
# concatenated in the order they were collected.
for event_type in dict_event_type:
    content = ''.join(dict_event_type[event_type])
    filepath = os.path.join(dir_output,'EventType_'+event_type+'.txt')
    # The context manager closes the file even if the write fails.
    with open(filepath, 'w') as textfile:
        textfile.write(content)
    print('Successfully created ... '+filepath)

Successfully created ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\output\EventType_BIDS.txt
Successfully created ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\output\EventType_EXRI.txt
Successfully created ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\output\EventType_REDM.txt
Successfully created ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\output\EventType_MRGR.txt
Successfully created ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\output\EventType_OTHR.txt
Successfully created ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\output\EventType_PLAC.txt
Successfully created ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\workspace_Xml_Data_Processing\output\EventType_SPLR.txt
Successfully created ... D:\GIT_Repositories\DevFactory\JupyterNoteBook_Dev\

In [12]:
# Environment check: show the architecture information (bit width and
# linkage format) reported for the Python interpreter running this notebook.
import platform
platform.architecture()

('64bit', 'WindowsPE')