In [42]:
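# Third-party dependency: lxml (install once with `%pip install lxml`).
# `datetime` is part of the Python standard library and needs no installation.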
from lxml import etree as ET
from datetime import datetime
import pandas as pd
import os
from collections import Counter

# The workspace folder is resolved relative to the current working directory.
dir_workspace = os.path.join(os.getcwd(), 'workspace_GSO_Comparison')

# "cs" refers to the DB repository export, "gs" to the OOB repository export.
file_name_cs, file_name_gs = (
    os.path.join(dir_workspace, gso_file)
    for gso_file in ('GSO_Repository_DB.gso', 'GSO_Repository_OOB.gso')
)

# Parse both exports up front; every later cell works on these two trees.
tree_cs = ET.parse(file_name_cs)
tree_gs = ET.parse(file_name_gs)

In [2]:
def convert_lxml_to_dict(lxml_dict):
    """Return a new plain ``dict`` holding the same key/value pairs.

    ``lxml_dict`` only needs to yield its keys when iterated and to support
    ``lxml_dict[key]`` lookups (as an element's ``attrib`` does). The result
    is an independent copy, so callers may add keys to it freely.
    """
    return {attr_name: lxml_dict[attr_name] for attr_name in lxml_dict}

In [3]:
def get_element_dict2(tree, elm_name, unique_columns, id_col='id'):
    """Index every ``elm_name`` element of ``tree`` by id and by composite key.

    Parameters:
        tree: parsed XML tree (anything offering ``findall``).
        elm_name: tag name to search for at any depth (``.//<elm_name>``).
        unique_columns: attribute names whose values, joined with ``'-'``,
            form the composite key of an element.
        id_col: attribute used as the key of the first index.

    Returns:
        ``(by_id, by_unique_key)`` - two dicts mapping to independent plain-dict
        copies of the element attributes. When several elements share a key,
        the last one in document order wins.

    Elements lacking one of ``unique_columns`` are still indexed by id but are
    left out of the composite-key index, because no key can be built for them
    (joining a missing value would otherwise raise ``TypeError``).
    Elements lacking ``id_col`` are stored under the key ``None``.
    """
    dict_element_id = {}
    dict_element_unique_col = {}
    for sub_element in tree.findall(".//" + elm_name):
        element_attrib = sub_element.attrib
        dict_element_id[element_attrib.get(id_col)] = dict(element_attrib)

        key_parts = [element_attrib.get(col) for col in unique_columns]
        if any(part is None for part in key_parts):
            # Incomplete composite key: skip the second index only.
            continue
        # A separate copy keeps the two indexes from aliasing each other.
        dict_element_unique_col['-'.join(key_parts)] = dict(element_attrib)
    return dict_element_id, dict_element_unique_col

In [20]:
def get_element_dict(tree, elm_name, id_col='id'):
    """Map the ``id_col`` attribute of each ``elm_name`` element to its attributes.

    Elements are searched at any depth of ``tree``. Each value is a plain-dict
    copy of the element's attributes. An element without ``id_col`` is stored
    under ``None``, and for duplicate ids the last element found wins.
    """
    matches = tree.findall(".//" + elm_name)
    return {
        node.attrib.get(id_col): convert_lxml_to_dict(node.attrib)
        for node in matches
    }

In [77]:
def get_occ_field_dict(tree):
    """Collect the ``field`` children of every ``occurrence`` element, keyed by field id.

    Each value is a copy of the field's attributes extended with:
      * ``occ_id`` / ``occ_name`` - the ``id`` and ``name`` attributes of the
        enclosing occurrence (``None`` when the occurrence lacks them);
      * ``name`` - filled from ``relationId`` when the field has no ``name``.

    Only direct ``field`` children of an occurrence are considered.
    Raises ``KeyError`` for a field that has neither ``name`` nor ``relationId``.
    """
    fields_by_id = {}
    for occurrence in tree.findall('.//occurrence'):
        occ_attrib = occurrence.attrib
        for field in occurrence.findall('field'):
            record = convert_lxml_to_dict(field.attrib)
            record.update(
                occ_id=occ_attrib.get('id'),
                occ_name=occ_attrib.get('name'),
            )
            if 'name' not in record:
                # Nameless fields are identified by their relation instead.
                record['name'] = record['relationId']
            fields_by_id[field.attrib.get('id')] = record
    return fields_by_id

In [4]:
def get_mapping_dict(tree):
    """Group the ``occFieldId`` of every ``mapping`` element by its ``beFieldId``.

    Returns a dict ``{beFieldId: [occFieldId, ...]}``. The lists keep document
    order and may contain duplicates. A missing attribute shows up as ``None``
    (as a key or as a list entry).
    """
    occ_ids_by_be_field = {}
    for mapping in tree.findall('.//mapping'):
        attrs = mapping.attrib
        be_field_id = attrs.get('beFieldId')
        occ_ids_by_be_field.setdefault(be_field_id, []).append(attrs.get('occFieldId'))
    return occ_ids_by_be_field

In [78]:
# Build the lookup tables used by the comparison cell below.
# beField elements are indexed twice per repository: by id and by the
# composite key "<entityTypeId>-<name>".
dict_cs_befield_id,dict_cs_befield_name = get_element_dict2(tree_cs,'beField',['entityTypeId','name'])
dict_gs_befield_id,dict_gs_befield_name = get_element_dict2(tree_gs,'beField',['entityTypeId','name'])
# NOTE(review): 'entiyType' is spelled without the second "t". The recorded run
# completed while looking entity types up in this dict, so the spelling
# presumably matches the tag name used in the .gso files - confirm before
# "correcting" it.
dict_cs_type_id = get_element_dict(tree_cs,'entiyType')
# beFieldId -> list of mapped occFieldIds, per repository.
dict_cs_mapping = get_mapping_dict(tree_cs)
dict_gs_mapping = get_mapping_dict(tree_gs)
# occurrence field id -> field attributes (plus occ_id / occ_name), per repository.
dict_cs_occ_fields = get_occ_field_dict(tree_cs)
dict_gs_occ_fields = get_occ_field_dict(tree_gs)
# Quick sanity check: number of occurrence fields found in the "gs" tree.
print(len(dict_gs_occ_fields))

16309


In [82]:
# Compare every beField of the "cs" repository against the "gs" repository and
# collect one result row per detected problem.
#
# Error codes:
#   ERR_DATA_MISMATCH    - same id in both repositories, but one of the
#                          attributes in list_comp_cols differs.
#   ERR_MAPING_MISMATCH  - same id in both, but the set of mapped occurrence
#                          field ids differs. Checked last, so it overrides a
#                          data mismatch found for the same field.
#   ERR_ID_NAME_CONFLICT - the same "<entityTypeId>-<name>" exists in "gs"
#                          under a different id (GSC field in "cs"), or a
#                          non-GSC "cs" field collides by name with a GSC
#                          field in "gs".
#   ERR_EXTRA_GS_FIELD   - GSC field in "cs" with no id or name match in "gs".

# Attributes that must agree between the two repositories.
list_comp_cols = ['name', 'mandatory', 'multiplicity', 'class', 'customSQL']
list_result = []
for field_id, dict_attr_cs in dict_cs_befield_id.items():
    key_unique = dict_attr_cs['entityTypeId'] + '-' + dict_attr_cs['name']

    ERR_CODE = None
    ERR_DIFF_1 = None
    ERR_DIFF_2 = None
    CROSS_REF_ID = None
    if dict_attr_cs.get('nameSpace') == 'GSC':
        if field_id in dict_gs_befield_id:
            dict_attr_gs = dict_gs_befield_id[field_id]
            # Symmetric comparison in list_comp_cols order: an attribute that
            # exists on only one side counts as a difference, and the
            # reported text has a deterministic order.
            diff_cols = [col for col in list_comp_cols
                         if dict_attr_cs.get(col) != dict_attr_gs.get(col)]
            mapping_cs = dict_cs_mapping.get(field_id, [])
            mapping_gs = dict_gs_mapping.get(field_id, [])
            if diff_cols:
                ERR_CODE = 'ERR_DATA_MISMATCH'
                ERR_DIFF_1 = ', '.join(col + ' = ' + dict_attr_cs[col]
                                       for col in diff_cols if col in dict_attr_cs)
                ERR_DIFF_2 = ', '.join(col + ' = ' + dict_attr_gs[col]
                                       for col in diff_cols if col in dict_attr_gs)
            # Order and duplicates of the mappings are irrelevant.
            if set(mapping_cs) != set(mapping_gs):
                ERR_CODE = 'ERR_MAPING_MISMATCH'
                ERR_DIFF_1 = ','.join(
                    dict_cs_occ_fields[occ_field_id]['occ_name'] + '.' + dict_cs_occ_fields[occ_field_id]['name']
                    for occ_field_id in mapping_cs if occ_field_id in dict_cs_occ_fields)
                ERR_DIFF_2 = ','.join(
                    dict_gs_occ_fields[occ_field_id]['occ_name'] + '.' + dict_gs_occ_fields[occ_field_id]['name']
                    for occ_field_id in mapping_gs if occ_field_id in dict_gs_occ_fields)
        elif key_unique in dict_gs_befield_name:
            ERR_CODE = 'ERR_ID_NAME_CONFLICT'
            CROSS_REF_ID = 'Cross ref id : ' + dict_gs_befield_name[key_unique]['id']
        else:
            ERR_CODE = 'ERR_EXTRA_GS_FIELD'
    elif key_unique in dict_gs_befield_name:
        dict_attr_gs = dict_gs_befield_name[key_unique]
        if dict_attr_gs.get('nameSpace') == 'GSC':
            ERR_CODE = 'ERR_ID_NAME_CONFLICT'
            # Store the cross reference in the variable that is written to the
            # result row (it used to go to an unused name and was lost).
            CROSS_REF_ID = 'Cross ref id : ' + dict_attr_gs['id']

    if ERR_CODE is not None:
        gso_type_name = dict_cs_type_id[dict_attr_cs['entityTypeId']]['name']
        list_result.append({
            'ID': dict_attr_cs['id'],
            'NAME': dict_attr_cs['name'],
            'TYPE': gso_type_name,
            'ERR_CODE': ERR_CODE,
            'ERR_DIFF_1': ERR_DIFF_1,
            'ERR_DIFF_2': ERR_DIFF_2,
            'CROSS_REF_ID': CROSS_REF_ID,
        })

print(len(list_result))

1271


In [83]:
# Name the columns explicitly so the CSV always carries the same header in the
# same order, even when no conflicts were found and list_result is empty.
df = pd.DataFrame(
    list_result,
    columns=['ID', 'NAME', 'TYPE', 'ERR_CODE', 'ERR_DIFF_1', 'ERR_DIFF_2', 'CROSS_REF_ID'],
)
# The default row index is still written as the first (unnamed) column.
df.to_csv(os.path.join(dir_workspace, 'field_conflict.csv'))