# Defused XML Testing for Python DxReview

### Imports

In [1]:
from defusedxml import ElementTree as ET
import pandas as pd

### Constants

In [2]:
# Path of the XML file that is parsed and explored below.
xml_path = 'test.xml'
# Path of a second XML file, used below to exercise get_root() on a file for
# which it returns None.
bad_root_xml_path = 'bad_root.xml'
# Excel workbook that receives the project-information table.
proj_info_output = 'proj_info_output.xlsx'

### Parsing and Exploration
Parse the XML file, get the root element and explore the structure

In [3]:
# Parse the file through defusedxml's ElementTree module and keep both the
# tree (doc) and its root element (root) for the cells that follow.
doc = ET.parse(xml_path)
root = doc.getroot()

In [4]:
# Show the root element's tag name and its attribute dictionary.
print(f'Root tag: {root.tag}')
print(f'Root attributes: {root.attrib}')

Root tag: ProjNet
Root attributes: {}


In [5]:
def verify_root(root):
    """Return True when *root* looks like the root of a ProjNet export.

    Args:
        root: Object expected to expose a string ``tag`` attribute, such as
            an ElementTree element.

    Returns:
        True if ``root.tag`` equals 'projnet' ignoring case. False if it does
        not, or if *root* has no ``tag`` or the tag has no ``lower()`` method
        (for example when *root* or its tag is None).
    """
    try:
        return root.tag.lower() == 'projnet'
    except AttributeError:
        # Only a missing attribute means "not a valid root"; any other error
        # is unexpected and is allowed to surface instead of being hidden.
        return False

In [6]:
# Check the root parsed above with verify_root().
print(verify_root(root))

True


In [7]:
# List the direct children of the root element with their attributes.
for child in root:
    print(child.tag, child.attrib)

DrChecks {}
Comments {}


### Create startup methods

In [8]:
def get_root(path):
    """Parse the XML at *path* and return its root if it is a ProjNet root.

    Args:
        path: Value handed unchanged to ``ET.parse``.

    Returns:
        The root element when parsing succeeds and the root tag equals
        'projnet' ignoring case; otherwise None.
    """
    try:
        root = ET.parse(path).getroot()
    except Exception:
        # Deliberate best-effort loader: any failure to read or parse the
        # file is reported as None. Unlike a bare ``except``, this still lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
    if root.tag.lower() == 'projnet':
        return root
    return None

In [9]:
# Load the same file through get_root(). get_root() returns None on failure,
# in which case root2.tag below would raise AttributeError.
root2 = get_root(xml_path)
print(root2.tag, type(root2))

ProjNet <class 'xml.etree.ElementTree.Element'>


In [10]:
# Run get_root() on the second test file and show the value and type it returns.
bad_root = get_root(bad_root_xml_path)
print(bad_root, type(bad_root))

None <class 'NoneType'>


### DrChecks Element - Project Information Element
The \<DrChecks> element is always the first child of the root element \<ProjNet>. To make the code more readable, we assign named constants for the child indexes.

In [11]:
# Positions of the two children of the root element, as listed above:
# DrChecks first, Comments second.
drchecks_index = 0
comments_index = 1

In [12]:
# The child at drchecks_index holds the project information fields.
proj_info_element = root[drchecks_index]

In [13]:
# Print tag, text and attributes of every project-information field.
for child in proj_info_element:
    print(child.tag, child.text, child.attrib)

ProjectID 069848 {}
ProjectControlNbr 511143 {}
ProjectName P530 Aircraft Intermediate Maintenance Facility, Kadena AB {}
ReviewID 235411 {}
ReviewName Concept Design Resubmittal {}


##### Read Project Info into a Dictionary

In [14]:
# Start from an empty mapping; it is filled from the XML in the next cell.
proj_info = {}

In [15]:
# Store each field as tag -> text. If a tag occurred more than once, the
# later value would overwrite the earlier one.
for child in proj_info_element:
    proj_info[child.tag] = child.text

In [16]:
# Display the collected tag -> text mapping.
proj_info

{'ProjectID': '069848',
 'ProjectControlNbr': '511143',
 'ProjectName': 'P530 Aircraft Intermediate Maintenance Facility, Kadena AB',
 'ReviewID': '235411',
 'ReviewName': 'Concept Design Resubmittal'}

In [17]:
# Look up one value by its original XML tag name.
# NOTE(review): a later cell replaces the keys of proj_info with display
# labels, so re-running this cell after that one raises KeyError.
proj_info['ProjectName']

'P530 Aircraft Intermediate Maintenance Facility, Kadena AB'

##### Remap the keys in the dictionary to more human readable language

In [18]:
# Maps each XML tag found under the project-information element to the label
# shown in the output table.
key_remap = {
    'ProjectID': 'Project ID',
    'ProjectControlNbr': 'Project Control Number',
    'ProjectName': 'Project Name',
    'ReviewID': 'Review ID',
    'ReviewName': 'Review Name'
}

In [19]:
# Replace each raw XML tag with its display label. A tag that has no entry in
# key_remap is kept unchanged, so an unexpected field in the export, or running
# this cell a second time (when the keys are already labels), does not raise
# KeyError.
proj_info = {key_remap.get(tag, tag): value for tag, value in proj_info.items()}
proj_info

{'Project ID': '069848',
 'Project Control Number': '511143',
 'Project Name': 'P530 Aircraft Intermediate Maintenance Facility, Kadena AB',
 'Review ID': '235411',
 'Review Name': 'Concept Design Resubmittal'}

In [20]:
# Two-column table of the project information. The column names are passed to
# the constructor instead of being assigned afterwards, so an empty proj_info
# yields an empty two-column frame rather than a length-mismatch error.
df = pd.DataFrame(list(proj_info.items()), columns=['Parameter', 'Value'])
df

Unnamed: 0,Parameter,Value
0,Project ID,069848
1,Project Control Number,511143
2,Project Name,P530 Aircraft Intermediate Maintenance Facilit...
3,Review ID,235411
4,Review Name,Concept Design Resubmittal


In [21]:
# Write the table to the workbook. index=False and header=False leave out the
# row index and the 'Parameter'/'Value' header row, so only the data rows are
# written.
df.to_excel(proj_info_output, index=False, header=False)

## Comment Extraction

In [22]:
comments_element = root[comments_index]
# Element.text is None when the element has no text before its first child;
# fall back to an empty string so the quoting and len() below cannot fail.
comments_text = comments_element.text or ''
print(comments_element.tag,
      '"' + comments_text + '"',
      len(comments_text),
      comments_element.attrib)

Comments "
					" 6 {}


It appears that the Comments element has no attributes; it is just a collection element. Its "text" contains only stray whitespace, which is a remnant of the export from ProjNet's website.

In [23]:
# Number of direct children of the Comments element. The variable is not named
# child_count because a function with that name is defined in a later cell;
# sharing the name would make whichever cell ran last win, so re-running this
# cell would break later child_count(...) calls.
comment_count = len(comments_element)
comment_count

114

In [24]:
def child_count(element):
    """Return ``len(element)``.

    For an ElementTree element this is the number of its direct child
    elements.
    """
    number_of_children = len(element)
    return number_of_children

In [25]:
# Same count as before, this time obtained through the helper function.
print(child_count(comments_element))

114


### Extracting Comment Elements
There are two main approaches:
- Traverse the elements
- Extract using XPATH

First, we need to understand what subelements are included in a Comment

In [26]:
# Take the first comment as a sample and count its sub-elements.
a_comment = comments_element[0]
print(child_count(a_comment))

16


In [27]:
# Lower-cased tag names of the sample comment's sub-elements, in document order.
all_tags = [child.tag.lower() for child in a_comment]
all_tags

['id',
 'spec',
 'sheet',
 'detail',
 'critical',
 'commenttext',
 'attachment',
 'docref',
 'createdby',
 'createdon',
 'status',
 'discipline',
 'doctype',
 'coordinatingdiscipline',
 'evaluations',
 'backchecks']

In [28]:
# Collects tags that are found in other comments but not in the sample comment.
other_tags = []

In [29]:
# Scan every comment for sub-element tags that the sample comment did not have.
# A set of already-seen tags gives constant-time membership tests and records
# each new tag only once, even when it occurs in many comments or when this
# cell is run again.
seen_tags = set(all_tags).union(other_tags)
for comment in comments_element:
    for item in comment:
        tag = item.tag.lower()
        if tag not in seen_tags:
            seen_tags.add(tag)
            other_tags.append(tag)

In [30]:
# An empty list means every comment uses only the tags of the sample comment.
other_tags

[]

**NOTE:**

I am trying to extract the possible top-level items from any given comment. I pull the tags from the first comment, then scan through the rest of the comments to see if there are any other tags I missed. If other_tags is not empty, I add those tags to the ones taken from the first comment and repeat until other_tags is empty.

**TODO:**

I need to go to ProjNet and enter comments to see what other element tags are possible, and include those as well. This means reviewing not just the XML but also the HTML from the BidderInquiry module.

### Experiment iter('comment')

In [31]:
#for comment in comments_element.iter('comment'):
#    print(comment.find('id'))

In [32]:
# Collect the direct <comment> children of the Comments element.
all_comments = comments_element.findall('comment')
# isinstance() is the idiomatic type check; it also accepts list subclasses.
print(isinstance(all_comments, list))

True


In [33]:
# Read the text of each comment's <id> child (printing is disabled).
# find() returns None when a comment has no <id> child, in which case the
# .text access would raise AttributeError.
for comment in all_comments:
    comment_id = comment.find('id').text
    #print(comment_id)

In [34]:
#import dxr_classes as dxrc
from dxr_classes import Comment, Comments

In [35]:
#for comment in all_comments:
#    #new_comment = Comment(id=comment.find('id').text)
#    new_comment = Comment.from_element(comment)
#    #new_comment.ID()

In [36]:
# Build a Comment object from the first <comment> element via the from_element
# constructor of dxr_classes.Comment, then show its author attribute.
new_comment = Comment.from_element(all_comments[0])
new_comment.author

'Yat Hung Chan'

In [37]:
# Show the comment's date value and the type it is stored as.
print(new_comment.date)
type(new_comment.date)

2024-04-02 11:51:00


datetime.datetime

In [38]:
# Inspect the evaluations attribute of the Comment object.
new_comment.evaluations

<Element 'evaluations' at 0x0000025842E5ABB0>

In [39]:
#new_comment.print_all()

In [40]:
#new_comment.dump()

In [41]:
# Wrap every <comment> element in a Comment object, skip any object that
# compares equal to one already collected, and bundle the result in a Comments
# object.
# NOTE(review): an earlier cell builds a Comment with Comment.from_element(...),
# while this loop calls Comment(...) directly - confirm that dxr_classes
# supports both forms.
# NOTE(review): the "not in" test presumably depends on how Comment defines
# equality; verify against dxr_classes.
comment_list = []
for comment in all_comments:
    this_comment = Comment(comment)
    if this_comment not in comment_list:
        comment_list.append(this_comment)

convo = Comments(comment_list)

In [42]:
# Count reported by the Comments collection.
convo.count()

114

In [43]:
# Show what the Comments collection returns from dump().
convo.dump()

[{'id': '10732282',
  'spec': 'n/a',
  'sheet': None,
  'detail': None,
  'critical': 'No',
  'text': 'Page 927-934 and 944-951, Room Information Entered Values\n\nPlease provide justification why the People Based Rates are 0.0 cfm/person and the Area Based Rate are 0.0 cfm/ft2.',
  'attachment': None,
  'docref': None,
  'author': 'Yat Hung Chan',
  'date': datetime.datetime(2024, 4, 2, 11, 51),
  'status': 'Closed',
  'discipline': 'Mechanical',
  'coordinatingdiscipline': 'Structural',
  'doctype': 'Design Analysis',
  'evaluations': <Element 'evaluations' at 0x0000025842E5ABB0>,
  'backchecks': <Element 'backchecks' at 0x0000025842E5AF70>},
 {'id': '10732362',
  'spec': 'n/a',
  'sheet': None,
  'detail': None,
  'critical': 'No',
  'text': 'Please call out what kinds of piping on P-100, for examples SS, R,...etc.',
  'attachment': None,
  'docref': None,
  'author': 'Yat Hung Chan',
  'date': datetime.datetime(2024, 4, 2, 13, 47),
  'status': 'Open',
  'discipline': 'Plumbing',
  

In [44]:
# Build a DataFrame from the dump. This rebinds df, which previously held the
# project-information table.
df = pd.DataFrame(convo.dump())

In [45]:
# Write the comments table to an Excel workbook without the row index.
# NOTE(review): the dump shown above holds Element objects in the 'evaluations'
# and 'backchecks' fields - check how those cells come out in the spreadsheet.
dump_path = 'comments_dump.xlsx'
df.to_excel(dump_path, index=False)