In [24]:
# importing element tree
# under the alias of ET
import xml.etree.ElementTree as ET
 
# Passing the path of the
# xml document to enable the
# parsing process
tree = ET.parse('data/raw/all.xml')
 
# getting the parent tag of
# the xml document
root = tree.getroot()
 
# printing the root (parent) tag
# of the xml document, along with
# its memory location
print(root)

import pprint
pp = pprint.PrettyPrinter(indent=4)

<Element 'ADOXML' at 0x000002954AC09A40>


In [25]:
xml_models = root[0]
model_classes = []

for m in xml_models:
    model_classes.append(m.attrib["modeltype"])
    
model_classes = list(set(model_classes))

num_model_classes = len(model_classes)

def get_model_class(model):
    return model_classes.index(model.attrib["modeltype"])


pp.pprint(model_classes)
print(len(model_classes))

[   'Concepts Model',
    'Goal Model',
    'Business Rule Model',
    'Product-Service-Model',
    '4EM General Model',
    'Technical Components and Requirements Model',
    'Business Process Model',
    'Actors and Resources Model',
    'Goal & Business Rule Model',
    'Business Rule & Process Model']
10


In [26]:
xml_models = root[0]
model_data = []
node_classes = []
edge_classes = []

for m in xml_models:
    instances = [el for el in m if el.tag == "INSTANCE"]
    connectors = [el for el in m if el.tag == "CONNECTOR"]

    
    for instance in instances:
        node_class = instance.attrib["class"]
        node_classes.append(node_class)
    
    for connector in connectors:
        edge_type = next(filter(lambda attr: attr.get("name") == "Type", connector.findall("ATTRIBUTE"))).text
        if(edge_type is None):
            edge_type = "none"
        edge_classes.append(edge_type)
        
    
node_classes = list(set(node_classes))
edge_classes = list(set(edge_classes))

num_node_classes = len(node_classes)
num_edge_classes = len(edge_classes)

print(node_classes)
print(edge_classes)

['Role', 'Problem', 'Goal', 'Partial-PartOF', 'Information Set', 'Unspecific/Product/Service', 'Rule', 'IS Requirement', 'PartOF (OR)', 'Individual', 'Partial-ISA', 'AND/OR', 'Concept', 'Resource', 'Organizational Unit', 'OR', 'Total-PartOF', 'Total-ISA', 'PartOF (AND)', 'PartOF (XOR)', 'Join (AND)', 'Component', 'External Process', 'Opportunity', 'Split (AND)', 'Constraint', 'Feature', 'Comment', 'Attribute', 'Split (OR)', 'AND', 'Process', 'Join (OR)', 'IS Technical Component', 'Cause']
['hinders', 'n:m', 'Input', 'play', 'supports', 'uses', '1:1', 'requires', 'motivates', 'Causes', 'is responsible for', '1:n', 'Output', 'has requirement', 'Supports', 'Hinders', 'performs', 'none']


In [27]:
xml_models = root[0]
model_data = []

for m in xml_models:
    model = {}
    nodes = []
    edges = []
    adjacency_list = []
    
    nodes_data = []
    edges_data = []
    
    model["class"] = get_model_class(m)
    
    instances = [el for el in m if el.tag == "INSTANCE"]
    connectors = [el for el in m if el.tag == "CONNECTOR"]

    
    for instance in instances:
        node = {}
        node_class = instance.attrib["class"]
        node_name = instance.attrib["name"]
        node["class"] = node_class
        node["name"] = node_name
        nodes_data.append(node)
        
        nodes.append(node_classes.index(node_class))
        
    for connector in connectors:
        edge = {}
        edge_type = next(filter(lambda attr: attr.get("name") == "Type", connector.findall("ATTRIBUTE"))).text
        if(edge_type is None):
            edge_type = "none"
        edge["type"] = edge_type
        
        connector_from = connector.find("FROM").get("instance")
        connector_to = connector.find("TO").get("instance")
        edge["from"] = connector_from
        edge["to"] = connector_to
        
        from_index = [node_data["name"] for node_data in nodes_data].index(connector_from)
        to_index = [node_data["name"] for node_data in nodes_data].index(connector_to)
        
        adjacency_list.append([from_index, to_index])
        
        edges_data.append(edge)
        
        edges.append(edge_classes.index(edge_type))
    
    model["nodes"] = nodes
    model["edges"] = edges
    model["adjacency"] = adjacency_list
    
    model["nodes_data"] = nodes_data
    model["edges_data"] = edges_data
    
    model_data.append(model)
    

print([data["name"] for data in model_data[0]["nodes_data"]])

['ISTC 1', 'AND-20006', 'Goal - 21', 'Goal - 22', 'Goal - 23', 'Goal - 20', 'Problem - 16', 'ISTC 8', 'AND-20036', 'IS Requirement - 10', 'IS Requirement - 12', 'IS Requirement - 11', 'IS Requirement - 13', 'IS Requirement - 14', 'IS Requirement - 8', 'Goal - 32', 'Goal - 35', 'Goal - 28', 'Goal - 34', 'Problem - 25', 'IS Requirement - 7', 'Problem - 27', 'Problem - 26', 'IS Requirement - 9', 'Problem - 28', 'Goal - 30', 'Problem - 12', 'Problem - 21', 'Problem - 13', 'Problem - 18', 'Problem - 14', 'Problem - 22', 'Problem - 29', 'Goal - 29', 'Problem - 19', 'Problem - 17', 'Problem - 20', 'Problem - 23', 'ISTC 7', 'IS Requirement-20162', 'Goal - 31', 'Comment-11339']


In [28]:
print("MODEL CLASSES")
pp.pprint(model_classes)
print("COUNT: " + str(num_model_classes))
print()

print("NODE CLASSES")
pp.pprint(node_classes)
print("COUNT: " + str(num_node_classes))
print()

print("EDGE_CLASSES")
pp.pprint(edge_classes)
print("COUNT: " + str(num_edge_classes))
print()

MODEL CLASSES
[   'Concepts Model',
    'Goal Model',
    'Business Rule Model',
    'Product-Service-Model',
    '4EM General Model',
    'Technical Components and Requirements Model',
    'Business Process Model',
    'Actors and Resources Model',
    'Goal & Business Rule Model',
    'Business Rule & Process Model']
COUNT: 10

NODE CLASSES
[   'Role',
    'Problem',
    'Goal',
    'Partial-PartOF',
    'Information Set',
    'Unspecific/Product/Service',
    'Rule',
    'IS Requirement',
    'PartOF (OR)',
    'Individual',
    'Partial-ISA',
    'AND/OR',
    'Concept',
    'Resource',
    'Organizational Unit',
    'OR',
    'Total-PartOF',
    'Total-ISA',
    'PartOF (AND)',
    'PartOF (XOR)',
    'Join (AND)',
    'Component',
    'External Process',
    'Opportunity',
    'Split (AND)',
    'Constraint',
    'Feature',
    'Comment',
    'Attribute',
    'Split (OR)',
    'AND',
    'Process',
    'Join (OR)',
    'IS Technical Component',
    'Cause']
COUNT: 35

EDGE_CLASSE

In [7]:
pp.pprint(model_data[0])
#print(nodes.shape)

{   'adjacency': [   [1, 0],
                     [2, 1],
                     [3, 1],
                     [4, 1],
                     [5, 0],
                     [5, 7],
                     [6, 5],
                     [7, 14],
                     [7, 20],
                     [7, 23],
                     [8, 7],
                     [9, 8],
                     [10, 8],
                     [11, 8],
                     [12, 8],
                     [13, 8],
                     [15, 14],
                     [16, 23],
                     [17, 7],
                     [18, 20],
                     [19, 15],
                     [21, 18],
                     [22, 18],
                     [24, 16],
                     [25, 17],
                     [26, 17],
                     [27, 17],
                     [28, 17],
                     [29, 25],
                     [30, 25],
                     [31, 25],
                     [32, 17],
                     [33, 38],
   

In [8]:
print(len(model_data[0]["adjacency"]))
print(len(model_data[0]["edges"]))
print(len(model_data))

40
40
22


Fragen:

- welche Node und Edge Features könnten noch relevant sein?
    - in dem Zsmhang, wie viel Sinn würde es ergeben, ATTRIBUTE Description mit word2vec zu embedden. Mir scheinen die Beschreibungen nicht als wirklich aussagekräftig
- spielt 
    `<ATTRIBUTE name="Positions" type="STRING">EDGE 0 index:8</ATTRIBUTE>`
    eine Rolle, konnte nicht herausfinden, wofür das Attribut ist (CONNECTOR)
- es existieren Connectoren, die keinen expliziten Type haben:
    `<ATTRIBUTE name="Type" type="ENUMERATION"/>`

- Gibt es eventuell Übergangslösungen für Werkstudentenvertrag wenn ich exmatrikuliert werde

 
`<ATTRIBUTE name="Priority" type="ENUMERATION">Low</ATTRIBUTE>
<ATTRIBUTE name="Criticality" type="ENUMERATION">Low</ATTRIBUTE>
<ATTRIBUTE name="Description" type="LONGSTRING">Fehlende Registrierung (S.5)</ATTRIBUTE>`

Begründung PyTorch Nutzung
Argumentation zu Feature dokumenieren
Befristung Arbeitsvertrag nachschauen 