In [1]:
#import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Dataset, Data
import numpy as np 
import os

In [2]:
import xml.etree.ElementTree as ET
import pprint
# Pretty-printer kept around for the (currently commented-out) debug output in later cells.
pp = pprint.PrettyPrinter(indent=4)
# Parse the raw XML export once; `root` is the document element that the
# following cells index into (`root[0]`) to reach the model elements.
tree = ET.parse('./data/raw/all2.xml')
root = tree.getroot()
#print(root)

In [3]:
# root[0] is the element whose children are iterated as the individual models.
xml_models = root[0]

# Distinct "modeltype" attribute values, sorted so that the index of each model
# type is identical on every run. Iterating a plain set of strings depends on
# hash randomisation, which would make the integer class ids change between
# kernel restarts.
model_classes = sorted({m.attrib["modeltype"] for m in xml_models})

num_model_classes = len(model_classes)

def get_model_class(model):
    """Return the integer id of ``model``'s model type.

    The id is the position of the element's ``modeltype`` attribute inside the
    notebook-level ``model_classes`` list.

    Raises:
        KeyError: if the element has no ``modeltype`` attribute.
        ValueError: if the attribute value is not contained in ``model_classes``.
    """
    modeltype = model.attrib["modeltype"]
    return model_classes.index(modeltype)


#pp.pprint(model_classes)
#print(len(model_classes))

In [5]:
xml_models = root[0]
model_data = []

# Gather the distinct labels in sets while scanning every model once.
node_class_names = set()
edge_class_names = set()

for m in xml_models:
    # Only direct INSTANCE / CONNECTOR children of a model are considered.
    for instance in m.findall("INSTANCE"):
        node_class_names.add(instance.attrib["class"])

    for connector in m.findall("CONNECTOR"):
        # The edge label is the text of the ATTRIBUTE child named "Type".
        edge_type = next(filter(lambda attr: attr.get("name") == "Type", connector.findall("ATTRIBUTE"))).text
        if edge_type is None:
            # An empty Type attribute is represented by the literal label "none".
            edge_type = "none"
        edge_class_names.add(edge_type.lower())

# Sort the vocabularies so that the integer id of every class is reproducible.
# The order of list(set(...)) depends on string hash randomisation and would
# therefore differ between kernel restarts, silently changing features/labels.
node_classes = sorted(node_class_names)
edge_classes = sorted(edge_class_names)

num_node_classes = len(node_classes)
num_edge_classes = len(edge_classes)


#pp.pprint(node_classes)
#pp.pprint(num_node_classes)
#pp.pprint(edge_classes)

In [6]:
class EnterpriseModelDatasetNCPerModeltype(Dataset):
    """Node-classification dataset with one graph per XML model element.

    Each child of the notebook-level XML ``root[0]`` is converted into one
    ``Data`` object: its ``INSTANCE`` children become nodes and its
    ``CONNECTOR`` children become directed edges. The node feature and the
    node label are both the index of the instance's ``class`` attribute in
    ``node_classes``; the edge feature is the index of the connector type in
    ``edge_classes``.

    The class relies on names defined in earlier notebook cells: ``root``,
    ``xml_models``, ``node_classes``, ``edge_classes``, ``num_node_classes``
    and ``get_model_class``.
    """

    # Probability with which a node is placed in the training mask; the
    # remaining nodes form the test mask.
    TRAIN_FRACTION = 0.75

    def __init__(self, root, filename, test=False, transform=None, pre_transform=None):
        """Store the configuration and let the base class trigger processing.

        Args:
            root: Dataset directory handed to the base ``Dataset``.
            filename: Value reported by ``raw_file_names``.
            test: If true, processed files use the ``data_test_`` prefix
                instead of ``data_``.
            transform: Forwarded to the base ``Dataset``.
            pre_transform: Forwarded to the base ``Dataset``.
        """
        self.test = test
        self.filename = filename
        # Kept in a private attribute and exposed through the ``num_classes``
        # property below: assigning ``self.num_classes`` directly fails on
        # torch_geometric releases where the base class defines ``num_classes``
        # as a read-only property.
        self._num_classes = num_node_classes
        super().__init__(root, transform, pre_transform)

    @property
    def num_classes(self):
        """Number of node classes captured when the dataset was constructed."""
        return self._num_classes

    @property
    def raw_file_names(self):
        """Return the file name passed to the constructor."""
        return self.filename

    @property
    def processed_file_names(self):
        """Return a placeholder name that is never written by ``process``.

        Because no file with this name is produced, the processed files are
        never reported as complete and ``process`` runs on every
        instantiation. This keeps the stored class indices in sync with the
        vocabularies built by the current kernel.
        """
        return "unimplemented.pt"

    def download(self):
        """Nothing to download; the XML is parsed by an earlier notebook cell."""
        pass

    def _processed_path(self, idx):
        """Return the path of the processed file for graph ``idx``."""
        prefix = 'data_test' if self.test else 'data'
        return os.path.join(self.processed_dir, f'{prefix}_{idx}.pt')

    @staticmethod
    def _connector_type(connector):
        """Return the lower-cased text of the connector's "Type" attribute.

        An attribute without text yields ``"none"``. ``StopIteration`` is
        raised when the connector has no ``ATTRIBUTE`` child named "Type".
        """
        type_attr = next(
            attr for attr in connector.findall("ATTRIBUTE")
            if attr.get("name") == "Type"
        )
        if type_attr.text is None:
            return "none"
        return type_attr.text.lower()

    def _build_graph(self, m):
        """Convert one XML model element into a ``Data`` object.

        Raises:
            ValueError: if a connector refers to an instance name that does
                not occur in the model, or a class/type is missing from the
                notebook-level vocabularies.
        """
        instances = [el for el in m if el.tag == "INSTANCE"]
        connectors = [el for el in m if el.tag == "CONNECTOR"]

        node_features = []
        labels = []
        node_index_by_name = {}
        for position, instance in enumerate(instances):
            class_index = node_classes.index(instance.attrib["class"])
            # setdefault keeps the first position for duplicated names, which
            # is what a list.index() lookup over the names would return.
            node_index_by_name.setdefault(instance.attrib["name"], position)
            node_features.append([class_index])
            labels.append(class_index)

        endpoints = []
        edge_features = []
        for connector in connectors:
            edge_type = self._connector_type(connector)
            source = connector.find("FROM").get("instance")
            target = connector.find("TO").get("instance")
            try:
                endpoints.append([node_index_by_name[source], node_index_by_name[target]])
            except KeyError as missing:
                raise ValueError(
                    f"connector references unknown instance {missing.args[0]!r}"
                ) from None
            edge_features.append([edge_classes.index(edge_type)])

        # The explicit reshapes are no-ops for non-empty inputs but keep the
        # expected ranks ([N, 1], [E, 1], [2, E]) when a model has no
        # instances or no connectors.
        x = torch.tensor(node_features, dtype=torch.float).reshape(-1, 1)
        edge_attr = torch.tensor(edge_features, dtype=torch.float).reshape(-1, 1)
        edge_index = torch.tensor(endpoints, dtype=torch.int64).reshape(-1, 2).t().contiguous()
        y = torch.tensor(labels, dtype=torch.int64)

        # Random, unseeded node split: each node is a training node with
        # probability TRAIN_FRACTION, otherwise a test node.
        train_mask = np.random.choice(
            [True, False],
            size=x.shape[0],
            p=[self.TRAIN_FRACTION, 1 - self.TRAIN_FRACTION],
        )
        # logical_not always yields a boolean array, also for zero nodes.
        test_mask = np.logical_not(train_mask)

        return Data(
            x=x,
            edge_index=edge_index,
            edge_attr=edge_attr,
            y=y,
            train_mask=torch.tensor(train_mask),
            test_mask=torch.tensor(test_mask),
            modeltype=get_model_class(m),
        )

    def process(self):
        """Build one graph per model element and save it to the processed directory."""
        # ``root`` here is the notebook-level XML root element, not the
        # dataset directory passed to ``__init__``.
        xml_models = root[0]
        for index, m in enumerate(xml_models):
            torch.save(self._build_graph(m), self._processed_path(index))

    def len(self):
        """Return the number of model elements in the notebook-level ``xml_models``."""
        return len(xml_models)

    def get(self, idx):
        """Load and return the processed graph with index ``idx``."""
        return torch.load(self._processed_path(idx))

In [7]:
# Instantiate the per-model dataset; the processed graphs are stored below
# ./data/nc_data_per_model.
dataset_nc_per_modeltype = EnterpriseModelDatasetNCPerModeltype(
    root="./data/nc_data_per_model",
    filename="./raw/all2.xml",
)

Processing...
Done!


In [8]:
# Group the graphs by model type: "data" collects the graphs of each type and
# "info" counts how many graphs each type has.
per_model = {
    "info": {modeltype_name: 0 for modeltype_name in model_classes},
    "data": {modeltype_name: [] for modeltype_name in model_classes},
}
for data in dataset_nc_per_modeltype:
    # data.modeltype is an index into model_classes.
    modeltype_name = model_classes[data.modeltype]
    per_model["data"][modeltype_name].append(data)
    per_model["info"][modeltype_name] += 1

In [10]:
# Tell the reader that loading is finished and that `per_model` is the variable
# to use, then print the dataset's repr as a quick sanity check.
print("DATASET LOADED: USE per_model variable !!!")
print(dataset_nc_per_modeltype)

DATASET LOADED: USE per_model variable !!!
EnterpriseModelDatasetNCPerModeltype(110)
