In [1]:
# Load the autoreload extension; mode 2 picks up edits to imported project
# modules without restarting the kernel.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
# Insert at index 1: index 0 is the script path (or '' in a REPL).
# NOTE(review): machine-specific absolute path -- point this at your own
# ProGraML checkout before running.
sys.path.insert(1, '/Users/Zacharias/Uni/ETH/FS2019/THESIS/code/ProGraML')
# Show the working directory; relative paths in later cells resolve against it.
!pwd


/Users/Zacharias/Uni/ETH/FS2019/THESIS/code/ProGraML/deeplearning/ml4pl/poj104


In [2]:
from pathlib import Path
import torch
from torch_geometric.data import Data, DataLoader, InMemoryDataset
import numpy as np
import pickle
import tqdm

In [3]:
from google.protobuf import text_format
from deeplearning.ml4pl.graphs import programl
from deeplearning.ml4pl.graphs.labelled import graph_tuple
from labm8.py import app

#FLAGS = app.FLAGS


In [4]:
from deeplearning.ml4pl.graphs.unlabelled.llvm2graph import graph_builder

# Single shared builder instance. `builder.Build(...)` is called on the text of
# a `.ll` file by the ".ll example" cell and by `_process_single_folder` below.
builder = graph_builder.ProGraMLGraphBuilder()

# Graph Tuple Legend


#### ```adjacencies: np.array```

A list of adjacency lists, one for each flow type, where an entry in an
adjacency list is a <src,dst> tuple of node indices.

`Shape (edge_flow_count, edge_count, 2), dtype int32`


#### ```edge_positions: np.array```

A list of edge positions, one for each edge type. An edge position is an
integer in the range 0 <= x < edge_position_max.

`Shape (edge_flow_count, edge_count), dtype int32`



#### ```node_x: np.array```

A list of node feature arrays. Each row is a node, and each column is a
feature for that node.

`Shape (node_count, node_x_dimensionality), dtype int32`


#### ```node_y: Optional[np.array] = None```

(optional) A list of node labels arrays.

`Shape (node_count, node_y_dimensionality), dtype float32`


#### ```graph_x: Optional[np.array] = None```

(optional) A list of graph features arrays.

`Shape (graph_x_dimensionality) OR (graph_count, graph_x_dimensionality) if
graph_count > 1, dtype int32`



#### ```graph_y: Optional[np.array] = None```

(optional) A vector of graph labels arrays.

`Shape (graph_y_dimensionality) OR (graph_count, graph_y_dimensionality) if
graph_count > 1, dtype float32`


## Disjoint graph properties

#### ```disjoint_graph_count: int = 1```

The number of disconnected graphs in the tuple.


#### ```disjoint_nodes_list: np.array = None```

A list of integers which segment the nodes by graph. E.g. with a GraphTuple
of two distinct graphs, both with three nodes, disjoint_nodes_list will be
[0, 0, 0, 1, 1, 1].

`Shape (node_count), dtype int32`


In [5]:
# proto example
from google.protobuf import text_format
from deeplearning.ml4pl.graphs import programl_pb2

# Read the text-format serialisation of a single program graph.
with open('40.txt_9G8XzpcFlK.programl_proto', 'r') as f:
    proto_text = f.read()

# text_format.Parse fills the message passed in and returns that same message.
# The generated module keeps its own name (`programl_pb2`) instead of being
# shadowed by the file contents and then by the parsed message.
program_graph = text_format.Parse(proto_text, programl_pb2.ProgramGraph())

# Backward-compatible alias: after this cell `proto` used to refer to the
# parsed message, so keep that binding for any code that still reads it.
proto = program_graph

In [6]:
# from .ll example: read one `.ll` file as text and build a graph from it.
# The result (`nx_graph`) is the input to GraphTuple.CreateFromNetworkX in the
# next cell.
with open('71.ll', 'r') as f:
    ll = f.read()

nx_graph = builder.Build(ll)

In [7]:
# tuple example: convert the graph built in the previous cell into a GraphTuple.
graph_tup = graph_tuple.GraphTuple.CreateFromNetworkX(nx_graph)
# List every attribute whose name contains no double underscore.
print([a for a in dir(graph_tup) if '__' not in a])
print('\n')
# `edge_positions` holds one array per entry; show the shape of each.
print('edge_positions', [x.shape for x in graph_tup.edge_positions])

['CreateFromNetworkX', 'CreateFromProgramGraph', 'FromFile', 'FromGraphTuples', 'SetFeaturesAndLabels', 'ToFile', 'ToGraphTuples', 'ToNetworkx', '_asdict', '_field_defaults', '_field_types', '_fields', '_fields_defaults', '_make', '_replace', 'adjacencies', 'call_edge_count', 'control_edge_count', 'count', 'data_edge_count', 'disjoint_graph_count', 'disjoint_nodes_list', 'edge_count', 'edge_position_max', 'edge_positions', 'graph_x', 'graph_x_dimensionality', 'graph_y', 'graph_y_dimensionality', 'has_graph_x', 'has_graph_y', 'has_node_y', 'index', 'is_disjoint_graph', 'node_count', 'node_x', 'node_x_dimensionality', 'node_y', 'node_y_dimensionality']


edge_positions [(140,), (269,), (9,)]


In [8]:
# data example: step-by-step walk through the conversion that `tuple2data`
# performs, printing intermediate shapes along the way.
# One long tensor of <src, dst> pairs per adjacency list.
edge_indices = [torch.from_numpy(a).to(dtype=torch.long) for a in graph_tup.adjacencies] # list of <M_i, 2>
# For list index i: a 1-D array filled with i, one entry per edge in that list.
edge_types = [i * np.ones_like(edge_indices[i])[:,0] for i in range(len(edge_indices))]
# Concatenate the per-list type ids into one flat array.
edge_attr = np.hstack(edge_types)
print('edge_attr', edge_attr.shape)

# Column-vector view of the type ids; only its size is printed in this cell.
t_attr = torch.from_numpy(edge_attr).to(dtype=torch.long).view(-1,1)
print(t_attr.size())
print('edge_indices', [x.shape for x in edge_indices])
print('edge_types', [x.shape for x in edge_types])

#edge_pos = [torch.from_numpy(a).to(dtype=torch.long) for a in graph_tup.edge_positions]
print('graph_tup.edge_positions', [x.shape for x in graph_tup.edge_positions])
# Flatten the per-list edge positions the same way as the type ids.
edge_pos = np.hstack(graph_tup.edge_positions)

# NOTE: `edge_attr` is rebound here -- now two rows: type ids and positions.
edge_attr = np.vstack([edge_attr, edge_pos])

# Rebound again: transpose to one row per edge and convert to a long tensor.
edge_attr = torch.from_numpy(edge_attr.T).to(torch.long)
print('edge_attr', edge_attr.size())

# node_x as data.x
x = torch.from_numpy(graph_tup.node_x).to(torch.long) # node features as a long tensor
# Example label 42, wrapped as a 1-element long tensor.
y = torch.from_numpy(np.array(42)).to(torch.long).view(1)
print('x', x.size())
print('y', y.size())

edge_attr (418,)
torch.Size([418, 1])
edge_indices [torch.Size([140, 2]), torch.Size([269, 2]), torch.Size([9, 2])]
edge_types [(140,), (269,), (9,)]
graph_tup.edge_positions [(140,), (269,), (9,)]
edge_attr torch.Size([418, 2])
x torch.Size([250, 1])
y torch.Size([1])


In [9]:
def tuple2data(graph_tup, class_label):
    """Turn a graph tuple and a class label into a torch_geometric ``Data`` object.

    Args:
        graph_tup: Object exposing ``adjacencies`` (one ``<M_i, 2>`` array of
            ``<src, dst>`` pairs per edge type), ``edge_positions`` (one array
            per edge type) and ``node_x`` (node features).
        class_label: Class id of the graph; stored as the 1-element ``data.y``.

    Returns:
        ``Data`` with ``x`` (node features), ``edge_index`` of shape ``<2, M>``,
        ``edge_attr`` of shape ``<M, 2>`` holding ``(edge_type, edge_position)``
        per edge, and ``y`` of shape ``<1>``. All tensors are ``torch.long``.
    """
    # One <M_i, 2> long tensor per edge type (M_i = number of edges of type i).
    per_type_edges = [
        torch.from_numpy(adjacency).to(dtype=torch.long)
        for adjacency in graph_tup.adjacencies
    ]

    # data.edge_index: all edge types concatenated, then transposed to <2, M>.
    edge_index = torch.cat(per_type_edges, dim=0).t().contiguous()

    # data.edge_attr: column 0 is the edge type id, column 1 the edge position.
    type_ids = np.hstack([
        type_id * np.ones_like(edges)[:, 0]
        for type_id, edges in enumerate(per_type_edges)
    ])
    positions = np.hstack(graph_tup.edge_positions)
    stacked = np.vstack([type_ids, positions]).T
    edge_attr = torch.from_numpy(stacked).to(dtype=torch.long).contiguous()  # <M, 2>

    # Every edge must have exactly one attribute row.
    assert edge_attr.size()[0] == edge_index.size()[1], f'edge_attr={edge_attr.size()} size mismatch with edge_index={edge_index.size()}'

    # data.x: node features; data.y: the class label as a 1-element tensor.
    node_features = torch.from_numpy(graph_tup.node_x).to(torch.long)
    label = torch.from_numpy(np.array(class_label)).to(torch.long).view(1)

    return Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr, y=label)

In [10]:
# tuple2data example: convert the example tuple from above; 42 is just an
# example label value.
tuple2data(graph_tup, 42)

Data(edge_attr=[418, 2], edge_index=[2, 418], x=[250, 1], y=[1])

In [11]:
from multiprocessing import Pool
import tqdm
import os

def dump(outfile, data, mkdir=True):
    """Pickle ``data`` to ``outfile``.

    Args:
        outfile: Destination file. A ``pathlib.Path``, a plain ``str`` or any
            other ``os.PathLike`` is accepted.
        data: Any picklable object.
        mkdir: If true, create missing parent directories of ``outfile`` first.

    Raises:
        FileNotFoundError: If ``mkdir`` is false and the parent directory does
            not exist.
    """
    # Normalise first: a plain string has no ``.parent`` attribute, so the
    # mkdir branch would otherwise fail for str paths.
    outfile = Path(outfile)
    if mkdir:
        outfile.parent.mkdir(exist_ok=True, parents=True)
    with open(outfile, 'wb') as f:
        pickle.dump(data, f)

def load(file):
    """Read one pickled object from ``file`` and return it.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source (e.g. ones written by ``dump`` in this notebook).
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle)

In [12]:
def preprocess_ds_raw_dir(ds_base, dump_nx=True, dump_tuple=True, dump_data=True, num_processes=12):
    """Preprocess all .ll files in folders named by class ids e.g. ds_base/1, ds_base/2, ...
    into pytorch-geometric data.Data instances.

    The intermediate nx graphs and graph_tuples can be saved as well.

    Results are written to a sibling directory of ``ds_base`` named
    ``<ds_base.name>_programl``. Whatever each worker returns is appended to
    ``problems.txt`` inside that directory.

    Args:
        ds_base: ``pathlib.Path`` of the dataset directory holding the class folders.
        dump_nx: Forwarded to ``_process_single_folder``.
        dump_tuple: Forwarded to ``_process_single_folder``.
        dump_data: Forwarded to ``_process_single_folder``.
        num_processes: Number of worker processes in the pool. ``None`` lets
            ``multiprocessing`` choose based on the CPU count.

    Raises:
        ValueError: If a subfolder of ``ds_base`` is not named with an integer.
    """
    out_base = ds_base.parent / (ds_base.name + '_programl')
    out_base.mkdir(parents=True, exist_ok=True)
    print(f"=== DATASET {ds_base}: preprocessing will be saved in {out_base}")

    # get all subfolders 1/ 2/ etc
    folders = [x for x in ds_base.glob('*') if x.is_dir()]

    # Validate folder names before opening problems.txt or starting workers,
    # so a bad name cannot leave an open file handle behind.
    for i, folder in enumerate(folders):
        try:
            int(folder.name)
        except ValueError:
            print(f"Folder {i} has to be named with integervalues, but is {folder.name}.")
            raise

    task_args = [(folder, dump_nx, dump_tuple, dump_data, out_base) for folder in folders]

    # Both resources are context-managed: the file is always closed, and the
    # pool is torn down even if a worker raises or the run is interrupted.
    with open(out_base / 'problems.txt', 'a') as problems, Pool(processes=num_processes) as pool:
        # multiprocessed loop over folders
        for probs in tqdm.tqdm(pool.imap(_process_single_folder, task_args), total=len(task_args)):
            print(probs, file=problems)

        # Normal completion: let the workers finish cleanly before the
        # context manager tears the pool down.
        pool.close()
        pool.join()

    print(f" * COMPLETED * === DATASET {ds_base}: preprocessing saved to {out_base}")
    
    

In [14]:
def _process_single_folder(args):
    folder, dump_nx, dump_tuple, dump_data, out_base = args
    problems = ""
    
    print(f"=== Opening Folder {str(folder)} ===")
    
    label = int(folder.name)
    files = list(folder.glob('*.ll'))
    for i, file in enumerate(files):
        
        # ~~~ step 1: .ll --> nx ~~~
        
        outfile = out_base / '_nx' / folder.name / (file.name.rsplit('.', 1)[0] + '.nx.p')
        if outfile.is_file():
            nx_graph = load(outfile)
        else:
            if i % 100 == 0:
                print(f"{folder.name} - [{i}/{len(files)}] Processing {str(file)} ...")
            with open(file, 'r') as f:
                bytecode = f.read()
            try:
                nx_graph = builder.Build(bytecode) # nx
                if dump_nx:
                    dump(outfile, nx_graph)
            except:
                print(f"***** FAILING ON {str(file)} .................. ")
                problems += str(file)
                problems += '\n'
                continue


        # ~~~ step 2: nx --> tuple ~~~
        
        outfile = out_base / '_tuples' / folder.name / (file.name.rsplit('.', 1)[0] + '.tuple.p')
        if outfile.is_file():
            graph_tup = load(outfile)
        else:
            graph_tup = graph_tuple.GraphTuple.CreateFromNetworkX(nx_graph)
            if dump_tuple:
                dump(outfile, graph_tup)

        # step 3: tuple --> data
        outfile = out_base / folder.name / (file.name.rsplit('.', 1)[0] + '.data.p')
        if outfile.is_file():
            continue
        data = tuple2data(graph_tup, class_label=label)
        dump(outfile, data)

    return problems

In [None]:
# Preprocess the `ir_val` dataset (presumably the validation split). Output goes
# to the sibling directory `classifyapp_data/ir_val_programl`.
ds_base = Path('classifyapp_data/ir_val')
print(ds_base.name)

preprocess_ds_raw_dir(ds_base)

0

In [None]:
#class POJ104(InMemoryDataset):
#    torch_geometric.data.InMemoryDataset

In [None]:
# Preprocess the `ir_test` dataset (presumably the test split). Output goes to
# the sibling directory `classifyapp_data/ir_test_programl`.
ds_base = Path('classifyapp_data/ir_test')
print(ds_base.name)

preprocess_ds_raw_dir(ds_base)

In [None]:
# Preprocess the `ir_train` dataset (presumably the training split). Output goes
# to the sibling directory `classifyapp_data/ir_train_programl`.
preprocess_ds_raw_dir(Path('classifyapp_data/ir_train'))