# Dataset generation: ProGraML x Branch Prediction <a class='tocSkip'>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Configs" data-toc-modified-id="Configs-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Configs</a></span></li><li><span><a href="#setup-and-download" data-toc-modified-id="setup-and-download-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>setup and download</a></span></li><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Preprocessing</a></span><ul class="toc-item"><li><span><a href="#nx2data" data-toc-modified-id="nx2data-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>nx2data</a></span></li><li><span><a href="#define-preprocessing-funcs" data-toc-modified-id="define-preprocessing-funcs-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>define preprocessing funcs</a></span></li><li><span><a href="#Execute-preprocessing-of-.ll" data-toc-modified-id="Execute-preprocessing-of-.ll-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>Execute preprocessing of .ll</a></span></li></ul></li><li><span><a href="#investigate" data-toc-modified-id="investigate-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>investigate</a></span></li></ul></div>

# Configs

In [1]:
# Widen the notebook container to 80% of the window so long code lines and
# wide outputs stay readable.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
# get root repository path
a = !pwd
REPO_ROOT = a[0].rsplit('ProGraML', maxsplit=1,)[0] + 'ProGraML'
print(REPO_ROOT)
#insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, REPO_ROOT)

/home/zacharias/ProGraML


In [3]:
from pathlib import Path
import pickle

import numpy as np
from matplotlib import pyplot as plt
import networkx as nx
import tqdm
import torch
from torch_geometric.data import Data, DataLoader, InMemoryDataset
import torch_geometric

# setup and download

In [7]:

# Name of the dataset; the data and log folder names below are derived from it.
dataset_name = 'branch_prediction'

# Root folder that holds the dataset folder and the logs folder.
ds_basepath = Path('/home/zacharias/llvm_datasets/')
ds_path = ds_basepath / f'{dataset_name}_data'
logs_basepath = ds_basepath / 'logs' / f'{dataset_name}_logs'

# Create the folders (with missing parents); already existing ones are left alone.
for directory in (ds_basepath, ds_path, logs_basepath):
    directory.mkdir(parents=True, exist_ok=True)

In [8]:
# Work out what gets linked where: the dataset and log folders are the link
# sources, the poj104 folder inside the repository receives both links.
poj104_dir = REPO_ROOT + '/deeplearning/ml4pl/poj104/'

data_source = str((ds_basepath / f'{dataset_name}_data').absolute())
data_target = poj104_dir
logs_source = str(logs_basepath.absolute())
logs_target = poj104_dir

# Show all four locations for a visual check before the links are created.
for location in (data_source, data_target, logs_source, logs_target):
    print(location)

/home/zacharias/llvm_datasets/branch_prediction_data
/home/zacharias/ProGraML/deeplearning/ml4pl/poj104/
/home/zacharias/llvm_datasets/logs/branch_prediction_logs
/home/zacharias/ProGraML/deeplearning/ml4pl/poj104/


In [9]:
# Symlink the dataset folder and the logs folder into the poj104 folder of the
# repository, then list the entries there whose name contains the dataset name.
# NOTE(review): `ln -s` is used without `-f`, so re-running this cell presumably
# reports "File exists" for links that are already in place -- confirm that this
# is acceptable. The interpolated paths are also unquoted (breaks on spaces).
! ln -s {data_source} {data_target}
! ln -s {logs_source} {logs_target}
! ls -lah {str(REPO_ROOT + '/deeplearning/ml4pl/poj104')} | grep {dataset_name}

lrwxrwxrwx  1 zacharias zacharias   63 Feb 13 15:41 new_unsupervised_ncc_data -> /mnt/data/llvm/master_thesis_datasets/new_unsupervised_ncc_data
lrwxrwxrwx  1 zacharias zacharias   68 Feb 13 15:41 new_unsupervised_ncc_logs -> /mnt/data/llvm/master_thesis_datasets/logs/new_unsupervised_ncc_logs


# Preprocessing

## nx2data

In [14]:
def nx2data(nx_graph, class_label=None):
    r"""Convert a networkx graph into a :class:`torch_geometric.data.Data` instance.

    Args:
        nx_graph (networkx.Graph or networkx.DiGraph): Graph whose edges carry
            the attributes ``'flow'`` and ``'position'`` and whose nodes carry
            the attributes ``'type'`` and ``'x'`` (only ``x[0]`` is used).
        class_label (optional): Graph-level label. When given, it is converted
            with ``int()`` and stored as ``data.y`` with shape ``[1]``.

    Returns:
        torch_geometric.data.Data: ``x`` holds one row ``[x[0], type]`` per
        node, ``edge_index`` has shape ``[2, num_edges]`` and ``edge_attr``
        holds one row ``[flow, position]`` per edge.

    Raises:
        KeyError: If a node or an edge lacks one of the attributes named above.
    """
    # NOTE(review): edge_index stores the node identifiers as reported by
    # networkx, while the rows of ``x`` follow node iteration order. This
    # presumably relies on nodes being labelled 0..N-1 in that order -- confirm.

    # Collect endpoints and attributes in one pass so they are aligned by construction.
    endpoints, flows, positions = [], [], []
    for source, target, edge_data in nx_graph.edges(data=True):
        endpoints.append((source, target))
        positions.append(edge_data['position'])
        flows.append(edge_data['flow'])

    # reshape(-1, 2) keeps edge_index two-dimensional ([2, 0]) for graphs
    # without edges; the former 1-D empty tensor broke the size check below.
    edge_index = torch.tensor(endpoints, dtype=torch.long).reshape(-1, 2).t().contiguous()

    # stack(dim=1) yields the same [num_edges, 2] layout as the former
    # cat(...).view(2, -1).t(), but also works for zero edges.
    edge_attr = torch.stack([torch.tensor(flows), torch.tensor(positions)], dim=1)

    # Node features: first component of 'x' followed by the node type.
    xs, types = [], []
    for _, node_data in nx_graph.nodes(data=True):
        types.append(node_data['type'])
        xs.append(node_data['x'][0])

    x = torch.stack([torch.tensor(xs), torch.tensor(types)], dim=1)

    assert edge_attr.size()[0] == edge_index.size()[1], f'edge_attr={edge_attr.size()} size mismatch with edge_index={edge_index.size()}'

    if class_label is not None:
        y = torch.tensor(int(class_label)).view(1)
        data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    else:
        data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

    # Kept from the original: echoes every converted graph to stdout.
    print(data)
    return data

## define preprocessing funcs

In [11]:
from multiprocessing import Pool
import tqdm
import os

import traceback
import logging

def dump(outfile, data, mkdir=True):
    """Pickle ``data`` into the file ``outfile``.

    Args:
        outfile (str or pathlib.Path): Destination file; overwritten if present.
        data: Any picklable object.
        mkdir (bool): When True, missing parent directories of ``outfile`` are
            created before writing.
    """
    # Accept plain string paths too (previously `.parent` failed on str).
    outfile = Path(outfile)
    if mkdir:
        outfile.parent.mkdir(exist_ok=True, parents=True)
    with open(outfile, 'wb') as f:
        pickle.dump(data, f)

def load(file):
    """Return the object stored in the pickle file ``file``.

    Args:
        file (str or pathlib.Path): Path of the pickle file to read.

    Note:
        Unpickling can execute arbitrary code; only use on trusted files.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle)

In [12]:
def _process_single_folder(args, rename_on_fail=False):
    """The new version will skip tuple creation completely."""
    folder, dump_nx, dump_data, out_base = args
    
    problems = ""
    num_problems = 0
    num_processed = 0
    
    files = list(folder.glob('*.ll'))
    
    print(f"=== Opening Folder {str(folder)} with {len(files)} ===")
    
    
    # iterate over all .ll files in folder and confirm and respectively create the .nx.p and .data.p files
    for i, file in enumerate(files):
        outfile_nx = out_base / '_nx' / folder.name / (file.name.rsplit('.', 1)[0] + '.nx.p')
        outfile_data = out_base / folder.name / (file.name.rsplit('.', 1)[0] + '.data.p')

        # find out where to start processing

        # skip entirely?
        if outfile_data.is_file():
            continue

        # start at step 2: nx --> data ?
        if outfile_nx.is_file():
            nx_graph = load(outfile_nx)            
            data = nx2data(nx_graph, class_label=None)
            dump(outfile_data, data)
            continue
        
        # start in the beginning:
        # ~~~ step 1: .ll --> nx ~~~
        #if i % 100 == 0:
        if i % 1 == 0:
            print(f"{folder.name} - [{i}/{len(files)}] Processing {str(file)} ...")
        
        with open(file, 'r') as f:
            bytecode = f.read()

        try:
            nx_graph = builder.Build(bytecode) # nx
            if dump_nx:
                dump(outfile_nx, nx_graph)
            num_processed += 1
        except Exception as e:
            num_problems += 1
            num_processed += 1
            
            if rename_on_fail:
                print(f"***** FAILING ON {str(file)} ... renaming file to .ll_ ")
                problems += str(file)
                problems += '\n'
                file.rename(file.with_suffix('.ll_'))
            else:
                print(f"***** FAILING ON {str(file)} ...")
                problems += str(file)
                problems += '\n'
            logging.error(traceback.format_exc())
            continue

        # step 2: nx --> data
        data = nx2data(nx_graph, class_label=None)
        dump(outfile_data, data)
    summary = f"### problems in {num_problems}/{num_processed} files in {str(folder)} ###\n"
    problems += summary
    print(summary)
    return problems

In [13]:
def preprocess_raw_dir(ds_base, dump_nx=True, dump_data=True, pool_size=12):
    """Preprocess all .ll files in (nested) subfolders of ``ds_base``.

    Every subfolder that directly contains at least one ``.ll`` entry is handed
    to ``_process_single_folder`` together with the output base
    ``{ds_base}_programl`` (a sibling of ``ds_base``). The problem reports
    returned per folder are appended to ``{ds_base}_programl/problems.txt``.
    Files located directly in ``ds_base`` itself are not considered.

    Args:
        ds_base (str or pathlib.Path): Root folder of the raw dataset.
        dump_nx (bool): Forwarded unchanged to ``_process_single_folder``.
        dump_data (bool): Forwarded unchanged to ``_process_single_folder``.
        pool_size (int): Number of worker processes. With ``1`` the folders are
            processed sequentially in the current process and no pool is created.

    Raises:
        AssertionError: If ``ds_base`` does not exist.
    """
    # Normalise the path type *before* using Path methods, so str input works
    # and the error message can be formatted for both str and Path.
    ds_base = Path(ds_base)
    assert ds_base.exists(), f"Folder {ds_base} does not exist."

    # create folder for outputs
    out_base = ds_base.parent / (ds_base.name + '_programl')
    out_base.mkdir(parents=True, exist_ok=True)
    print(f"=== DATASET {ds_base}: preprocessing will be saved in {out_base}")

    # Depth-first search for all subfolders that directly contain .ll entries.
    folders_raw = []
    to_explore = [p for p in ds_base.iterdir() if p.is_dir()]
    while to_explore:
        current = to_explore.pop()
        entries = list(current.iterdir())
        to_explore.extend(p for p in entries if p.is_dir())
        if any(entry.suffix == '.ll' for entry in entries):
            folders_raw.append(current)
    print(f"preprocessing {len(folders_raw)} subfolders...")

    task_args = [(folder, dump_nx, dump_data, out_base) for folder in folders_raw]

    # The context manager closes the report file even if a folder fails hard.
    with open(out_base / 'problems.txt', 'a') as problems:

        def record(probs):
            # don't print empty strings like '\n\n\n'
            if len(probs) > 15:
                print(probs, file=problems)

        if pool_size != 1:
            # The pool only exists in this branch, so it is also shut down here
            # (the former unconditional pool.close() failed for pool_size == 1).
            with Pool(processes=pool_size) as pool:
                results = pool.imap_unordered(_process_single_folder, task_args)
                for probs in tqdm.tqdm(results, total=len(task_args)):
                    record(probs)
                pool.close()
                pool.join()
        else:
            for args in tqdm.tqdm(task_args):
                record(_process_single_folder(args))

    print(f" * COMPLETED * === DATASET {ds_base}: preprocessing saved to {out_base}")

## Execute preprocessing of .ll

In [14]:
# Select the directory whose .ll files get preprocessed: currently the whole
# dataset folder. The commented-out suffix would restrict it to 'amd_app_sdk'.

process_dir = ds_path #/ 'amd_app_sdk'
print(f"Processing {process_dir}")

Processing /mnt/data/llvm/master_thesis_datasets/new_unsupervised_ncc_data


In [None]:
# Run the preprocessing over process_dir with 16 worker processes.
preprocess_raw_dir(process_dir, pool_size=16)

In [None]:
# comments:
### problems in 0/63 files in /mnt/data/llvm/master_thesis_datasets/unsupervised_ncc_data/amd_app_sdk/amd_ocl ###

In [None]:
# Point at a file carrying the '.ll_' suffix (presumably renamed by
# _process_single_folder(rename_on_fail=True) after a failed build -- confirm)
# and check that it is present on disk.
file = Path('/mnt/data/llvm/master_thesis_datasets/unsupervised_ncc_data/eigen/eigen_matmul_3/eigen_matmul-266.ll_')
file.exists()

In [None]:
# Read the complete text of the selected file into `ll`.
with open(file, 'r') as f:
    ll = f.read()

In [None]:
# Build the graph for the text read above to reproduce the failure interactively.
# NOTE(review): `builder` is not defined in any of the visible cells; it has to
# be created before this cell (and _process_single_folder) can run.
builder.Build(ll)

# investigate

In [77]:
# single file

g = load('/home/zacharias/llvm_datasets/test_br_pr/bin/bt.A.ll.pickle')
#g = load('/home/zacharias/phd/deeplearning/ml4pl/poj104/test.')

In [78]:
# Survey the node attributes of `g`: remember every attribute name seen on any
# node and print the three profile weights of each node that carries more than
# five attributes.
keys = {}
weight_fields = (
    'llvm_profile_true_weight',
    'llvm_profile_false_weight',
    'llvm_profile_total_weight',
)

for _, attrs in g.nodes(data=True):
    for name in attrs:
        keys.setdefault(name, True)
    if len(attrs) > 5:
        print(*(attrs[field] for field in weight_fields))

print('\n\n')
print(keys)

2 1 3
2 1 3
19 8 27
2 201 203
2494 36290 38784
201 2 203
2 1 3
2 1 3
129 3 132
8193 129 8322
524289 8193 532482
3 129 132
129 8193 8322
8193 524289 532482
8193 129 8322
524289 8193 532482
524289 8193 532482
8193 129 8322
129 3 132
2147483648 0 2147483648
8193 129 8322
8193 129 8322
129 3 132
2147483648 0 2147483648
8193 129 8322
8193 129 8322
129 3 132
2147483648 0 2147483648
8193 129 8322
8193 129 8322
129 3 132
2147483648 0 2147483648
8193 129 8322
8193 129 8322
129 3 132
129 3 132
8193 129 8322
8193 129 8322
129 3 132
2147483648 0 2147483648
8193 129 8322
8193 129 8322
129 3 132
2317933 11589661 13907594
65 2 67
4097 65 4162
262145 4097 266242
2 65 67
65 4097 4162
4194304 2143289344 2147483648
63 3845 3908
246017 3845 249862
246017 3845 249862
1064949 2146418699 2147483648
3845 238329 242174
3845 222953 226798
3845 222953 226798
3845 63 3908
63 2 65
0 2147483648 2147483648
63 3845 3908
246017 3845 249862
246017 3845 249862
1064949 2146418699 2147483648
3845 238329 242174
3845 222953

In [75]:
# Display the graph-level attribute dictionary of the loaded graph.
g.graph

{'x': [],
 'y': [],
 'llvm_profile_num_functions': 10,
 'llvm_profile_max_function_count': 65556,
 'llvm_profile_num_counts': 48,
 'llvm_profile_total_count': 1121842441,
 'llvm_profile_max_count': 536870912,
 'llvm_profile_max_internal_count': 536870912}

In [37]:
# Same survey for a whole folder: load every '*ll.pickle' file below the dataset
# folder and print a node's attribute names whenever it introduces a new one.
pickle_root = Path('/home/zacharias/llvm_datasets/branch_prediction_data/')

keys = {}

for file in pickle_root.rglob('*ll.pickle'):
    g = load(file)
    for _, attrs in g.nodes(data=True):
        for name in attrs:
            if name in keys:
                continue
            keys[name] = True
            print(list(attrs.keys()))

print('\n\n')
print(keys)

['type', 'text', 'preprocessed_text', 'function', 'x', 'y']
['type', 'text', 'preprocessed_text', 'function', 'x', 'y']
['type', 'text', 'preprocessed_text', 'function', 'x', 'y']
['type', 'text', 'preprocessed_text', 'function', 'x', 'y']
['type', 'text', 'preprocessed_text', 'function', 'x', 'y']
['type', 'text', 'preprocessed_text', 'function', 'x', 'y']



{'type': True, 'text': True, 'preprocessed_text': True, 'function': True, 'x': True, 'y': True}


In [None]:
def nx2data(nx_graph, class_label=None):
    r"""Convert a networkx graph into a :class:`torch_geometric.data.Data` instance.

    NOTE(review): this cell re-defines ``nx2data``, which is already defined in
    the "nx2data" section of this notebook; whichever cell runs last wins.
    Consider deleting one of the two definitions.

    Args:
        nx_graph (networkx.Graph or networkx.DiGraph): Graph whose edges carry
            the attributes ``'flow'`` and ``'position'`` and whose nodes carry
            the attributes ``'type'`` and ``'x'`` (only ``x[0]`` is used).
        class_label (optional): Graph-level label. When given, it is converted
            with ``int()`` and stored as ``data.y`` with shape ``[1]``.

    Returns:
        torch_geometric.data.Data: ``x`` holds one row ``[x[0], type]`` per
        node, ``edge_index`` has shape ``[2, num_edges]`` and ``edge_attr``
        holds one row ``[flow, position]`` per edge.

    Raises:
        KeyError: If a node or an edge lacks one of the attributes named above.
    """
    # NOTE(review): edge_index stores the node identifiers as reported by
    # networkx, while the rows of ``x`` follow node iteration order. This
    # presumably relies on nodes being labelled 0..N-1 in that order -- confirm.

    # Collect endpoints and attributes in one pass so they are aligned by construction.
    endpoints, flows, positions = [], [], []
    for source, target, edge_data in nx_graph.edges(data=True):
        endpoints.append((source, target))
        positions.append(edge_data['position'])
        flows.append(edge_data['flow'])

    # reshape(-1, 2) keeps edge_index two-dimensional ([2, 0]) for graphs
    # without edges; the former 1-D empty tensor broke the size check below.
    edge_index = torch.tensor(endpoints, dtype=torch.long).reshape(-1, 2).t().contiguous()

    # stack(dim=1) yields the same [num_edges, 2] layout as the former
    # cat(...).view(2, -1).t(), but also works for zero edges.
    edge_attr = torch.stack([torch.tensor(flows), torch.tensor(positions)], dim=1)

    # Node features: first component of 'x' followed by the node type.
    xs, types = [], []
    for _, node_data in nx_graph.nodes(data=True):
        types.append(node_data['type'])
        xs.append(node_data['x'][0])

    x = torch.stack([torch.tensor(xs), torch.tensor(types)], dim=1)

    assert edge_attr.size()[0] == edge_index.size()[1], f'edge_attr={edge_attr.size()} size mismatch with edge_index={edge_index.size()}'

    if class_label is not None:
        y = torch.tensor(int(class_label)).view(1)
        data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    else:
        data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

    # Kept from the original: echoes every converted graph to stdout.
    print(data)
    return data