# Dataset generation: ProGraML x Devmap <a class='tocSkip'>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Configs" data-toc-modified-id="Configs-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Configs</a></span></li><li><span><a href="#Dev:-Helper-Functions" data-toc-modified-id="Dev:-Helper-Functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Dev: Helper Functions</a></span><ul class="toc-item"><li><span><a href="#Dev:-nx2data-(more-flexible!)" data-toc-modified-id="Dev:-nx2data-(more-flexible!)-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Dev: nx2data (more flexible!)</a></span></li></ul></li><li><span><a href="#Main" data-toc-modified-id="Main-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Main</a></span><ul class="toc-item"><li><span><a href="#needs-action:" data-toc-modified-id="needs-action:-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span><em>needs action:</em></a></span></li><li><span><a href="#Process-devmap-.ll-files-(from-NCC-release)" data-toc-modified-id="Process-devmap-.ll-files-(from-NCC-release)-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Process devmap .ll files (from NCC release)</a></span></li><li><span><a href="#fetch-labels-etc.-from-csv" data-toc-modified-id="fetch-labels-etc.-from-csv-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>fetch labels etc. from csv</a></span></li></ul></li></ul></div>

# Configs

In [1]:
# Widen the notebook container so long lines and wide tables fit the window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
import warnings


def find_repo_root(start, marker='ProGraML'):
    """Return ``start`` cut off right after the last occurrence of ``marker``.

    Args:
        start (str): Path to search, normally the current working directory.
        marker (str): Name of the repository directory to look for.

    Returns:
        str: Everything in ``start`` up to and including the last ``marker``.
        If ``marker`` does not occur at all, ``marker`` is simply appended to
        ``start`` (the historical behaviour) and a warning is issued, because
        the result then most likely does not point at the repository.
    """
    if marker not in start:
        warnings.warn(f"{marker!r} not found in {start!r}; the derived repository root is probably wrong")
    return start.rsplit(marker, maxsplit=1)[0] + marker


# get root repository path (os.getcwd() instead of shelling out to `pwd`)
REPO_ROOT = find_repo_root(os.getcwd())
print(REPO_ROOT)
#insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, REPO_ROOT)

/home/zacharias/ProGraML


In [3]:
from pathlib import Path
import pickle

import numpy as np
from matplotlib import pyplot as plt
import networkx as nx
import tqdm
import torch
from torch_geometric.data import Data, DataLoader, InMemoryDataset
import torch_geometric

In [4]:
#from google.protobuf import text_format
#from deeplearning.ml4pl.graphs import programl
#from deeplearning.ml4pl.graphs.labelled import graph_tuple
#from labm8.py import app

In [5]:
from deeplearning.ml4pl.graphs.unlabelled.llvm2graph import graph_builder

# One shared graph builder for the whole notebook; process_single_folder()
# calls builder.Build(...) on the text of every .ll file it converts.
builder = graph_builder.ProGraMLGraphBuilder()

# Dev: Helper Functions
```
unreachable,232
ret void,263
!UNK,8564
!IDENTIFIER, 8565
magic/root/ I forgot, 8567
```

In [6]:
#nx_graph = builder.Build(ll)

## Dev: nx2data (more flexible!)

In [29]:
def nx2data(nx_graph, class_label=None):
    r"""Converts a :obj:`networkx.Graph` or :obj:`networkx.DiGraph` to a
    :class:`torch_geometric.data.Data` instance.

    Args:
        nx_graph (networkx.Graph or networkx.DiGraph): A networkx graph whose
            edges carry the attributes ``'flow'`` and ``'position'`` and whose
            nodes carry ``'type'`` and ``'x'`` (only ``x[0]`` is used).
            The node identifiers are copied into ``edge_index`` verbatim, so
            they have to be integers (presumably ``0..N-1`` in node order --
            TODO confirm against the graph builder).
        class_label (optional): If not ``None``, ``int(class_label)`` is
            stored as ``y`` with shape ``[1]``.

    Returns:
        torch_geometric.data.Data: ``x`` has shape ``[num_nodes, 2]`` with the
        columns ``(x[0], type)``, ``edge_index`` has shape ``[2, num_edges]``
        and ``edge_attr`` has shape ``[num_edges, 2]`` with the columns
        ``(flow, position)``. ``y`` is only set when a label was given.
    """

    # make sure the nx_graph is encoded properly (since node.x used to be buggy!)
    # encoder = GraphNodeEncoder()
    # encoder.EncodeNodes(nx_graph)

    # collect edge_index and edge_attr in one pass, so that both are
    # guaranteed to follow the same edge order
    endpoints = []
    flows = []
    positions = []
    for src, dst, edge_data in nx_graph.edges(data=True):
        endpoints.append((src, dst))
        flows.append(edge_data['flow'])
        positions.append(edge_data['position'])

    if endpoints:
        edge_index = torch.tensor(endpoints).t().contiguous()
    else:
        # torch.tensor([]) would be 1-dimensional, which breaks the shape
        # check below; an edgeless graph needs an explicit [2, 0] index.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # one row per edge: (flow, position)
    edge_attr = torch.stack([torch.tensor(flows), torch.tensor(positions)], dim=1)

    # collect x
    xs = []
    types = []
    for _, node_data in nx_graph.nodes(data=True):
        xs.append(node_data['x'][0])
        types.append(node_data['type'])

    # one row per node: (x[0], type)
    x = torch.stack([torch.tensor(xs), torch.tensor(types)], dim=1)

    assert edge_attr.size()[0] == edge_index.size()[1], f'edge_attr={edge_attr.size()} size mismatch with edge_index={edge_index.size()}'

    if class_label is None:
        return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

    y = torch.tensor(int(class_label)).view(1)  # <1>
    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

In [30]:
from multiprocessing import Pool
import tqdm
import os

def dump(outfile, data, mkdir=True):
    """Pickle ``data`` to ``outfile``.

    Args:
        outfile (str or os.PathLike): Destination file. Plain strings are
            accepted as well as ``pathlib.Path`` objects.
        data: Any picklable object.
        mkdir (bool): If true, missing parent directories of ``outfile`` are
            created first.
    """
    # Normalise first: `.parent` below only exists on Path objects.
    outfile = Path(outfile)
    if mkdir:
        outfile.parent.mkdir(exist_ok=True, parents=True)
    with open(outfile, 'wb') as f:
        pickle.dump(data, f)

def load(file):
    """Read back the single object that was pickled to ``file``.

    Note that unpickling can execute arbitrary code, so this should only be
    pointed at files from a trusted source.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle)

In [49]:
def process_single_folder(folder, dump_nx=True, dump_data=True):
    """Convert every ``*.ll`` file in ``folder`` into pickled graph files.

    For an input ``<parent>/<name>/<stem>.ll`` the outputs are
    ``<parent>/<name>_programl/_nx/<stem>.nx.p`` (the networkx graph) and
    ``<parent>/<name>_programl/<stem>.data.p`` (the ``nx2data`` result).
    Inputs whose ``.data.p`` already exists are skipped; if only the ``.nx.p``
    exists, the graph is loaded from it instead of being rebuilt.

    Args:
        folder (pathlib.Path): Directory containing the ``.ll`` files.
        dump_nx (bool): Whether freshly built graphs are written to ``.nx.p``.
        dump_data (bool): Whether the ``.data.p`` files are written. This flag
            used to be ignored; the default keeps the previous behaviour.

    Returns:
        str: The paths of all files the graph builder failed on, one per
        line. Each of those files is renamed from ``.ll`` to ``.ll_``.
    """
    problems = ""
    out_dir = folder.parent / (folder.name + '_programl')

    print(f"=== Opening Folder {str(folder)} ===")

    #label = int(folder.name)
    # TODO: generate labels; until then the Data objects are written without `y`.
    label = None

    files = list(folder.glob('*.ll'))

    # iterate over all .ll files in folder and create whichever of the
    # .nx.p / .data.p outputs are still missing
    for i, file in enumerate(files):
        outfile_nx = out_dir / '_nx' / (file.stem + '.nx.p')
        outfile_data = out_dir / (file.stem + '.data.p')

        # skip entirely?
        if outfile_data.is_file():
            continue

        if outfile_nx.is_file():
            # start at step 2: the graph was already built on an earlier run
            if not dump_data:
                continue
            nx_graph = load(outfile_nx)
        else:
            # ~~~ step 1: .ll --> nx ~~~
            if i % 100 == 0:
                print(f"{folder.name} - [{i}/{len(files)}] Processing {str(file)} ...")

            with open(file, 'r') as f:
                bytecode = f.read()

            # Only the build itself is guarded: a failing dump() (e.g. a full
            # disk) must not mark a perfectly valid input as broken.
            # `Exception` rather than a bare except, so that Ctrl-C still
            # interrupts a long run.
            try:
                nx_graph = builder.Build(bytecode) # nx
            except Exception:
                print(f"***** FAILING ON {str(file)} ... renaming file to .ll_ ")
                problems += str(file)
                problems += '\n'
                file.rename(file.with_suffix('.ll_'))
                continue

            if dump_nx:
                dump(outfile_nx, nx_graph)

        # step 2: nx --> data
        if dump_data:
            data = nx2data(nx_graph, class_label=label)
            dump(outfile_data, data)

    return problems

In [40]:
# download dataset if needed (this is for classifyapp!)
import wget
import zipfile

def download_and_unzip(url, dataset_name, data_folder):
    """
    Download and unzip data set folder from url
    :param url: from which to download
    :param dataset_name: name of data set (for printing)
    :param data_folder: folder in which to put the downloaded data
    """
    print('Downloading', dataset_name, 'data set...')
    # exist_ok avoids the check-then-create race; this raises if data_folder
    # exists but is not a directory.
    os.makedirs(data_folder, exist_ok=True)
    data_zip = wget.download(url, out=data_folder)
    print('\tunzipping...')
    # the context manager closes the archive even if extraction fails
    with zipfile.ZipFile(data_zip, 'r') as zip_:
        zip_.extractall(data_folder)
    print('\tdone')

def download_classifyapp(dataset_path):
    """Download the classifyapp data into ``dataset_path/classifyapp_data``.

    Nothing is downloaded if that directory already exists.

    Args:
        dataset_path (str or os.PathLike): Parent directory of the data set.
    """
    # Path() accepts str, Path and any other os.PathLike, so no type check
    # is needed.
    dataset_path = Path(dataset_path) / 'classifyapp_data'

    if dataset_path.exists():
        print(f'skipped downloading to {str(dataset_path.absolute())}')
        return

    # Acquire data
    # NOTE(review): the directory's existence is the only "already downloaded"
    # check, so a download that fails half-way leaves the directory behind
    # and later calls will skip it -- delete the directory to retry.
    dataset_path.mkdir(parents=True)
    download_and_unzip('https://polybox.ethz.ch/index.php/s/JOBjrfmAjOeWCyl/download',
                       'classifyapp_data', str(dataset_path.absolute()))

# Main
## *needs action:*

In [41]:
!pwd

/home/zacharias/ProGraML/deeplearning/ml4pl/poj104


In [42]:
import os

# Set where to store the dataset and download automagically.
# The location can be changed without editing the notebook by setting the
# PROGRAML_DATASET_DIR environment variable; the default is the original path.
ds_basepath = Path(os.environ.get('PROGRAML_DATASET_DIR', '/mnt/data/llvm/master_thesis_datasets'))

# uncomment this line to save data "in place"
#ds_basepath = Path(REPO_ROOT) / 'deeplearning/ml4pl/poj104'

logs_basepath = ds_basepath / 'logs' / 'devmap_logs'

# Both calls are idempotent, so the cell can be re-run safely.
ds_basepath.mkdir(parents=True, exist_ok=True)
logs_basepath.mkdir(parents=True, exist_ok=True)

#download_classifyapp(ds_basepath)

In [43]:
# Work out what gets linked where: the dataset directory and the log
# directory are both linked into the poj104 folder of the repository.
poj104_dir = REPO_ROOT + '/deeplearning/ml4pl/poj104/'

data_source = str((ds_basepath / 'devmap_data').absolute())
data_target = poj104_dir

logs_source = str(logs_basepath.absolute())
logs_target = poj104_dir

for location in (data_source, data_target, logs_source, logs_target):
    print(location)

/mnt/data/llvm/master_thesis_datasets/devmap_data
/home/zacharias/ProGraML/deeplearning/ml4pl/poj104/
/mnt/data/llvm/master_thesis_datasets/logs/devmap_logs
/home/zacharias/ProGraML/deeplearning/ml4pl/poj104/


In [44]:
! ln -s {data_source} {data_target}
! ln -s {logs_source} {logs_target}
! ls -lah {str(REPO_ROOT + '/deeplearning/ml4pl/poj104')} | grep devmap

ln: failed to create symbolic link '/home/zacharias/ProGraML/deeplearning/ml4pl/poj104/devmap_data': File exists
ln: failed to create symbolic link '/home/zacharias/ProGraML/deeplearning/ml4pl/poj104/devmap_logs': File exists
lrwxrwxrwx  1 zacharias zacharias   49 Feb  1 19:30 devmap_data -> /mnt/data/llvm/master_thesis_datasets/devmap_data
lrwxrwxrwx  1 zacharias zacharias   54 Feb  1 19:31 devmap_logs -> /mnt/data/llvm/master_thesis_datasets/logs/devmap_logs


## Process devmap .ll files (from NCC release)

In [51]:
# Point at the directory holding the devmap kernel .ll files.
# From here on `data_source` is a Path rather than a str.
data_source = Path(data_source)
dataset_path = data_source / 'kernels_ir'
print(dataset_path.name, dataset_path, sep='\n')

kernels_ir
/mnt/data/llvm/master_thesis_datasets/devmap_data/kernels_ir


In [52]:
# The displayed return value lists the files that failed to convert, one per
# line; an empty string means there were no failures.
process_single_folder(dataset_path)

=== Opening Folder /mnt/data/llvm/master_thesis_datasets/devmap_data/kernels_ir ===


''

## fetch labels etc. from csv

In [53]:
import pandas as pd

# Selects which cgo17-<platform>.csv file is read
platform = 'amd'

# Read the per-kernel table for that platform
data_file = data_source / ('cgo17-' + platform + '.csv')
print('\n--- Read data from', data_file)
df = pd.read_csv(data_file)

In [58]:
# Show the table that was just read from the csv file
df

Unnamed: 0.1,Unnamed: 0,benchmark,dataset,comp,rational,mem,localmem,coalesced,atomic,transfer,wgsize,oracle,runtime_cpu,runtime_gpu,src,seq
0,0,amd-app-sdk-3.0-BinomialOption-binomial_options,default,98,8,13,11,2,0,2048,255,GPU,3.291073,1.443983,"__kernel void A(int a, const __global float4* ...","[129 129 129 ..., 0 127 0]"
1,1,amd-app-sdk-3.0-BitonicSort-bitonicSort,default,12,2,6,0,6,0,131072,256,CPU,0.121940,0.279521,"__kernel void A(__global uint* a, const uint b...","[129 129 129 ..., 0 127 0]"
2,2,amd-app-sdk-3.0-BlackScholes-blackScholes,default,220,4,3,0,0,0,3145728,256,GPU,6.076052,2.740855,"__kernel void A(const __global float4* a, int ...","[129 129 129 ..., 26 0 127]"
3,3,amd-app-sdk-3.0-FastWalshTransform-fastWalshTr...,default,8,0,4,0,4,0,4096,256,CPU,0.130003,0.668892,"__kernel void A(__global float* a, __const int...","[129 129 129 ..., 0 127 0]"
4,4,amd-app-sdk-3.0-FloydWarshall-floydWarshallPass,default,11,1,5,0,0,0,524288,256,CPU,0.283447,0.769755,"__kernel void A(__global uint* a, __global uin...","[129 129 129 ..., 127 0 127]"
5,5,amd-app-sdk-3.0-MatrixMultiplication-mmmKernel,default,163,1,12,0,0,0,49152,64,CPU,0.277644,1.120132,"__kernel void A(__global float4* a, __global f...","[ 1 9 1 ..., 0 127 0]"
6,6,amd-app-sdk-3.0-MatrixTranspose-matrixTranspose,default,42,0,16,8,2,0,32768,256,CPU,0.156639,1.009519,"__kernel void A(__global float4* a, __global f...","[129 129 129 ..., 0 127 0]"
7,7,amd-app-sdk-3.0-PrefixSum-group_prefixSum,default,36,10,12,8,2,0,4096,640,CPU,0.484234,0.847908,"__kernel void A(__global float* a, __global fl...","[129 129 129 ..., 0 127 0]"
8,8,amd-app-sdk-3.0-Reduction-reduce,default,6,3,7,4,2,0,8208,256,CPU,0.161025,0.887437,"__kernel void A(__global uint4* a, __global ui...","[129 129 129 ..., 0 127 0]"
9,9,amd-app-sdk-3.0-ScanLargeArrays-ScanLargeArrays,default,32,3,21,14,4,0,263168,128,CPU,0.336185,0.793181,"__kernel void A(__global float* a, __global fl...","[129 129 129 ..., 127 0 127]"


In [70]:
# Pull the columns used below out of the table as numpy arrays
#data_folder = os.path.join(data_folder, 'kernels_seq')
(input_files,          # benchmark names
 dataset,              # data set descriptions
 aux_transfer_size,
 aux_wg_size,
 oracle,
 runtime_cpu,
 runtime_gpu) = (
    df[column].values
    for column in ("benchmark", "dataset", "transfer", "wgsize",
                   "oracle", "runtime_cpu", "runtime_gpu")
)

num_files = len(input_files)
print('\n--- Preparing to read', num_files, 'input files')



--- Preparing to read 680 input files


str

In [62]:
import tqdm
import pickle

In [81]:
# Check which dtype torch.tensor() infers for a single runtime value
torch.tensor(runtime_cpu[0]).dtype

torch.float32

In [88]:
# Attach label, auxiliary inputs and runtimes from the csv table to every
# preprocessed graph. The annotated object is written back to the same
# .data.p file, i.e. the files on disk are updated in place.
data_list = []
rows = zip(input_files, dataset, oracle,
           aux_transfer_size, aux_wg_size, runtime_cpu, runtime_gpu)
for filename, dat, device, transfer_size, wg_size, cpu_time, gpu_time in tqdm.tqdm(rows, total=num_files):
    if filename.startswith("npb"):
        # concatenate data set size
        filename += '_' + str(dat)

    file = data_source / 'kernels_ir_programl' / (filename + '.data.p')
    if not file.exists():
        # a real exception instead of `assert False`, which is stripped
        # when Python runs with -O
        raise FileNotFoundError(f'input file not found: {str(file)}')

    # load preprocessed torch_geometric.data.Data
    data = load(file)

    # add data
    data['y'] = torch.tensor(1) if device == 'GPU' else torch.tensor(0)
    data['aux_in'] = torch.tensor([transfer_size, wg_size])
    data['runtime'] = torch.tensor([cpu_time, gpu_time])

    data_list.append(data)
    dump(file, data)

100%|██████████| 680/680 [00:00<00:00, 1493.66it/s]


In [90]:
# `data` is still bound to the last object handled by the loop above
data

Data(aux_in=[2], edge_attr=[46, 2], edge_index=[2, 46], runtime=[2], x=[33, 2], y=[])