# Dataset generation: ProGraML x Threadcoarsening <a class='tocSkip'>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Configs" data-toc-modified-id="Configs-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Configs</a></span></li><li><span><a href="#Dev:-Helper-Functions" data-toc-modified-id="Dev:-Helper-Functions-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Dev: Helper Functions</a></span><ul class="toc-item"><li><span><a href="#Dev:-nx2data" data-toc-modified-id="Dev:-nx2data-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Dev: nx2data</a></span></li></ul></li><li><span><a href="#Main" data-toc-modified-id="Main-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Main</a></span><ul class="toc-item"><li><span><a href="#needs-action:" data-toc-modified-id="needs-action:-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span><em>needs action:</em></a></span></li><li><span><a href="#Process-.ll-files-(from-NCC-release)" data-toc-modified-id="Process-.ll-files-(from-NCC-release)-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Process .ll files (from NCC release)</a></span></li><li><span><a href="#fetch-labels-etc.-from-csv" data-toc-modified-id="fetch-labels-etc.-from-csv-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>fetch labels etc. from csv</a></span></li><li><span><a href="#Dataset-requirements-for-training:" data-toc-modified-id="Dataset-requirements-for-training:-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>Dataset requirements for training:</a></span></li></ul></li><li><span><a href="#TBD:-Implementing-going-from-predictions-to-resultant-speedups" data-toc-modified-id="TBD:-Implementing-going-from-predictions-to-resultant-speedups-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>TBD: Implementing going from predictions to resultant speedups</a></span></li><li><span><a href="#TBD:-Implementing-KFold" data-toc-modified-id="TBD:-Implementing-KFold-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>TBD: Implementing KFold</a></span></li></ul></div>

# Configs

In [1]:
# Set editor width to something sane
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


In [2]:
# Enable the autoreload extension in mode 2 and render matplotlib figures inline.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
# get root repository path
# `!pwd` captures the shell output as a list of lines; a[0] is the working directory.
a = !pwd
# Cut the working directory at the last occurrence of 'ProGraML' and re-append it.
# NOTE: if the working directory does not contain 'ProGraML', rsplit() returns the
# whole path and 'ProGraML' is simply appended to it.
REPO_ROOT = a[0].rsplit('ProGraML', maxsplit=1,)[0] + 'ProGraML'
print(REPO_ROOT)
#insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, REPO_ROOT)

/home/zacharias/ProGraML


In [3]:
from pathlib import Path
import pickle

import numpy as np
from matplotlib import pyplot as plt
import networkx as nx
import tqdm
import torch
from torch_geometric.data import Data, DataLoader, InMemoryDataset
import torch_geometric

In [4]:
#from google.protobuf import text_format
#from deeplearning.ml4pl.graphs import programl
#from deeplearning.ml4pl.graphs.labelled import graph_tuple
#from labm8.py import app

In [5]:
from deeplearning.ml4pl.graphs.unlabelled.llvm2graph import graph_builder

# Module-level graph builder; process_single_folder() calls builder.Build()
# on the text content of each .ll file.
builder = graph_builder.ProGraMLGraphBuilder()

# Dev: Helper Functions
`
unreachable,232
ret void,263
!UNK,8564
!IDENTIFIER, 8565
magic/root/ I forgot, 8567
`

In [6]:
#nx_graph = builder.Build(ll)

## Dev: nx2data

In [7]:
def nx2data(nx_graph, class_label=None):
    r"""Converts a :obj:`networkx.Graph` or :obj:`networkx.DiGraph` to a
    :class:`torch_geometric.data.Data` instance.

    Args:
        nx_graph (networkx.Graph or networkx.DiGraph): A networkx graph whose
            edges carry the attributes ``'flow'`` and ``'position'`` and whose
            nodes carry the attributes ``'type'`` and ``'x'`` (only ``x[0]``
            is used).
        class_label (optional): If not ``None``, it is converted with
            :obj:`int` and stored as ``y`` with shape ``[1]``.

    Returns:
        torch_geometric.data.Data: ``edge_index`` has shape ``[2, E]``,
        ``edge_attr`` has shape ``[E, 2]`` (columns: flow, position) and
        ``x`` has shape ``[N, 2]`` (columns: x[0], type). ``y`` is only set
        when ``class_label`` is given.
    """

    # make sure the nx_graph is encoded properly (since node.x used to be buggy!)
    # encoder = GraphNodeEncoder()
    # encoder.EncodeNodes(nx_graph)

    # collect edge_index and edge_attr in a single pass so both are guaranteed
    # to follow the same edge order
    edges = []
    flows = []
    positions = []
    for source, target, edge_data in nx_graph.edges(data=True):
        edges.append((source, target))
        flows.append(edge_data['flow'])
        positions.append(edge_data['position'])

    # reshape(-1, 2) keeps the [2, E] layout even when the graph has no edges
    edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()

    # stack(dim=1) yields one (flow, position) row per edge; unlike
    # cat(...).view(2, -1) it also works for an empty edge list
    edge_attr = torch.stack([torch.tensor(flows), torch.tensor(positions)], dim=1).contiguous()

    # collect x: one (x[0], type) row per node, in node iteration order
    types = []
    xs = []
    for _, node_data in nx_graph.nodes(data=True):
        types.append(node_data['type'])
        xs.append(node_data['x'][0])

    x = torch.stack([torch.tensor(xs), torch.tensor(types)], dim=1).contiguous()

    assert edge_attr.size(0) == edge_index.size(1), f'edge_attr={edge_attr.size()} size mismatch with edge_index={edge_index.size()}'

    # only pass `y` when a label was given, so unlabelled graphs carry no `y`
    optional_fields = {}
    if class_label is not None:
        optional_fields['y'] = torch.tensor(int(class_label)).view(1)

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, **optional_fields)

In [8]:
from multiprocessing import Pool
import tqdm
import os

def dump(outfile, data, mkdir=True):
    """Pickle ``data`` into ``outfile``.

    Args:
        outfile (str or os.PathLike): Destination file.
        data: Any picklable object.
        mkdir (bool): If true, missing parent directories of ``outfile`` are
            created before writing.
    """
    # Normalise first so that plain string paths work with `.parent` below.
    outfile = Path(outfile)
    if mkdir:
        outfile.parent.mkdir(exist_ok=True, parents=True)
    with open(outfile, 'wb') as f:
        pickle.dump(data, f)

def load(file):
    """Open ``file`` in binary mode and return the object unpickled from it."""
    with open(file, 'rb') as handle:
        return pickle.load(handle)

In [9]:
def process_single_folder(folder, dump_nx=True, dump_data=True):
    """Convert every ``*.ll`` file in ``folder`` into pickled graph files.

    For ``<folder>/<name>.ll`` the outputs are written next to ``folder``:

    * ``<folder>_programl/_nx/<name>.nx.p``: the graph returned by
      ``builder.Build``
    * ``<folder>_programl/<name>.data.p``: the result of ``nx2data``

    Files whose ``.data.p`` already exists are skipped. If only the ``.nx.p``
    exists it is loaded instead of rebuilding the graph. A file for which
    building (or dumping) the graph fails is renamed to ``*.ll_`` so it is not
    picked up again, and processing continues with the next file.

    Args:
        folder (pathlib.Path): Directory containing the ``.ll`` files.
        dump_nx (bool): Write the intermediate ``.nx.p`` files.
        dump_data (bool): Run the nx --> data step and write the ``.data.p``
            files. (This flag used to be ignored.)

    Returns:
        str: The paths of all failing files, each followed by a newline;
        empty if nothing failed.
    """
    failed = []
    out_dir = folder.parent / (folder.name + '_programl')

    print(f"=== Opening Folder {str(folder)} ===")

    #label = int(folder.name)
    # TODO: generate labels!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    label = None

    files = list(folder.glob('*.ll'))

    # iterate over all .ll files in folder and confirm and respectively create the .nx.p and .data.p files
    for i, file in enumerate(files):
        stem = file.name.rsplit('.', 1)[0]
        outfile_nx = out_dir / '_nx' / (stem + '.nx.p')
        outfile_data = out_dir / (stem + '.data.p')

        # skip entirely?
        if outfile_data.is_file():
            continue

        if outfile_nx.is_file():
            # start at step 2: reuse the cached nx graph
            nx_graph = load(outfile_nx)
        else:
            # start in the beginning:
            # ~~~ step 1: .ll --> nx ~~~
            if i % 100 == 0:
                print(f"{folder.name} - [{i}/{len(files)}] Processing {str(file)} ...")

            with open(file, 'r') as f:
                bytecode = f.read()

            try:
                nx_graph = builder.Build(bytecode) # nx
                if dump_nx:
                    dump(outfile_nx, nx_graph)
            except Exception as error:
                # `Exception` (not a bare except) so that Ctrl-C / SystemExit
                # abort the run instead of renaming a perfectly fine input file.
                print(f"***** FAILING ON {str(file)} ... renaming file to .ll_ ")
                print(f"      reason: {error!r}")
                failed.append(str(file) + '\n')
                file.rename(file.with_suffix('.ll_'))
                continue

        # step 2: nx --> data
        if dump_data:
            data = nx2data(nx_graph, class_label=label)
            dump(outfile_data, data)

    return ''.join(failed)

In [19]:
# download dataset if needed (this is for classifyapp!)
import wget
import zipfile

def download_and_unzip(url, dataset_name, data_folder):
    """
    Download and unzip data set folder from url
    :param url: from which to download
    :param dataset_name: name of data set (for printing)
    :param data_folder: folder in which to put the downloaded data
        (created if missing; the archive is extracted into it as well)
    """
    print('Downloading', dataset_name, 'data set...')
    # exist_ok=True replaces the racy exists()-then-makedirs() pair; it still
    # raises if `data_folder` exists but is not a directory.
    os.makedirs(data_folder, exist_ok=True)
    data_zip = wget.download(url, out=data_folder)
    print('\tunzipping...')
    # The context manager closes the archive even when extraction fails.
    with zipfile.ZipFile(data_zip, 'r') as zip_:
        zip_.extractall(data_folder)
    print('\tdone')

def _download_dataset(dataset_path, folder_name, url):
    """Download and unpack ``url`` into ``dataset_path / folder_name`` unless that folder already exists."""
    # Path() accepts both strings and path objects, so no type check is needed.
    target = Path(dataset_path) / folder_name

    if target.exists():
        print(f'skipped downloading to {str(target.absolute())}')
        return

    # Acquire data
    target.mkdir(parents=True)
    download_and_unzip(url, folder_name, str(target.absolute()))


def download_classifyapp(dataset_path):
    """Fetch the classifyapp data set into ``<dataset_path>/classifyapp_data``.

    :param dataset_path: base folder (str or path object)

    Nothing is downloaded if the target folder already exists.
    """
    _download_dataset(
        dataset_path,
        'classifyapp_data',
        'https://polybox.ethz.ch/index.php/s/JOBjrfmAjOeWCyl/download',
    )


def download_threadcoarsening(dataset_path):
    """Fetch the threadcoarsening data set into ``<dataset_path>/threadcoarsening_data``.

    :param dataset_path: base folder (str or path object)

    Nothing is downloaded if the target folder already exists.
    """
    _download_dataset(
        dataset_path,
        'threadcoarsening_data',
        'https://polybox.ethz.ch/index.php/s/Dl8v8dKbuoWS3Ck/download',
    )

# Main
## *needs action:*

In [None]:
# Name of the dataset handled by this notebook. Nothing is downloaded here;
# the name is used further down to build the '<name>_logs' and '<name>_data' folder names.
dataset_name = 'threadcoarsening'

In [11]:
# Show the notebook's current working directory.
!pwd

/home/zacharias/ProGraML/deeplearning/ml4pl/poj104


In [20]:
# Set where to store the dataset and download automagically
# NOTE: machine-specific absolute path; adjust it (or use the in-place variant below) on other machines.
ds_basepath = Path('/mnt/data/llvm/master_thesis_datasets')

# uncomment this line to save data "in place"
# (REPO_ROOT is defined in the setup cell near the top of the notebook)
#ds_basepath = Path(REPO_ROOT) / 'deeplearning/ml4pl/poj104'

# logs live in <ds_basepath>/logs/<dataset_name>_logs
logs_basepath = ds_basepath / 'logs' / f'{dataset_name}_logs'

ds_basepath.mkdir(parents=True, exist_ok=True)
logs_basepath.mkdir(parents=True, exist_ok=True)

#download threadcoarsening
# (skipped by download_threadcoarsening() if <ds_basepath>/threadcoarsening_data already exists)
download_threadcoarsening(ds_basepath)

Downloading threadcoarsening_data data set...
	unzipping...
	done


In [15]:
# link those places into poj104 folder

# data_source / logs_source are the absolute dataset and log folders;
# data_target / logs_target are both the same poj104 directory inside the repository.
# The strings are interpolated into the `ln -s` shell commands of the next cell.
data_source = str((ds_basepath / f'{dataset_name}_data').absolute())
print(data_source)
data_target = REPO_ROOT + '/deeplearning/ml4pl/poj104/'
print(data_target)

logs_source = str(logs_basepath.absolute())
print(logs_source)
logs_target = REPO_ROOT + '/deeplearning/ml4pl/poj104/'
print(logs_target)

/mnt/data/llvm/master_thesis_datasets/threadcoarsening_data
/home/zacharias/ProGraML/deeplearning/ml4pl/poj104/
/mnt/data/llvm/master_thesis_datasets/logs/threadcoarsening_logs
/home/zacharias/ProGraML/deeplearning/ml4pl/poj104/


In [17]:
! ln -s {data_source} {data_target}
! ln -s {logs_source} {logs_target}
! ls -lah {str(REPO_ROOT + '/deeplearning/ml4pl/poj104')} | grep {dataset_name}

ln: failed to create symbolic link '/home/zacharias/ProGraML/deeplearning/ml4pl/poj104/threadcoarsening_data': File exists
ln: failed to create symbolic link '/home/zacharias/ProGraML/deeplearning/ml4pl/poj104/threadcoarsening_logs': File exists
lrwxrwxrwx  1 zacharias zacharias   59 Feb  2 12:12 threadcoarsening_data -> /mnt/data/llvm/master_thesis_datasets/threadcoarsening_data
lrwxrwxrwx  1 zacharias zacharias   64 Feb  2 12:12 threadcoarsening_logs -> /mnt/data/llvm/master_thesis_datasets/logs/threadcoarsening_logs


## Process .ll files (from NCC release)

In [21]:
# From here on data_source is a Path (it was a str in the linking cells above).
data_source = Path(data_source)
# folder that is passed to process_single_folder() in the next cell
dataset_path = data_source / 'kernels_ir'
print(dataset_path.name)
print(dataset_path)

kernels_ir
/mnt/data/llvm/master_thesis_datasets/threadcoarsening_data/kernels_ir


In [22]:
# Create the .nx.p / .data.p files for the .ll files in kernels_ir. The returned
# string lists the files that failed and is empty if none did.
process_single_folder(dataset_path)

=== Opening Folder /mnt/data/llvm/master_thesis_datasets/threadcoarsening_data/kernels_ir ===
kernels_ir - [0/17] Processing /mnt/data/llvm/master_thesis_datasets/threadcoarsening_data/kernels_ir/sgemm.ll ...


''

## fetch labels etc. from csv

In [26]:
import pandas as pd

## Dataset requirements for training:

* Leave-one-out cross-validation per platform.
* Not all coarsening factors have recorded runtimes, so if the model predicts a cf that is "too high", we need to "clamp it down" to the highest cf for which a runtime is recorded in order to compute runtimes.

```

    # The runtimes of some coarsening factors are not recorded in the data table. If that is the case for
    # the predicted cf, clamp it down to the highest cf for which the runtime is recorded
    p = min(p, 2 ** (len(X_cc[test_index[0]]) - 1))
```

In [53]:
def platform2str(platform):
    """Return the device name belonging to a platform code name.

    Raises:
        LookupError: if ``platform`` is none of "Fermi", "Kepler", "Cypress"
            or "Tahiti".
    """
    device_names = (
        ("Fermi", "NVIDIA GTX 480"),
        ("Kepler", "NVIDIA Tesla K20c"),
        ("Cypress", "AMD Radeon HD 5900"),
        ("Tahiti", "AMD Tahiti 7970"),
    )
    for code_name, device_name in device_names:
        if platform == code_name:
            return device_name
    raise LookupError

In [122]:
def get_all_runtimes(platform, df, oracles, cfs=(1, 2, 4, 8, 16, 32)):
    """Collect the runtimes of every kernel for each coarsening factor.

    Args:
        platform (str): Platform name; runtimes are read from the column
            ``runtime_<platform>`` of ``df``.
        df (pandas.DataFrame): Runtime table with the columns ``kernel``,
            ``cf`` and ``runtime_<platform>``.
        oracles (pandas.DataFrame): Table whose ``kernel`` column lists the
            kernels to collect, in output order.
        cfs (sequence of int): Coarsening factors to look up, in order.

    Returns:
        dict: Maps each kernel name to a list with one float per entry of
        ``cfs``. A cf without a row in ``df`` gets the runtime of the previous
        cf in ``cfs`` (a message is printed).

    Raises:
        IndexError: If a kernel has no row for the first cf, so there is no
            previous runtime to fall back to.
        RuntimeError: If ``df`` holds more than one row for a (kernel, cf) pair.
    """
    runtime_column = f'runtime_{platform}'
    all_runtimes = {}
    for kernel in oracles['kernel']:
        # filter by kernel once instead of once per coarsening factor
        kernel_rows = df[df['kernel'] == kernel]
        kernel_r = []
        for cf in cfs:
            row = kernel_rows[kernel_rows['cf'] == cf]
            if len(row) == 1:
                # .iloc[0] extracts the scalar; float() on a 1-element array is deprecated in NumPy
                kernel_r.append(float(row[runtime_column].iloc[0]))
            elif len(row) == 0:
                if not kernel_r:
                    raise IndexError(
                        f'kernel={kernel} is missing cf={cf} and has no earlier runtime to fall back to')
                print(f' kernel={kernel:>20} is missing cf={cf}. Ad-hoc inserting last existing cf!')
                kernel_r.append(kernel_r[-1])
            else:
                # a bare `raise` here would only produce "No active exception to reraise"
                raise RuntimeError(f'found {len(row)} rows for kernel={kernel}, cf={cf}; expected at most one')
        all_runtimes[kernel] = kernel_r
    return all_runtimes

In [123]:
# set values:
# platform name; selects the 'runtime_<platform>' and 'cf_<platform>' columns used in the cells below
platform = "Cypress"

In [124]:
# Read runtime info
# oracles: the cells below read its 'kernel', 'cf_<platform>' and 'runtime_<platform>' columns
oracle_file = data_source / "pact-2014-oracles.csv"
oracles = pd.read_csv(oracle_file)

# df: get_all_runtimes() reads its 'kernel', 'cf' and 'runtime_<platform>' columns
runtimes_file = data_source / "pact-2014-runtimes.csv"
df = pd.read_csv(runtimes_file)

In [125]:
# get nice runtimes dict
# get_all_runtimes() requires the runtimes table and the oracle table as
# arguments; calling it with `platform` alone raises a TypeError.
runtimes_dict = get_all_runtimes(platform, df, oracles)

 kernel=        binarySearch is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=        blackscholes is missing cf=16. Ad-hoc inserting last existing cf!
 kernel=        blackscholes is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=         convolution is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=           dwtHaar1D is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=           fastWalsh is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=                mriQ is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=              mvCoal is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=            mvUncoal is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=               nbody is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=              reduce is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=                spmv is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=        

In [127]:
# get oracle labels
# thread coarsening factors; defined here so that this cell does not depend on
# a cell further down. A label is the index of the oracle cf within this list.
cfs = [1, 2, 4, 8, 16, 32]
y = np.array([cfs.index(int(x)) for x in oracles["cf_" + platform]], dtype=np.int64)

# sanity check: the oracle label must be the index of the fastest recorded runtime
# (runtimes_dict is assumed to follow the row order of `oracles`)
for i, (k, v) in enumerate(runtimes_dict.items()):
    assert int(y[i]) == np.argmin(v), f'oracle label {int(y[i])} != argmin of runtimes {int(np.argmin(v))} for kernel {k}'

In [128]:
# load graphs and add attributes
data_list = []

kernels = oracles["kernel"].values  # list of strings of kernel names

for kernel in kernels:
    file = data_source / 'kernels_ir_programl' / (kernel + '.data.p')
    assert file.exists(), f'input file not found: {file}'
    with open(file, 'rb') as f:
        data = pickle.load(f)
    # add attributes
    data['y'] = torch.tensor(np.argmin(runtimes_dict[kernel]), dtype=torch.long)
    data['runtimes'] = torch.tensor(runtimes_dict[kernel])
    print(data)
    # collect the graph; without this, data_list stays empty after the loop
    data_list.append(data)

Data(edge_attr=[131, 2], edge_index=[2, 131], runtimes=[6], x=[75, 2], y=[])
Data(edge_attr=[233, 2], edge_index=[2, 233], runtimes=[6], x=[137, 2], y=[])
Data(edge_attr=[376, 2], edge_index=[2, 376], runtimes=[6], x=[209, 2], y=[])
Data(edge_attr=[343, 2], edge_index=[2, 343], runtimes=[6], x=[188, 2], y=[])
Data(edge_attr=[70, 2], edge_index=[2, 70], runtimes=[6], x=[43, 2], y=[])
Data(edge_attr=[94, 2], edge_index=[2, 94], runtimes=[6], x=[58, 2], y=[])
Data(edge_attr=[225, 2], edge_index=[2, 225], runtimes=[6], x=[128, 2], y=[])
Data(edge_attr=[465, 2], edge_index=[2, 465], runtimes=[6], x=[242, 2], y=[])
Data(edge_attr=[465, 2], edge_index=[2, 465], runtimes=[6], x=[242, 2], y=[])
Data(edge_attr=[265, 2], edge_index=[2, 265], runtimes=[6], x=[148, 2], y=[])
Data(edge_attr=[286, 2], edge_index=[2, 286], runtimes=[6], x=[158, 2], y=[])
Data(edge_attr=[1104, 2], edge_index=[2, 1104], runtimes=[6], x=[528, 2], y=[])
Data(edge_attr=[183, 2], edge_index=[2, 183], runtimes=[6], x=[107, 2

--- Cross validation step [ 1 /  17 ]


In [105]:
# NOTE(review): this cell rebinds `platform` and `y`, which the cells above set
# up for "Cypress". Its execution count (105) is lower than that of the cells
# above, so it looks like a leftover from an earlier session. Confirm whether
# it should still run at this position.
platform = 'Fermi'

cfs = [1, 2, 4, 8, 16, 32]  # thread coarsening factors

device_list = ["Cypress", "Tahiti", "Fermi", "Kepler"]

# per-kernel oracle runtime and oracle label (index of the oracle cf in `cfs`) for `platform`
oracle_runtimes = np.array([float(x) for x in oracles["runtime_" + platform]])
#y = np.array([int(x) for x in oracles["cf_" + platform]], dtype=np.int64)
y = np.array([cfs.index(int(x)) for x in oracles["cf_" + platform]], dtype=np.int64)
#y_1hot = get_onehot(oracles, platform)




 kernel=        binarySearch is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=        blackscholes is missing cf=16. Ad-hoc inserting last existing cf!
 kernel=        blackscholes is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=         convolution is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=           dwtHaar1D is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=           fastWalsh is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=                mriQ is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=              mvCoal is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=            mvUncoal is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=               nbody is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=              reduce is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=                spmv is missing cf=32. Ad-hoc inserting last existing cf!
 kernel=        

# TBD: Implementing going from predictions to resultant speedups

# TBD: Implementing KFold

In [133]:
from sklearn.model_selection import KFold

# n_splits == number of samples, i.e. every fold holds out exactly one kernel
kf = KFold(n_splits=len(y), shuffle=False)
for j, (train_index, test_index) in enumerate(kf.split(y)):
    print('--- Cross validation step [', j+1, '/ ', len(y), ']')
    # TBD: only the first split is visited for now
    break

--- Cross validation step [ 1 /  17 ]
