# Graphs generation

In [1]:
from sampo.generator.base import SimpleSynthetic

In [38]:
GRAPHS_TOP_BORDER = 200
GRAPHS_COUNT = 50

In [39]:
ss = SimpleSynthetic(256)

# Metrics calculation

In [40]:
from sampo.schemas.graph import WorkGraph


def metric_vertex_count(wg: WorkGraph) -> float:
    return wg.vertex_count

def metric_min_children(wg: WorkGraph) -> float:
    return min((len(node.children) for node in wg.nodes if node.children))

def metric_max_children(wg: WorkGraph) -> float:
    return max((len(node.children) for node in wg.nodes if node.children))

def metric_min_parents(wg: WorkGraph) -> float:
    return min((len(node.parents) for node in wg.nodes if node.parents))

def metric_max_parents(wg: WorkGraph) -> float:
    return max((len(node.parents) for node in wg.nodes if node.parents))

In [43]:
from sampo.schemas.time import Time
from sampo.scheduler.topological.base import TopologicalScheduler
from sampo.scheduler.heft.base import HEFTScheduler, HEFTBetweenScheduler

import pandas as pd

contractors = [ss.contractor(10)]
schedulers = [HEFTScheduler(), HEFTBetweenScheduler(), TopologicalScheduler()]

def argmin(array) -> int:
    res = 0
    res_v = int(Time.inf())
    for i, v in enumerate(array):
        if v < res_v:
            res_v = v
            res = i
    return res

def generate_graphs(labels_count: int, bin_size: int) -> list[list[int], int]:
    bins = [0 for _ in range(labels_count)]
    result = []

    while any((bin < bin_size for bin in bins)):
        wg = ss.work_graph(top_border=GRAPHS_TOP_BORDER)
        encoding = [
            metric_vertex_count(wg),
            metric_min_children(wg),
            metric_max_children(wg),
            metric_min_parents(wg),
            metric_max_parents(wg)
        ]
        schedulers_results = [int(scheduler.schedule(wg, contractors).execution_time) for scheduler in schedulers]
        generated_label = argmin(schedulers_results)

        if bins[generated_label] < bin_size:
            bins[generated_label] += 1
            result.append((encoding, generated_label))
            if bins[generated_label] % 10 == 0:
                print(f'{generated_label}: {bins[generated_label]}/{bin_size} processed')
    return result

dataset_raw = generate_graphs(len(schedulers), GRAPHS_COUNT)
dataset_raw

1: 10/150 processed
1: 20/150 processed
1: 30/150 processed
1: 40/150 processed
1: 50/150 processed
0: 10/150 processed
0: 20/150 processed
2: 10/150 processed
0: 30/150 processed
0: 40/150 processed
0: 50/150 processed
2: 20/150 processed
2: 30/150 processed
2: 40/150 processed
2: 50/150 processed


[([310, 1, 54, 1, 55], 1),
 ([248, 1, 70, 1, 48], 1),
 ([308, 1, 78, 1, 45], 1),
 ([268, 1, 44, 1, 60], 1),
 ([231, 1, 74, 1, 40], 1),
 ([255, 1, 45, 1, 55], 1),
 ([252, 1, 46, 1, 55], 1),
 ([246, 1, 70, 1, 53], 1),
 ([245, 1, 70, 1, 50], 0),
 ([297, 1, 80, 1, 55], 1),
 ([243, 1, 45, 1, 61], 1),
 ([258, 1, 64, 1, 50], 1),
 ([257, 1, 78, 1, 60], 0),
 ([249, 1, 44, 1, 58], 0),
 ([311, 1, 78, 1, 52], 1),
 ([233, 1, 76, 1, 57], 1),
 ([282, 1, 86, 1, 46], 1),
 ([247, 1, 41, 1, 57], 1),
 ([287, 1, 84, 1, 59], 1),
 ([230, 1, 40, 1, 50], 1),
 ([209, 1, 43, 1, 56], 1),
 ([300, 1, 98, 1, 61], 1),
 ([245, 1, 45, 1, 57], 1),
 ([273, 1, 56, 1, 52], 1),
 ([244, 1, 40, 1, 52], 1),
 ([257, 1, 70, 1, 54], 1),
 ([299, 1, 66, 1, 60], 1),
 ([247, 1, 76, 1, 48], 1),
 ([240, 1, 74, 1, 50], 1),
 ([251, 1, 62, 1, 54], 1),
 ([252, 1, 56, 1, 50], 1),
 ([325, 1, 82, 1, 52], 1),
 ([321, 1, 64, 1, 55], 1),
 ([257, 1, 84, 1, 55], 0),
 ([247, 1, 45, 1, 54], 0),
 ([268, 1, 64, 1, 54], 1),
 ([243, 1, 80, 1, 48], 2),
 

In [53]:
import numpy as np

dataset_transposed = np.array(dataset_raw).T
df = pd.DataFrame.from_records(dataset_transposed[0])
df['label'] = dataset_transposed[1]
df

  dataset_transposed = np.array(dataset_raw).T


Unnamed: 0,0,1,2,3,4,label
0,310,1,54,1,55,1
1,248,1,70,1,48,1
2,308,1,78,1,45,1
3,268,1,44,1,60,1
4,231,1,74,1,40,1
...,...,...,...,...,...,...
145,300,1,72,1,51,2
146,254,1,60,1,60,2
147,241,1,44,1,49,2
148,284,1,78,1,58,2


In [54]:
# calculate the minimum uniform sampling size
dataset_size = min(df.groupby('label', group_keys=False).apply(lambda x: len(x)))
dataset_size

50

In [55]:
df = df.groupby('label', group_keys=False).apply(lambda x: x.sample(dataset_size))
df

Unnamed: 0,0,1,2,3,4,label
65,283,1,72,1,53,0
96,266,1,45,1,59,0
92,254,1,41,1,53,0
69,212,1,80,1,57,0
99,238,1,80,1,52,0
...,...,...,...,...,...,...
74,314,1,88,1,54,2
115,240,1,56,1,52,2
126,262,1,88,1,57,2
54,297,1,78,1,58,2


In [56]:
df.to_csv('dataset.csv')