In [1]:
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
import hashlib
import scipy
import torch
from tqdm import tqdm

In [2]:
# Split names; each one is used both as a sub-directory name when listing
# files and as a key in the per-split result dictionaries.
splits = ['train', 'valid', 'test']

In [3]:
# Result container: search key ('XD' / 'XR') -> split name -> list of
# config-feature indices. Every (key, split) pair gets its own empty list.
non_zero_inds = {
    search_key: {split: [] for split in splits}
    for search_key in ('XD', 'XR')
}

In [4]:
# Output directory for the JSON files written by this notebook; it is created
# (including missing parents) if it does not exist yet.
path_data = Path('./data/t10_datacut')
path_data.mkdir(parents=True, exist_ok=True)

# Dataset

In [5]:
# Root directory under which the data files are looked up as
# <type>/<source>/<search>/<split>.
path_ds = Path('./datasets/npz_all/npz/')

In [6]:
# Sub-directory selectors for the first scan: <type>/<source>/<search>.
ds_type, ds_source, ds_search = 'layout', 'xla', 'default'

In [7]:
# For every split, scan all files of the currently selected
# <type>/<source>/<search> directory and record, under the 'XD' key, which of
# the first `num_conf_feats` columns of 'node_config_feat' are NOT entirely
# below -0.1 in every file.
num_conf_feats = 18  # number of config-feature columns that are inspected

for split in splits:
    path_files = path_ds / ds_type / ds_source / ds_search / split
    files = os.listdir(path_files)
    num_files = len(files)

    # check_arr[i, j] becomes 1 when column j of file i is entirely below -0.1
    # and 0 otherwise. The -1 fill is only an initial value; every entry is
    # overwritten in the loop below.
    check_arr = - np.ones((num_files, num_conf_feats), dtype=np.int8)

    # Wrap the list itself (not the enumerate object) so tqdm knows the total
    # and can show real progress and an ETA.
    for ix, file in enumerate(tqdm(files)):
        # np.load on an .npz archive keeps the file open until it is closed;
        # the context manager releases the handle once the array is read.
        with np.load(path_files / file) as npz_data:
            cfeats = npz_data['node_config_feat']
        for ifeats in range(num_conf_feats):
            check_minus = np.all(cfeats[:,:,ifeats] < - 0.1)
            check_arr[ix, ifeats] = check_minus

    # Assign a fresh list instead of appending to the existing one, so that
    # re-running this cell does not duplicate indices.
    non_zero_inds['XD'][split] = [
        ifeats
        for ifeats in range(num_conf_feats)
        if not np.all(check_arr[:, ifeats] == 1)
    ]

    print(split, non_zero_inds['XD'][split])

61it [02:31,  2.48s/it]


train [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


7it [00:15,  2.19s/it]


valid [0, 1, 2, 3, 6, 7, 8, 9, 12, 13]


8it [00:02,  3.27it/s]

test [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13]





In [8]:
# Sub-directory selectors for the second scan: <type>/<source>/<search>.
ds_type, ds_source, ds_search = 'layout', 'xla', 'random'

In [9]:
# For every split, scan all files of the currently selected
# <type>/<source>/<search> directory and record, under the 'XR' key, which of
# the first `num_conf_feats` columns of 'node_config_feat' are NOT entirely
# below -0.1 in every file.
num_conf_feats = 18  # number of config-feature columns that are inspected

for split in splits:
    path_files = path_ds / ds_type / ds_source / ds_search / split
    files = os.listdir(path_files)
    num_files = len(files)

    # check_arr[i, j] becomes 1 when column j of file i is entirely below -0.1
    # and 0 otherwise. The -1 fill is only an initial value; every entry is
    # overwritten in the loop below.
    check_arr = - np.ones((num_files, num_conf_feats), dtype=np.int8)

    # Wrap the list itself (not the enumerate object) so tqdm knows the total
    # and can show real progress and an ETA.
    for ix, file in enumerate(tqdm(files)):
        # np.load on an .npz archive keeps the file open until it is closed;
        # the context manager releases the handle once the array is read.
        with np.load(path_files / file) as npz_data:
            cfeats = npz_data['node_config_feat']
        for ifeats in range(num_conf_feats):
            check_minus = np.all(cfeats[:,:,ifeats] < - 0.1)
            check_arr[ix, ifeats] = check_minus

    # Assign a fresh list instead of appending to the existing one, so that
    # re-running this cell does not duplicate indices.
    non_zero_inds['XR'][split] = [
        ifeats
        for ifeats in range(num_conf_feats)
        if not np.all(check_arr[:, ifeats] == 1)
    ]

    print(split, non_zero_inds['XR'][split])

69it [02:05,  1.82s/it]


train [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


7it [00:14,  2.01s/it]


valid [0, 1, 2, 3, 6, 7, 8, 9, 12, 13]


8it [00:02,  3.21it/s]

test [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13]





In [10]:
# Display the per-split index lists collected under the 'XD' and 'XR' keys.
non_zero_inds

{'XD': {'train': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
  'valid': [0, 1, 2, 3, 6, 7, 8, 9, 12, 13],
  'test': [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13]},
 'XR': {'train': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
  'valid': [0, 1, 2, 3, 6, 7, 8, 9, 12, 13],
  'test': [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13]}}

In [11]:
# Persist the collected index lists as pretty-printed JSON in the output
# directory.
out_file = path_data / 'non_zero_inds.X.conf.json'
with out_file.open('w') as fob:
    json.dump(non_zero_inds, fob, indent=4)

# Check

### V1 - problem: the config index lists differ between splits / the short variant should be used instead

In [24]:
# Take the 'XD' / 'train' index list as the reference list.
# Note: this binds the very same list object stored in non_zero_inds, not a copy.
xla_conf_inds = non_zero_inds['XD']['train']
xla_conf_inds

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

In [21]:
# Number of config-feature indices in the reference list.
len(xla_conf_inds)

14

In [26]:
# Compare every (search key, split) index list against the reference list and
# report, for each mismatch, the indices present on only one side.
ref_set = set(xla_conf_inds)
for s, per_split in non_zero_inds.items():
    for split, split_inds in per_split.items():
        split_set = set(split_inds)
        if ref_set != split_set:
            print(s, split, 'Problem')
            # Indices in the reference list that this split's list lacks.
            print('In common list:')
            print(ref_set - split_set)
            # Indices in this split's list that the reference list lacks.
            print('In specific list:')
            print(split_set - ref_set)
            continue
        print(s, split, 'OK')

XD train OK
XD valid Problem
In common list:
{10, 11, 4, 5}
In specific list:
set()
XD test Problem
In common list:
{11, 5}
In specific list:
set()
XR train OK
XR valid Problem
In common list:
{10, 11, 4, 5}
In specific list:
set()
XR test Problem
In common list:
{11, 5}
In specific list:
set()


In [28]:
# Persist the reference index list as pretty-printed JSON.
# The original dumped `nlp_conf_inds`, a name that is never defined in this
# notebook: on a fresh kernel that raises NameError, and with a stale kernel
# variable it would silently write the wrong data into xla_conf_inds.json.
with open(path_data / 'xla_conf_inds.json', 'w') as fob:
    json.dump(xla_conf_inds, fob, indent=4)