In [1]:
import os
from collections import defaultdict
from tqdm import tqdm
import threading
import multiprocessing
from multiprocessing import Pool
import pickle 

import numpy as np

In [2]:
# All three training artifacts live in the same dataset output directory.
_TRAINING_RESULT_DIR = '/proj/rcs-hdd/kpei/symmetry_source_dataset/training_result'

ADJ_PATH = f'{_TRAINING_RESULT_DIR}/adj_matrices'  # directory of per-sample `{i}.npz` adjacency files
TEXT_FILE = f'{_TRAINING_RESULT_DIR}/train.code'  # tokenized source, one sample per line
LABELS_FILE = f'{_TRAINING_RESULT_DIR}/train.tgt'  # labels, line-aligned with TEXT_FILE

In [3]:
# Materialize every line of the tokenized-code file (line terminators are kept).
with open(TEXT_FILE, 'r') as code_file:
    data = list(code_file)

In [4]:
# Materialize every line of the labels file (line terminators are kept).
with open(LABELS_FILE, 'r') as labels_file:
    labels = list(labels_file)

In [5]:
def get_line_inds(line):
    """Return the start index of every source line inside a token sequence.

    Index 0 is always included. Each literal two-character token ``'\\n'``
    (backslash followed by ``n``) marks the end of a line, so the position
    right after it starts the next one. The final token is never inspected,
    so a trailing line break does not open an empty extra line.
    """
    starts = [0]
    for position, token in enumerate(line[:-1]):
        if token == '\\n':
            starts.append(position + 1)
    return starts

In [6]:
# Two independent, initially empty accumulators.
code, permutations = [], []

In [40]:
def get_invariant_permutations(adj, max_permutations):
    """Enumerate topological orderings of a dependency graph, up to a cap.

    Args:
        adj: Square adjacency matrix given as a sequence of rows. An entry
            ``adj[u][v] == 1`` is treated as an edge ``u -> v``, i.e. node
            ``u`` must appear before node ``v``. The matrix is not modified.
        max_permutations: Stop as soon as this many orderings were collected.
            A value that is never reached (e.g. larger than the number of
            valid orderings) yields every ordering.

    Returns:
        A list of tuples of node indices, in the order a depth-first
        backtracking search (lowest node index first) discovers them.
        The list is empty when the graph has a cycle or when
        ``max_permutations`` is 0; an empty graph yields ``[()]``.
    """
    n_lines = len(adj)
    indegrees = [sum(col) for col in zip(*adj)]
    # Precompute outgoing edges once so each step touches only real successors
    # instead of rescanning a full matrix row.
    successors = [
        [dest for dest, is_edge in enumerate(row) if is_edge == 1]
        for row in adj
    ]
    placed = set()  # O(1) membership test for nodes already in `curr`
    curr = []
    invariant_permutations = []

    def _extend():
        if len(invariant_permutations) == max_permutations:
            return
        if len(curr) == n_lines:
            invariant_permutations.append(tuple(curr))
            return
        for node, ind in enumerate(indegrees):
            if ind != 0 or node in placed:
                continue
            # Choose `node` next and release the nodes that depended on it.
            placed.add(node)
            curr.append(node)
            for dest in successors[node]:
                indegrees[dest] -= 1
            _extend()
            # Backtrack: restore the state exactly as it was before the choice.
            for dest in successors[node]:
                indegrees[dest] += 1
            curr.pop()
            placed.discard(node)
            if len(invariant_permutations) == max_permutations:
                # Cap reached: unwinding further candidates cannot add results.
                return

    _extend()
    return invariant_permutations

In [18]:
def process(i, n_permutations):
    """Build the line-level view of sample ``i`` and its valid line orderings.

    Reads the adjacency matrix stored under key ``arr_0`` in
    ``ADJ_PATH/{i}.npz``, splits ``data[i]`` into source lines, collapses the
    matrix to one row/column per line (sampled at each line's first token),
    and enumerates dependency-preserving line orderings.

    Args:
        i: Index of the sample in ``data`` / ``labels`` and name of its
            ``.npz`` file.
        n_permutations: Maximum number of orderings to enumerate.

    Returns:
        ``(lines, permutations, label)`` where ``lines`` is a list of
        space-joined token strings, ``permutations`` is the result of
        ``get_invariant_permutations`` on the line-level matrix, and
        ``label`` is ``labels[i]`` without its trailing newline.
    """
    # Close the archive once the array is read; otherwise every call leaks an
    # open file handle in the worker process.
    with np.load(os.path.join(ADJ_PATH, f'{i}.npz')) as archive:
        adj_old = archive['arr_0']

    # The last space-separated field is dropped (presumably the line
    # terminator -- TODO confirm against the dataset format).
    tokens = data[i].split(' ')[:-1]

    # Strip only an actual trailing newline, so a final line without one does
    # not lose its last character.
    raw_label = labels[i]
    label = raw_label[:-1] if raw_label.endswith('\n') else raw_label

    # Turn the escaped two-character '\\n' tokens into real newlines.
    tokens = ['\n' if t == '\\n' else t for t in tokens]

    # Beginning token index of each source line; the final token is not
    # inspected, so a trailing newline does not open an empty line.
    line_inds = [0] + [pos + 1 for pos, token in enumerate(tokens[:-1]) if token == '\n']
    num_lines = len(line_inds)

    # Pair each line start with the next one (or the end of the sequence).
    bounds = line_inds + [len(tokens)]
    lines = [' '.join(tokens[start:end]) for start, end in zip(bounds, bounds[1:])]

    # Line-level adjacency: entry (r, c) is 1 iff the original matrix has a 1
    # between the first token of line r and the first token of line c.
    adj_new = [
        [1 if adj_old[src_tok][dst_tok] == 1 else 0 for dst_tok in line_inds]
        for src_tok in line_inds
    ]
    assert len(adj_new) == num_lines

    permutations = get_invariant_permutations(adj_new, n_permutations)
    return lines, permutations, label
    
    

In [9]:
N_THREADS = 64

# One independent, initially empty bucket per thread for each kind of output.
code_per_thread, permutations_per_thread, labels_per_thread = (
    [[] for _ in range(N_THREADS)] for _ in range(3)
)

# Samples per thread, rounded down.
num_per_thread = len(data) // N_THREADS


In [43]:
NUM_PROCESSES = 64  # worker processes in the pool
MAX_PERMUTATIONS = 4  # cap on orderings enumerated per sample

# One (sample index, permutation cap) argument tuple per line of `data`.
process_args = list(zip(range(len(data)), [MAX_PERMUTATIONS] * len(data)))

In [20]:
# Run `process` over every argument tuple in parallel; results keep input order.
with Pool(processes=NUM_PROCESSES) as worker_pool:
    processed_data = worker_pool.starmap(process, process_args)

In [50]:
def _as_record(sample):
    """Return `sample` as a dict with keys 'code', 'permutations', 'label'.

    Dicts are passed through untouched so that re-running this cell is a
    no-op; unpacking a dict as a 3-tuple would otherwise replace every value
    with its own key name.
    """
    if isinstance(sample, dict):
        return sample
    sample_lines, sample_permutations, sample_label = sample
    return {
        'code': sample_lines,
        'permutations': sample_permutations,
        'label': sample_label
    }


# Convert the (lines, permutations, label) tuples into keyed records.
processed_data = [_as_record(sample) for sample in processed_data]

In [6]:
# Directory the pickled datasets are written to and read back from.
STORAGE_PATH = '/proj/rcs-hdd/aj3051/symmetry'

In [52]:
# Persist the full (unfiltered) dataset as a single pickle file.
full_pickle_path = os.path.join(STORAGE_PATH, 'data.pickle')
serialized_data = pickle.dumps(processed_data)
with open(full_pickle_path, 'wb') as pickle_file:
    pickle_file.write(serialized_data)

In [47]:
def filter_processed_data(processed_data, max_permutations=None):
    """Keep only the samples that have exactly ``max_permutations`` orderings.

    Args:
        processed_data: Iterable of samples. Each sample is either a
            ``(code, permutations, label)`` tuple or a dict with the keys
            ``'code'``, ``'permutations'`` and ``'label'``. Both forms are
            accepted because ``processed_data`` holds tuples straight after
            the pool run and dicts after the conversion cell; unpacking a
            dict as a tuple would yield its key names and filter out
            everything.
        max_permutations: Required number of permutations per sample.
            Defaults to the notebook-level ``MAX_PERMUTATIONS``.

    Returns:
        A new list of dicts with keys ``'code'``, ``'permutations'`` and
        ``'label'`` for the samples that passed the filter.
    """
    if max_permutations is None:
        max_permutations = MAX_PERMUTATIONS

    filtered_data = []
    for sample in processed_data:
        if isinstance(sample, dict):
            line = sample['code']
            permutations = sample['permutations']
            label = sample['label']
        else:
            line, permutations, label = sample
        if len(permutations) == max_permutations:
            filtered_data.append({
                'code': line,
                'permutations': permutations,
                'label': label
            })
    return filtered_data

In [45]:
# Keep only the samples whose permutation count equals MAX_PERMUTATIONS.
filtered_data = filter_processed_data(processed_data)

In [None]:
# Write the filtered samples straight to disk. `filtered_data` stays bound to
# the list (not to its pickled bytes), so re-running this cell cannot write a
# double-pickled file, and no second in-memory copy of the dataset is made.
with open(os.path.join(STORAGE_PATH, 'data_four_permutations.pickle'), 'wb') as f:
    pickle.dump(filtered_data, f)

In [53]:
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [54]:
# Load the tokenizer published under the "deepseek-ai/deepseek-coder-1.3b-base" model id.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-1.3b-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
# Reload the filtered dataset from disk.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
filtered_pickle_path = os.path.join(STORAGE_PATH, 'data_four_permutations.pickle')
with open(filtered_pickle_path, 'rb') as pickle_file:
    filtered_data = pickle.load(pickle_file)

In [13]:
# Spot-check a single reloaded sample (index 6 is an arbitrary pick).
filtered_data[6]

(['void ( ) { \n',
  'Map < String , Object > vars = new Hash Map < String , Object > ( ) ; \n',
  'Map < String , Object > ctx = new Hash Map < String , Object > ( ) ; \n',
  'Map < String , Object > obj 1 = new Hash Map < String , Object > ( ) ; \n',
  'obj 1 . put ( "prop1" , "value1" ) ; \n',
  'ctx . put ( "obj1" , obj 1 ) ; \n',
  'vars . put ( "ctx" , ctx ) ; \n',
  'Executable Script executable = se . executable ( new Compiled Script ( Script Service . Script Type . INLINE , "testJavaScriptObjectMapInter" , "js" , se . compile ( "ctx.obj2_=_{};_ctx.obj2.prop2_=_\'value2\';_ctx.obj1.prop1_=_\'uvalue1\'" , Collections . empty Map ( ) ) ) , vars ) ; \n',
  'executable . run ( ) ; \n',
  'ctx = ( Map < String , Object > ) executable . unwrap ( vars . get ( "ctx" ) ) ; \n',
  'assert That ( ctx . contains Key ( "obj1" ) , equal To ( true ) ) ; \n',
  'assert That ( ( String ) ( ( Map < String , Object > ) ctx . get ( "obj1" ) ) . get ( "prop1" ) , equal To ( "uvalue1" ) ) ; \n',
  '