In [134]:
import pandas as pd 
import json 
import os
from tqdm import tqdm
import subprocess
import platform
import threading

# 1- Processing data 

Since the witnesses are stored in the CSV as stringified JSON, we parse each one and save it to its own `.json` file.

In [3]:
# Path of the CSV that holds one (objectId, witness) pair per row.
CORRECT_INSTANCES_CSV = "./dataset/correct-instances.csv"
df = pd.read_csv(CORRECT_INSTANCES_CSV)

In [4]:
# Peek at the first rows to see which columns the CSV provides.
df.head()

Unnamed: 0,objectId,witness
0,o27825,true
1,o27841,0
2,o67205,0
3,o89637,0
4,o27830,0


In [72]:
# Output directory: one `<objectId>.json` file is written here per CSV row.
CORRECT_INSTANCES_PATH='./dataset/correct-instances'

In [20]:
# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# whenever the directory was already present (e.g. after a kernel restart).
os.makedirs(CORRECT_INSTANCES_PATH, exist_ok=True)

In [81]:
# Number of rows in the CSV, i.e. how many JSON files the export loop writes.
len(df.index)

6335

In [97]:
# Export every witness as its own JSON file named after the row's objectId.
# Report the row count: interpolating `df.index` itself printed the RangeIndex repr.
print(f"Processing : {len(df.index)} correct instances")
# Walk the two columns directly instead of doing two `df.loc[i]` row lookups per row.
for objectId, witness in tqdm(zip(df['objectId'], df['witness']), total=len(df.index)):
    # Empty witness cells are read by pandas as NaN; store them as JSON null.
    if pd.isna(witness):
        witness = 'null'
    # Parse first so a malformed witness string fails here instead of being written out.
    json_witness = json.loads(witness)
    with open(f'{CORRECT_INSTANCES_PATH}/{objectId}.json', 'w', encoding='utf-8') as f:
        json.dump(json_witness, f)

Processing : RangeIndex(start=0, stop=6335, step=1) correct instances


100%|██████████| 6335/6335 [00:08<00:00, 728.66it/s] 


# 2- Transferring data to bracket notation

In [121]:
# Root directory for the bracket-notation version of the dataset.
BRACKET_DIR = './dataset/bracket'

In [99]:
# exist_ok=True keeps this cell re-runnable once the directory exists.
os.makedirs(BRACKET_DIR, exist_ok=True)

In [100]:
# Sub-directories of ./dataset to convert; each one gets a twin under ./dataset/bracket.
dirs = ['correct-instances','DG','JE','JSF']

In [116]:
# Script run once per file as: <script> <source path> <destination path>.
BRACKET_SCRIPT_PATH ='./scripts/prepare.sh'
# Source/destination pair for a single file. process_bracket_directory assigns
# its own local values under these names and does not read these two globals.
SOURCE_JSON = './dataset/DG/o1_witness'
DESTINATION_BRACKET = './dataset/bracket/DG/o1_witness'

In [126]:
# Mirror each dataset sub-directory under the bracket root.
# `dir_name` avoids rebinding the built-in `dir` at notebook scope, and
# exist_ok=True lets the cell be re-run without FileExistsError.
for dir_name in dirs:
    DESTINATION_DIR_BRACKET = f'./dataset/bracket/{dir_name}'
    os.makedirs(DESTINATION_DIR_BRACKET, exist_ok=True)

In [133]:
def process_bracket_directory(dir):
    """Run the bracket-conversion script on every file of ``./dataset/<dir>``.

    For each directory entry the extension is stripped and the script at
    ``BRACKET_SCRIPT_PATH`` is invoked with two arguments: the extension-less
    source path and the matching path under ``./dataset/bracket/<dir>``.
    On Windows the script is launched through ``wsl``.

    Args:
        dir: Name of the sub-directory of ``./dataset`` to process.

    Returns:
        List of file stems for which the script exited with a non-zero
        status (empty when every invocation succeeded).
    """
    source_dir = f'./dataset/{dir}'
    DESTINATION_DIR_BRACKET = f'./dataset/bracket/{dir}'

    # The platform cannot change mid-run, so build the command prefix once.
    if platform.system() == 'Windows':
        command_prefix = ['wsl', BRACKET_SCRIPT_PATH]
    else:
        command_prefix = [BRACKET_SCRIPT_PATH]

    failed = []
    # `desc` labels the bar so concurrent workers can be told apart.
    for file in tqdm(os.listdir(source_dir), desc=dir):
        # splitext drops only the final extension, so a name that contains
        # additional dots is no longer truncated at its first dot.
        file_name = os.path.splitext(file)[0]

        SOURCE_JSON = f'{source_dir}/{file_name}'
        DESTINATION_BRACKET = f'{DESTINATION_DIR_BRACKET}/{file_name}'

        # The exit status used to be discarded, which hid failed conversions.
        return_code = subprocess.call(command_prefix + [SOURCE_JSON, DESTINATION_BRACKET])
        if return_code != 0:
            failed.append(file_name)

    if failed:
        # One summary line per directory instead of one message per file.
        tqdm.write(f'[{dir}] {len(failed)} file(s) failed in {BRACKET_SCRIPT_PATH}')
    return failed

In [135]:
# Convert the directories concurrently, one worker thread per directory.
# `dir_name` avoids rebinding the built-in `dir` at notebook scope; the thread
# name makes a worker identifiable in tracebacks.
threads = []
for dir_name in dirs:
    thread = threading.Thread(
        target=process_bracket_directory,
        args=(dir_name,),
        name=f'bracket-{dir_name}',
    )
    thread.start()
    threads.append(thread)

# Wait for all threads to finish
for thread in threads:
    thread.join()
print("All directories processed.")

  0%|          | 0/6335 [00:00<?, ?it/s]
[A

[A[A


  0%|          | 1/6335 [00:01<2:29:33,  1.42s/it]

[A[A
  0%|          | 2/6335 [00:02<1:55:13,  1.09s/it]

  0%|          | 3/6335 [00:03<1:51:16,  1.05s/it]
[A

[A[A
  0%|          | 4/6335 [00:05<2:18:32,  1.31s/it]

[A[A
  0%|          | 5/6335 [00:06<2:07:50,  1.21s/it]

[A[A
  0%|          | 6/6335 [00:07<2:20:10,  1.33s/it]

[A[A
  0%|          | 8/6335 [00:09<1:55:18,  1.09s/it]

[A[A
  0%|          | 9/6335 [00:10<1:49:05,  1.03s/it]

[A[A
  0%|          | 10/6335 [00:11<1:46:13,  1.01s/it]

[A[A
  0%|          | 11/6335 [00:12<1:41:22,  1.04it/s]

[A[A
[A

  0%|          | 12/6335 [00:13<1:46:03,  1.01s/it]

[A[A
  0%|          | 13/6335 [00:15<2:21:29,  1.34s/it]

[A[A
  0%|          | 14/6335 [00:16<2:15:54,  1.29s/it]
[A

  0%|          | 15/6335 [00:18<2:38:50,  1.51s/it]
[A

  0%|          | 16/6335 [00:20<2:39:55,  1.52s/it]

[A[A
  0%|          | 17/6335 [00:20<2:20:53,  1.34s/it]
[A

  0%|          | 18/6335 [00:21<2:11:21,  1.25s/it]
[

All directories processed.





# 3- Comparing edit distances 