In [7]:
import json
import shutil
import os
from json import JSONDecodeError

def dump_jsonl(data, output_path, append=False):
    """
    Write an iterable of objects to a JSON lines file.

    Args:
        data: iterable of JSON-serialisable objects; each one becomes one
            line of the file. Any iterable is accepted (list, generator, ...),
            not only sized containers.
        output_path: path of the file to write (UTF-8).
        append: when True, add to the end of an existing file instead of
            overwriting it.
    """
    mode = 'a+' if append else 'w'
    written = 0
    with open(output_path, mode, encoding='utf-8') as f:
        for record in data:
            # ensure_ascii=False keeps non-ASCII characters as-is in the file.
            f.write(json.dumps(record, ensure_ascii=False) + '\n')
            # Count while writing so that len() is never needed on `data`.
            written += 1
    print('Wrote {} records to {}'.format(written, output_path))

def load_jsonl(input_path) -> list:
    """
    Read a list of objects from a JSON lines file.

    Args:
        input_path: path of a UTF-8 text file holding one JSON document
            per line.

    Returns:
        The decoded objects, in file order. Blank lines (for example a
        trailing empty line at the end of the file) are skipped.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip line terminators only; '|' is not one and must be kept.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

from datasets import load_dataset
def GetWscTargets():
    """
    Build a lookup from example idx to target text for WSC.

    Loads the 'test' split of the 'stjokerli/TextToText_wsc_seqio' dataset
    and returns a dict mapping each row's 'idx' to its 'targets' value.
    """
    wsc_test = load_dataset('stjokerli/TextToText_wsc_seqio', split='test')
    return {example['idx']: example['targets'] for example in wsc_test}

def GetMultiRCTargets(input_file):
    """
    Convert flat MultiRC prediction lines into a nested passage/question list.

    Every non-blank line is parsed positionally:
      * the first ', '-separated field holds 'P-Q-A' (passage, question and
        answer indices) after its ': ';
      * the second field holds a quoted 'True' / 'False' after its ': ',
        followed by exactly one closing character.
    NOTE(review): presumably lines look like
    {"idx": 0-0-0, "label": "True"} -- confirm against the prediction writer.

    Args:
        input_file: path of the prediction text file, one answer per line.

    Returns:
        A list with one entry per passage, in first-seen order:
        {"idx": P, "passage": {"questions": [
            {"idx": Q, "answers": [{"idx": A, "label": 0 or 1}, ...]}, ...]}}
    """
    grouped = {}
    # The context manager closes the file even if a line fails to parse.
    with open(input_file, 'r', encoding='utf-8') as f:
        for row in f:
            # Remove the newline explicitly so that a final line without one
            # is parsed exactly like every other line.
            row = row.rstrip('\n')
            if not row.strip():
                continue

            # Drop the single closing character before splitting into fields.
            fields = row[:-1].split(', ')

            pidx, qidx, aidx = (int(part) for part in fields[0].split(': ')[1].split('-'))

            # [1:-1] removes the quotes around True / False.
            target = int(fields[1].split(': ')[1][1:-1] == 'True')

            grouped.setdefault(pidx, {}).setdefault(qidx, []).append(
                {"idx": aidx, "label": target}
            )

    return [
        {
            "idx": pidx,
            "passage": {
                "questions": [
                    {"idx": qidx, "answers": answers}
                    for qidx, answers in questions.items()
                ]
            },
        }
        for pidx, questions in grouped.items()
    ]


def GetRecordTargets(input_file):
    """
    Read ReCoRD predictions, repairing lines whose last string is not valid JSON.

    Each non-blank line is first parsed as JSON. If that fails, the text
    after the last ': "' is treated as a string value that ends with '"}':
    every double quote inside it is replaced by a single quote and the line
    is parsed again.

    Args:
        input_file: path of the prediction text file, one JSON object per line.

    Returns:
        The decoded objects, in file order. Blank lines are skipped.

    Raises:
        json.JSONDecodeError: if a line is still invalid after the repair.
    """
    records = []
    # The context manager closes the file even if a line fails to parse.
    with open(input_file, 'r', encoding='utf-8') as f:
        for row in f:
            # Strip only the newline: slicing off the last character would
            # cut the closing '}' from a final line that has no newline.
            line = row.rstrip('\n')
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except JSONDecodeError:
                pieces = line.split(': "')
                # [:-2] removes the trailing '"}' so that only the quotes
                # inside the value are rewritten; it is then put back.
                pieces[-1] = pieces[-1][:-2].replace('"', "'") + '"}'
                records.append(json.loads(': "'.join(pieces)))

    return records

def MoveFiles(input_file, output_file, task_name=None):
    """
    Convert one task's raw prediction file into a .jsonl submission file.

    Args:
        input_file: path of the raw prediction file for the task.
        output_file: path of the .jsonl file to write.
        task_name: task key that selects the loader and the label
            normalisation. When omitted, the module-level variable
            `task_name` is used, so existing two-argument calls made from
            a loop that binds `task_name` keep working.

    Loading:
        'MultiRC' -> GetMultiRCTargets, 'ReCoRD' -> GetRecordTargets,
        anything else -> load_jsonl.

    Label normalisation (applied in place to each loaded record):
        'BoolQ', 'WiC': label is lower-cased.
        'COPA': the last character of the label is read as an integer and
            reduced by one.
        'WSC': label becomes str(bool) of whether the target from
            GetWscTargets for the record's idx equals the predicted label.

    Returns:
        Whatever dump_jsonl returns.

    Raises:
        NameError: if task_name is omitted and no module-level `task_name`
            exists.
    """
    if task_name is None:
        # Legacy path: read the caller's global instead of an argument.
        try:
            task_name = globals()['task_name']
        except KeyError:
            raise NameError("name 'task_name' is not defined") from None

    print('Starting', task_name)

    if task_name == 'MultiRC':
        contents = GetMultiRCTargets(input_file)
    elif task_name == 'ReCoRD':
        contents = GetRecordTargets(input_file)
    else:
        contents = load_jsonl(input_file)

    if task_name in ['BoolQ', 'WiC']:
        for e in contents:
            e['label'] = e['label'].lower()

    elif task_name == 'COPA':
        for e in contents:
            e['label'] = int(e['label'][-1]) - 1

    elif task_name == 'WSC':
        target_map = GetWscTargets()
        for e in contents:
            answer = target_map[e['idx']] == e['label']
            e['label'] = str(answer)

    return dump_jsonl(contents, output_file, append=False)

In [8]:
# File name of the raw predictions file; it is appended to each checkpoint
# folder path to locate that task's input.
original_file_name='generated_predictions_for_submission.txt'
# Root folder that the per-task checkpoint paths are built from by string
# concatenation, so the trailing '/' is required.
model_checkpoints_root='/workspace/w266_final_project/model_checkpoints/'

In [9]:
# Destination folder for the converted <task>.jsonl files. File names are
# appended by plain string concatenation, so the trailing '/' is required.
output_path='/workspace/w266_final_project/submissions/SuperGlue/20220322/'

In [14]:
# Maps each task name to the folder that holds its raw predictions file.
location_mapping={
    # A submission needs a file for every task, so these two entries point
    # at the official sample folders and serve as dummy data.
    'AX-g':'/workspace/w266_final_project/submissions/SuperGlue/OfficialSample/axg',
    'AX-b':'/workspace/w266_final_project/submissions/SuperGlue/OfficialSample/axb',
    
    # Transfer set (active): checkpoints under Prefix_bn50_target_tasks.
    'BoolQ':model_checkpoints_root+'Prefix_bn50_target_tasks/Wanyu_mnli_prefix_bn50_boolq',
    'CB':model_checkpoints_root+'Prefix_bn50_target_tasks/Wanyu_mnli_prefix_bn50_cb',
    'COPA':model_checkpoints_root+'Prefix_bn50_target_tasks/Wanyu_mnli_prefix_bn50_copa',
    'RTE':model_checkpoints_root+'Prefix_bn50_target_tasks/Wanyu_mnli_prefix_bn50_rte',
    'WiC':model_checkpoints_root+'Prefix_bn50_target_tasks/Wanyu_mnli_prefix_bn50_wic',
    'WSC':model_checkpoints_root+'Prefix_bn50_target_tasks/Wanyu_mnli_prefix_bn50_wsc',
    'ReCoRD':model_checkpoints_root+'Prefix_bn50_target_tasks/Wanyu_mnli_prefix_bn50_record',
    'MultiRC':model_checkpoints_root+'Prefix_bn50_target_tasks/Wanyu_mnli_prefix_bn50_multirc',
    
    # Prefix set (inactive): alternative checkpoint folders for the same
    # task keys. Uncomment these and comment out the transfer set to use them.
#     'BoolQ':model_checkpoints_root+'Wanyu_boolq_prefix_bn50',
#     'CB':model_checkpoints_root+'cb_prefix_bn50',
#     'COPA':model_checkpoints_root+'Wanyu_copa_prefix_bn50',
#     'RTE':model_checkpoints_root+'Wanyu_rte_prefix_bn50',
#     'WiC':model_checkpoints_root+'Wanyu_wic_prefix_bn50',
#     'WSC':model_checkpoints_root+'Wanyu_wsc_prefix_bn50',
#     'ReCoRD':model_checkpoints_root+'Wanyu_record_prefix_bn50',    
#     'MultiRC':model_checkpoints_root+'Wanyu_multirc_prefix_bn50',


}

In [15]:
# Create the destination folder once, before the loop. makedirs also creates
# missing parent folders and does nothing if the folder already exists.
os.makedirs(output_path, exist_ok=True)

for task_name,check_point_folder in location_mapping.items():

    # Every checkpoint folder stores its predictions under the same file name.
    input_file=check_point_folder+'/'+original_file_name
    output_file=output_path+task_name+'.jsonl'

    # MoveFiles picks up the module-level `task_name` bound by this loop,
    # so the loop variable must keep this exact name.
    MoveFiles(input_file,output_file)

Starting AX-g
Loaded 356 records from /workspace/w266_final_project/submissions/SuperGlue/OfficialSample/axg/generated_predictions_for_submission.txt
Wrote 356 records to /workspace/w266_final_project/submissions/SuperGlue/20220322/AX-g.jsonl
Starting AX-b
Loaded 1104 records from /workspace/w266_final_project/submissions/SuperGlue/OfficialSample/axb/generated_predictions_for_submission.txt
Wrote 1104 records to /workspace/w266_final_project/submissions/SuperGlue/20220322/AX-b.jsonl
Starting BoolQ
Loaded 3245 records from /workspace/w266_final_project/model_checkpoints/Prefix_bn50_target_tasks/Wanyu_mnli_prefix_bn50_boolq/generated_predictions_for_submission.txt
Wrote 3245 records to /workspace/w266_final_project/submissions/SuperGlue/20220322/BoolQ.jsonl
Starting CB
Loaded 250 records from /workspace/w266_final_project/model_checkpoints/Prefix_bn50_target_tasks/Wanyu_mnli_prefix_bn50_cb/generated_predictions_for_submission.txt
Wrote 250 records to /workspace/w266_final_project/submiss

Using custom data configuration stjokerli--TextToText_wsc_seqio-e801d03970369d66
Reusing dataset parquet (/root/.cache/huggingface/datasets/parquet/stjokerli--TextToText_wsc_seqio-e801d03970369d66/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


Wrote 146 records to /workspace/w266_final_project/submissions/SuperGlue/20220322/WSC.jsonl
Starting ReCoRD
Wrote 10000 records to /workspace/w266_final_project/submissions/SuperGlue/20220322/ReCoRD.jsonl
Starting MultiRC
Wrote 166 records to /workspace/w266_final_project/submissions/SuperGlue/20220322/MultiRC.jsonl


In [16]:
# Zip the submission folder.
# Use a base name without the trailing '/', so the archive is always created
# next to the folder as '<folder>.zip' rather than depending on how
# make_archive treats a trailing separator.
archive_base=os.path.abspath(output_path)

# Remove the archive left by a previous run: otherwise it would be packed
# into the new zip and shutil.move would fail because the target exists.
stale_archive=os.path.join(output_path, os.path.basename(archive_base)+'.zip')
if os.path.exists(stale_archive):
    os.remove(stale_archive)

file=shutil.make_archive(archive_base, 'zip', output_path)
# Move the zip file into the folder; the final path is the cell's output.
shutil.move(file,output_path)

'/workspace/w266_final_project/submissions/SuperGlue/20220322/20220322.zip'