In [6]:
import json
import os
import argparse
from tqdm import tqdm
from allennlp.predictors.predictor import Predictor
import torch


In [7]:
def batch(iterable, n=1):
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]
        
        
def read_from_json(filepath, filename):
    """
    output: list of dicts 
    """
    with open(os.path.join(filepath, filename, 'r')) as f:
        data = []
        for line in f.readlines():
            data.append(json.loads(line))
    return data
            
def read_from_txt(filepath, filename):
    # OUTPUT: LIST -> EACH LINE IN THE FILE IS AN ITEM IN LIST
    with open(os.path.join(filepath, filename), 'r') as f:
        data = []
        for line in f:
            data.append(line.strip())
    return data

def dump_json(filepath, filename, data):
            
    with open(os.path.join(filepath, filename), 'w') as f:
        for sent in tqdm(data):
            json.dump(sent, f)
            f.write('\n')
            
def dictify_sentences(sents):
    return [{'sentence':sent} for sent in sents]

In [52]:
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path', default='/home/zijiao/work/data/mscoco',
                        help='path to datasets')
    parser.add_argument('--output_path', required=True,
                       help='output processed data to the path')
    parser.add_argument('--batch_size', default=50, type=int,
                       help='size of batches')
    args = parser.parse_args()
    
    batch_size = args.batch_size
    
    if os.path.exists(args.output_path):
        print(f'dir {args.output_path} exists')
    os.system('mkdir {:s}'.format(args.output_path))
    if torch.cuda.is_available():
        cuda_device = 0
    else:
        cuda_device = -1
    # get predictor of srl
    predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/bert-base-srl-2019.06.17.tar.gz", cuda_device=cuda_device)

    # get srls for each train sentence
    train = read_from_txt(args.data_path, 'train_caps.txt')
    train_dicts = dictify_sentences(train)
    
    train_srls = []
    for ba in tqdm(batch(train_dicts, n=batch_size), total=len(train_dicts)//batch_size):
        train_srls.extend(predictor.predict_batch_json(ba))
    
    dump_json(args.output_path, 'train_srls.json', train_srls)

    # get srls for each test sentence
    test = read_from_txt(args.data_path, 'test_caps.txt')
    test_dicts = dictify_sentences(test)
    
    test_srls = []
    for ba in tqdm(batch(test_dicts, n=batch_size), total=len(test_dicts)//batch_size):
        test_srls.extend(predictor.predict_batch_json(ba))

    dump_json(args.output_path, 'test_srls.json', test_srls)


usage: ipykernel_launcher.py [-h] [--data_path DATA_PATH] --output_path
                             OUTPUT_PATH [--batch_size BATCH_SIZE]
ipykernel_launcher.py: error: the following arguments are required: --output_path


SystemExit: 2

In [36]:
#     with open(os.path.join(args.data_path, 'train_caps.txt'), 'r') as f:
#         train = []
#         for line in f:
#             train.append(line.strip())

#     train_dicts = [{'sentence':sent} for sent in train]
#     with open(os.path.join(args.output_path, 'train_srls.json'), 'w') as f:
#         for sent in tqdm(train_srls):
#             json.dump(sent, f)
#             f.write('\n')
        
#     with open(os.path.join(args.data_path, 'test_caps.txt'), 'r') as f:
#         test = []
#         for line in f:
#             test.append(line.strip())

#     test_dicts = [{'sentence':sent} for sent in test]
#     with open(os.path.join(args.output_path, 'test_srls.json'), 'w') as f:
# #         for sent in tqdm(test_srls):
# #             json.dump(sent, f)
# #             f.write('\n')
#     result = predictor.predict(
#       sentence="Did Uriah honestly think he could beat the game in under three hours?"
#     )

In [3]:
ls

'Create dataset.ipynb'   [0m[01;34moutput[0m/                 srl_label_getter.ipynb
 [01;34mexperiments[0m/            README.md               train_new.json
 LICENSE                 srl_json_reader.ipynb


In [14]:
predictor = Predictor.from_path("https://s3-us-west-2.amazonaws.com/allennlp/models/bert-base-srl-2019.06.17.tar.gz", cuda_device=0)

    # get srls for each train sentence
val = read_from_txt('../data/mscoco/', 'dev_caps.txt')
val_dicts = dictify_sentences(val)
    
val_srls = []
for i, ba in tqdm(enumerate(batch(val_dicts, n=50)), total=len(val_dicts)//50):
    if i >= 80:
        val_srls.extend(predictor.predict_batch_json(ba))
    
dump_json('./output/', 'val1000_srls.json', val_srls)

100%|██████████| 100/100 [00:05<00:00, 17.76it/s]
100%|██████████| 1000/1000 [00:00<00:00, 34861.31it/s]


In [18]:
val_srls

[{'verbs': [],
  'words': ['An',
   'old',
   'brick',
   'building',
   'along',
   'side',
   'of',
   'a',
   'river']},
 {'verbs': [],
  'words': ['a',
   'wooden',
   'street',
   'sign',
   'next',
   'to',
   'a',
   'tree',
   'and',
   'a',
   'river']},
 {'verbs': [{'verb': 'says',
    'description': 'A flooded road and [ARG0: a sign] [R-ARG0: that] [V: says] , ` [ARG1: ` East Street `] ` .',
    'tags': ['O',
     'O',
     'O',
     'O',
     'B-ARG0',
     'I-ARG0',
     'B-R-ARG0',
     'B-V',
     'O',
     'O',
     'B-ARG1',
     'I-ARG1',
     'I-ARG1',
     'I-ARG1',
     'O',
     'O']}],
  'words': ['A',
   'flooded',
   'road',
   'and',
   'a',
   'sign',
   'that',
   'says',
   ',',
   '`',
   '`',
   'East',
   'Street',
   '`',
   '`',
   '.']},
 {'verbs': [{'verb': 'is',
    'description': 'A black sign that [V: is] standing in the grass .',
    'tags': ['O', 'O', 'O', 'O', 'B-V', 'O', 'O', 'O', 'O', 'O']},
   {'verb': 'standing',
    'description': '[ARG1: 