In [1]:
# Prerequisites: run `pip install -r requirements.txt` and install PyTorch with CUDA support.

In [5]:
import os
# Indices of the GPUs this notebook is allowed to see.
usedgpus = [0]
# Expose only those devices to CUDA, as a comma-separated list of indices.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, usedgpus))

In [6]:
from datautils import DataProcessor,InputExample
from transformers_rc_finetune import finetune
from multiprocessing import Process
from sklearn.metrics import roc_auc_score
import json

In [7]:
# this should be modified to load InputExamples

class CustomProcessor(DataProcessor):
    """Binary relation-classification processor over per-sentence JSON annotations.

    Each split is read from ``<data_dir>/tacred-{train,dev,test}-labeled/ann``,
    whose files are named ``sent_00000.json``, ``sent_00001.json``, ...
    A sentence is labelled ``'yes_relation'`` when the first label of its first
    gold relation equals the target relation, and ``'no_relation'`` otherwise.
    """

    def __init__(self, data_dir):
        """Remember the directory that contains the three split sub-directories."""
        self.data_dir = data_dir

    def get_train_examples(self):
        """Return the sampled examples of the training split."""
        return self.get_dir_examples(f'{self.data_dir}/tacred-train-labeled')

    def get_dev_examples(self):
        """Return the sampled examples of the development split."""
        return self.get_dir_examples(f'{self.data_dir}/tacred-dev-labeled')

    def get_test_examples(self):
        """Return the sampled examples of the test split."""
        return self.get_dir_examples(f'{self.data_dir}/tacred-test-labeled')

    def get_dir_examples(self, subdir, rel='org:top_members/employees',
                         num_pos=20, num_neg=20, max_files=10000):
        """Collect a small positive/negative sample from one split directory.

        Files ``<subdir>/ann/sent_00000.json`` onwards are read in index order
        until both quotas are filled, ``max_files`` files have been read, or the
        next file in the sequence does not exist.

        Args:
            subdir: Split directory containing the ``ann`` sub-directory.
            rel: Relation label that counts as the positive class.
            num_pos: Maximum number of positive examples to return.
            num_neg: Maximum number of negative examples to return.
            max_files: Upper bound on how many annotation files are probed.

        Returns:
            A list holding the first ``num_pos`` positive examples followed by
            the first ``num_neg`` negative examples (fewer when the split does
            not contain enough of either kind).

        Raises:
            FileNotFoundError: If even the first annotation file is missing,
                which most likely means ``subdir`` is wrong.
        """
        # Token normalisation table, built once instead of once per file.
        cleaningmap = {'-RRB-': ')', '-LRB-': '(', '-LSB-': '[',
                       '-RSB-': ']', '-LCB-': '{', '-RCB-': '}',
                       '&nbsp;': ' ', '&quot;': "'", '--': '-', '---': '-'}

        pos = []
        neg = []
        for i in range(max_files):
            path = '%s/ann/sent_%05d.json' % (subdir, i)
            try:
                # Explicit UTF-8 so decoding does not depend on the platform locale.
                with open(path, 'r', encoding='utf-8') as fin:
                    loaded = json.load(fin)
            except FileNotFoundError:
                if i == 0:
                    # Nothing at all to read: surface the bad path to the caller.
                    raise
                # The split has fewer files than max_files: stop scanning and
                # return whatever was collected instead of crashing.
                break

            # Only the first sentence and the first gold relation are used.
            gold = loaded['gold_relations'][0]
            args = gold['arguments']
            text_tokens = [cleaningmap.get(x, x) for x in loaded['sentences'][0]['words']]

            # Span order matters downstream: object first, then subject.
            spans = []
            for role in ('object', 'subject'):
                inter = args[role][0]['tokenInterval']
                spans.append((inter['start'], inter['end']))

            is_positive = gold['labels'][0] == rel
            # NOTE(review): every example gets guid=1; confirm nothing downstream
            # relies on guids being unique.
            ie = InputExample(guid=1, text_tokens=text_tokens, entities_span=spans,
                              label='yes_relation' if is_positive else 'no_relation')
            if is_positive:
                pos.append(ie)
            else:
                neg.append(ie)
            if len(pos) >= num_pos and len(neg) >= num_neg:
                break
        return pos[:num_pos] + neg[:num_neg]

    def score(self, predicted, gold_labels):
        """Score predictions with ROC AUC.

        Args:
            predicted: Per-example scores passed to ``roc_auc_score`` as
                ``y_score`` (presumably positive-class probabilities; verify
                against the caller).
            gold_labels: Per-example true labels passed as ``y_true``.

        Returns:
            ``{'agg': {'auc': auc}, 'main': auc}``.
        """
        auc = roc_auc_score(gold_labels, predicted)
        return {'agg': dict(auc=auc), 'main': auc}

    # This name is what later cells pass to finetune() as task=...
    @staticmethod
    def taskname():
        """Return the task identifier of this processor."""
        return 'boolrc'

    def get_labels(self):
        """See base class. Returns the label names, negative class first."""
        return ['no_relation',
                'yes_relation']

In [8]:
# Here we train on the train set, halt according to the score on the dev set, then finally evaluate on the test set.

# --- Over-/under-fitting controls ---
max_num_epochs = 200
stop_when_no_improvements_on_dev_for_k_evals = 5
eval_dev_every_k_batches = 100

# --- Mainly for controlling CUDA memory usage ---
batch_size = 2
accumelate_k_batches = 4
# Enabling this could also speed things up on GCE.
use_fp16 = False


# running in a subprocess better cleans the cuda mem
def run():
    """Call finetune() for the 'boolrc' task with training and test evaluation enabled.

    The tunable values are read from the module-level settings defined in this
    cell at call time.
    """
    train_kwargs = dict(
        data_dir='..',
        output_dir='out',
        do_train=True,
        task='boolrc',
        nopbar=True,
        overwrite_output_dir=True,
        eval_test=True,
        score_bool_probs=True,
        logging_steps=eval_dev_every_k_batches,
        stop_train_low_score_k=stop_when_no_improvements_on_dev_for_k_evals,
        fp16=use_fp16,
        num_train_epochs=max_num_epochs,
        per_gpu_batch_size=batch_size,
        gradient_accumulation_steps=accumelate_k_batches,
    )
    finetune(**train_kwargs)
# Run training in a child process and wait for it to finish.
p = Process(target=run)
p.start()
p.join()
# An exception in the child is not re-raised in the notebook, so check the exit
# code explicitly instead of letting a failed training run pass silently.
if p.exitcode != 0:
    raise RuntimeError('training subprocess failed with exit code %s' % p.exitcode)

10/29/2019 12:48:48 - DEBUG - urllib3.connectionpool -   Starting new HTTPS connection (1): s3.amazonaws.com:443
10/29/2019 12:48:49 - DEBUG - urllib3.connectionpool -   https://s3.amazonaws.com:443 "HEAD /models.huggingface.co/bert/roberta-large-vocab.json HTTP/1.1" 200 0
10/29/2019 12:48:49 - DEBUG - urllib3.connectionpool -   Starting new HTTPS connection (1): s3.amazonaws.com:443
10/29/2019 12:48:50 - DEBUG - urllib3.connectionpool -   https://s3.amazonaws.com:443 "HEAD /models.huggingface.co/bert/roberta-large-merges.txt HTTP/1.1" 200 0
10/29/2019 12:48:50 - INFO - pytorch_transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json from cache at ./hugfacecache/1ae1f5b6e2b22b25ccc04c000bb79ca847aa226d0761536b011cf7e5868f0655.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
10/29/2019 12:48:50 - INFO - pytorch_transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.

In [9]:
# Here we run an already-trained model on a dataset.
# Note that task='boolrc' means the registered data loader from above (CustomProcessor) will be used for retrieving the test dataset.
# Also note that all checkpoints are evaluated as well.
def run():
    """Call finetune() for the 'boolrc' task with training disabled and test evaluation enabled."""
    eval_kwargs = dict(
        data_dir='..',
        output_dir='out',
        do_train=False,
        task='boolrc',
        nopbar=True,
        eval_test=True,
        score_bool_probs=True,
    )
    finetune(**eval_kwargs)
# Run evaluation in a child process and wait for it to finish.
p = Process(target=run)
p.start()
p.join()
# An exception in the child is not re-raised in the notebook, so check the exit
# code explicitly instead of letting a failed evaluation pass silently.
if p.exitcode != 0:
    raise RuntimeError('evaluation subprocess failed with exit code %s' % p.exitcode)

10/29/2019 13:03:17 - DEBUG - urllib3.connectionpool -   Starting new HTTPS connection (1): s3.amazonaws.com:443
10/29/2019 13:03:18 - DEBUG - urllib3.connectionpool -   https://s3.amazonaws.com:443 "HEAD /models.huggingface.co/bert/roberta-large-vocab.json HTTP/1.1" 200 0
10/29/2019 13:03:18 - DEBUG - urllib3.connectionpool -   Starting new HTTPS connection (1): s3.amazonaws.com:443
10/29/2019 13:03:19 - DEBUG - urllib3.connectionpool -   https://s3.amazonaws.com:443 "HEAD /models.huggingface.co/bert/roberta-large-merges.txt HTTP/1.1" 200 0
10/29/2019 13:03:19 - INFO - pytorch_transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json from cache at ./hugfacecache/1ae1f5b6e2b22b25ccc04c000bb79ca847aa226d0761536b011cf7e5868f0655.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b
10/29/2019 13:03:19 - INFO - pytorch_transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.