Installing requirements for the module

In [1]:
%%writefile requirements.txt
boto3==1.12.36
botocore==1.15.36
certifi==2020.4.5.1
chardet==3.0.4
click==7.1.1
docutils==0.15.2
filelock==3.0.12
idna==2.9
jmespath==0.9.5
joblib==0.14.1
jsonlines==1.2.0
numpy==1.18.2
pandas==1.0.3
python-dateutil==2.8.1
regex==2020.4.4
requests==2.23.0
s3transfer==0.3.3
sacremoses==0.0.38
scikit-learn==0.22.2.post1
scipy==1.4.1
scispacy==0.2.5
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz
sentencepiece==0.1.85
six==1.14.0
tokenizers==0.5.2
torch==1.5.0
tqdm==4.45.0
transformers==2.7.0

Writing requirements.txt


In [None]:
# Use the %pip magic explicitly so the packages are installed into the
# environment of the running kernel rather than relying on automagic.
%pip install -r requirements.txt

Downloading the SciFact dataset

In [None]:
# wget and tar are shell programs, not IPython magics, so they need the `!`
# shell escape to run from a notebook cell.
!wget https://scifact.s3-us-west-2.amazonaws.com/release/latest/data.tar.gz
!tar -xvf data.tar.gz

### Training

In [None]:
import argparse
import torch
import jsonlines
import os
import numpy as np

from torch.utils.data import Dataset, DataLoader
from transformers import get_cosine_schedule_with_warmup, RobertaTokenizer, RobertaForSequenceClassification
from tqdm import tqdm
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, balanced_accuracy_score

In [None]:
def reduce_abstract(abs_list):
    """Condense an abstract (a list of sentences) into one short string.

    Abstracts with three or fewer sentences are joined in full. Longer
    abstracts are reduced to three sentences: the second one (index 1), a
    middle one (index ``(n + 1) // 2``) and the last one.

    Args:
        abs_list: Sequence of sentence strings making up the abstract.

    Returns:
        The selected sentences joined by single spaces, with every run of two
        or more whitespace characters collapsed to one space. In the reduced
        (more than three sentences) case the result starts with one leading
        space.
    """
    import re

    len_n = len(abs_list)
    if len_n <= 3:
        # Three sentences or fewer: keep the whole abstract.
        red_abs = ' '.join(abs_list)
    else:
        # (len_n + 1) // 2 equals len_n // 2 for even lengths and
        # (len_n + 1) / 2 for odd lengths, so one expression covers both
        # cases without going through float division.
        middle = (len_n + 1) // 2
        red_abs = ' ' + ' '.join((abs_list[1], abs_list[middle], abs_list[len_n - 1]))

    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    red_abs = re.sub(r"\s\s+", " ", red_abs)
    # NOTE(review): this removes the literal six-character text \r\n\t
    # (backslashes included), not the control characters themselves —
    # confirm that this is what the data actually contains.
    return red_abs.replace("\\r\\n\\t", "")

class SciFactAbstractDataset(Dataset):
    """Claim/document pairs for training the abstract-retrieval classifier.

    For every claim in ``dataset`` the top-30 TF-IDF documents (as returned by
    ``tfidf_scope``) form the claim's scope. Each in-scope document yields one
    sample dict with the keys:

    * ``'claim'``    - the claim text,
    * ``'title'``    - the document title followed by its reduced abstract
                       (see ``reduce_abstract``),
    * ``'evidence'`` - 1 if the document is listed in the claim's evidence,
                       otherwise 0.

    Args:
        corpus: Path to the corpus JSON-lines file (records need ``doc_id``,
            ``title`` and ``abstract``).
        dataset: Path to the claims JSON-lines file (records need ``claim``,
            ``evidence`` and ``cited_doc_ids``).
        vectorizer: Fitted vectorizer, forwarded to ``tfidf_scope``.
        doc_vectors: Document vectors, forwarded to ``tfidf_scope``.
    """

    def __init__(self, corpus: str, dataset: str, vectorizer, doc_vectors):
        self.samples = []
        all_doc_ids = tfidf_scope(vectorizer, doc_vectors, dataset, k=30)

        # Context managers make sure both readers are closed again.
        with jsonlines.open(corpus) as corpus_reader:
            docs_by_id = {doc['doc_id']: doc for doc in corpus_reader}

        with jsonlines.open(dataset) as claim_reader:
            for i, data in enumerate(claim_reader):
                scope = all_doc_ids[i]
                claim = data['claim']

                # Evidence keys are coerced with int(); apply the same
                # coercion to cited ids so both filters compare like with like.
                abstract_ids = [int(doc_id) for doc_id in data['evidence']
                                if int(doc_id) in scope]
                cited_ids = [int(doc_id) for doc_id in data['cited_doc_ids']
                             if int(doc_id) in scope]
                non_abstract_ids = set(cited_ids) - set(abstract_ids)

                # Positive samples: in-scope evidence documents.
                for doc_id in abstract_ids:
                    self._add_sample(claim, docs_by_id[doc_id], 1)

                # Negative samples: cited documents that are not evidence.
                for doc_id in non_abstract_ids:
                    self._add_sample(claim, docs_by_id[doc_id], 0)

                # Use up the rest of the top-30 TF-IDF retrieval as negatives.
                # Evidence documents are excluded as well, so a document that
                # is evidence but not cited is never labelled both 1 and 0.
                already_used = set(cited_ids) | set(abstract_ids)
                for doc_id in scope:
                    if doc_id not in already_used:
                        self._add_sample(claim, docs_by_id[doc_id], 0)

    def _add_sample(self, claim, doc, label):
        """Append one sample pairing ``claim`` with ``doc`` under ``label``."""
        # The text shown to the model is the title plus the reduced abstract.
        text = doc['title'] + ' ' + reduce_abstract(doc['abstract'])
        self.samples.append({
            'claim': claim,
            'title': text,
            'evidence': label
        })

    def __len__(self):
        """Return the number of claim/document samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the sample dict stored at position ``idx``."""
        return self.samples[idx]
    
def tfidf_scope(vectorizer, doc_vectors, dataset, k=30, corpus_path='corpus.jsonl'):
    """Rank corpus documents against every claim and keep the top ``k`` ids.

    Args:
        vectorizer: Fitted vectorizer; its ``transform`` is applied to each
            claim text.
        doc_vectors: Matrix of document vectors that is multiplied with the
            claim vector. Row ``i`` is assumed to belong to line ``i`` of
            ``corpus_path`` — TODO confirm against the cell that builds it.
        dataset: Path to the claims JSON-lines file; each record needs a
            ``'claim'`` field.
        k: Number of highest-scoring documents to keep per claim.
        corpus_path: Path to the corpus JSON-lines file. Defaults to
            ``'corpus.jsonl'`` relative to the current working directory,
            which is the path this function previously had hard-coded.

    Returns:
        A list with one entry per claim (in file order); each entry is the
        list of ``doc_id`` values of the ``k`` best-scoring documents, best
        first.
    """
    # Only the ids are needed for the ranking, so do not keep full records.
    with jsonlines.open(corpus_path) as corpus_reader:
        corpus_doc_ids = [doc['doc_id'] for doc in corpus_reader]

    doc_id_ranks = []
    with jsonlines.open(dataset) as claim_reader:
        for data in claim_reader:
            claim_vector = vectorizer.transform([data['claim']]).todense()
            # ravel() keeps the scores one-dimensional even when the corpus
            # holds a single document (squeeze() would produce a 0-d array).
            doc_scores = np.asarray(doc_vectors @ claim_vector.T).ravel()
            doc_indices_rank = doc_scores.argsort()[::-1].tolist()[:k]
            doc_id_ranks.append([corpus_doc_ids[idx] for idx in doc_indices_rank])

    return doc_id_ranks

In [None]:
# TF-IDF model over unigrams and bigrams with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english',
                             ngram_range=(1, 2))

# Each document is represented by its title followed by its full abstract.
# The reader is opened in a `with` block so the corpus file is closed again.
with jsonlines.open('./data/corpus.jsonl') as corpus_reader:
    doc_vectors = vectorizer.fit_transform([doc['title'] + ' ' + ' '.join(doc['abstract'])
                                            for doc in corpus_reader])

In [None]:
# %cd (not !cd) changes the kernel's own working directory; the relative
# paths used below and in tfidf_scope are resolved against it.
%cd ./data/
# -p keeps the cell re-runnable when the directory already exists.
!mkdir -p saved_models

trainset = SciFactAbstractDataset('corpus.jsonl', 'claims_train.jsonl', vectorizer, doc_vectors)
devset = SciFactAbstractDataset('corpus.jsonl', 'claims_dev.jsonl', vectorizer, doc_vectors)

In [None]:
def encode(claims: List[str], sentences: List[str]):
    """Tokenize (sentence, claim) pairs into padded tensors on ``device``.

    The batch is first encoded without a length limit. If the padded batch
    is longer than 512 tokens it is encoded a second time with
    ``max_length=512``, truncating only the first element of each pair
    (the sentence text).

    Uses the notebook-level ``tokenizer`` and ``device`` globals.

    Args:
        claims: Claim texts; second element of every encoded pair.
        sentences: Document texts; first element of every encoded pair.

    Returns:
        Dict mapping each tokenizer output name to its tensor moved to
        ``device``.
    """
    batch = tokenizer.batch_encode_plus(
        zip(sentences, claims),
        pad_to_max_length=True,
        return_tensors='pt')

    padded_length = batch['input_ids'].size(1)
    if padded_length > 512:
        # Longer than the 512-token limit: re-encode with truncation.
        batch = tokenizer.batch_encode_plus(
            zip(sentences, claims),
            max_length=512,
            truncation_strategy='only_first',
            pad_to_max_length=True,
            return_tensors='pt')

    on_device = {}
    for name, tensor in batch.items():
        on_device[name] = tensor.to(device)
    return on_device

def evaluate(model, dataset):
    """Score ``model`` on ``dataset`` and return (f1, precision, recall).

    The model is put into eval mode and run one sample at a time with
    gradients disabled; the predicted class is the argmax over the logits.

    Args:
        model: Classifier whose first output element holds the logits.
        dataset: Dataset yielding dicts with ``'claim'``, ``'title'`` and
            ``'evidence'`` entries.

    Returns:
        Tuple ``(f1, precision, recall)``; each metric is computed with
        ``zero_division=0``.
    """
    model.eval()
    gold_labels = []
    predicted_labels = []
    loader = DataLoader(dataset, batch_size=1)

    with torch.no_grad():
        for batch in loader:
            inputs = encode(batch['claim'], batch['title'])
            logits = model(**inputs)[0]
            gold_labels += batch['evidence'].long().tolist()
            predicted_labels += logits.argmax(dim=1).long().tolist()

    f1 = f1_score(gold_labels, predicted_labels, zero_division=0)
    precision = precision_score(gold_labels, predicted_labels, zero_division=0)
    recall = recall_score(gold_labels, predicted_labels, zero_division=0)
    return f1, precision, recall

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Tokenizer and sequence classifier both come from the same checkpoint.
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
model = RobertaForSequenceClassification.from_pretrained("roberta-large")
model = model.to(device)

In [None]:
# Two parameter groups: a small learning rate for the pretrained encoder and
# a larger one for the classification head.
optimizer = torch.optim.Adam([
            {'params': model.roberta.parameters(), 'lr': 1e-5},
            {'params': model.classifier.parameters(), 'lr': 1e-3}
        ])

# The training loop calls scheduler.step() once per epoch and runs for 20
# epochs, so the schedule must span 20 steps. With a shorter span the cosine
# factor reaches 0 at the end of the span and then rises again for the
# remaining epochs. Keep this value equal to the epoch count of the loop.
scheduler = get_cosine_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=20)

In [None]:
# With batch_size=1, gradients are accumulated over this many samples before
# each optimizer step.
accumulation_steps = 128

for e in range(20):
    model.train()
    # Start every epoch from clean gradients.
    optimizer.zero_grad()
    loader = DataLoader(trainset, batch_size=1, shuffle=True)
    last_index = len(loader) - 1
    t = tqdm(loader)
    for i, batch in enumerate(t):
        encoded_dict = encode(batch['claim'], batch['title'])
        # With labels supplied, the first output element is the loss.
        loss = model(**encoded_dict, labels=batch['evidence'].long().to(device))[0]
        loss.backward()
        # Step at every full accumulation window and also on the final batch,
        # so gradients from a trailing partial window are applied instead of
        # leaking into the next epoch.
        if (i + 1) % accumulation_steps == 0 or i == last_index:
            optimizer.step()
            optimizer.zero_grad()
            t.set_description(f'Epoch {e}, iter {i}, loss: {round(loss.item(), 4)}')
    # The learning-rate schedule advances once per epoch.
    scheduler.step()
    train_score = evaluate(model, trainset)
    print(f'Epoch {e}, train f1: %.4f, precision: %.4f, recall: %.4f' % train_score)
    dev_score = evaluate(model, devset)
    print(f'Epoch {e}, dev f1: %.4f, precision: %.4f, recall: %.4f' % dev_score)
    # Checkpoint directory name carries the epoch and the dev F1 (x 10^4).
    save_path = os.path.join('./saved_models', f'abstract_retrieval-epoch-{e}-f1-{int(dev_score[0] * 1e4)}')
    # exist_ok lets the cell be re-run without failing on existing folders.
    os.makedirs(save_path, exist_ok=True)
    tokenizer.save_pretrained(save_path)
    model.save_pretrained(save_path)