# Extracting sentences using MatBERT

In [1]:
from utils import *
import os
# import sys
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from torch.utils.data import DataLoader
from datasets import Dataset
from dotenv import load_dotenv

# import warnings
# from torch.optim import AdamW
# from transformers import get_scheduler
# from transformers.utils import logging
# import evaluate
# from torch.utils.data import WeightedRandomSampler

# Read environment variables from the local ".env.local" file; override=True
# lets values from that file replace variables that are already set.
load_dotenv(dotenv_path=".env.local", override=True)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

First we load the data. Here, `corpus_all_paragraphs` should be a dictionary with the DOIs as keys, so that each paper is kept separate. For each DOI, the dictionary should contain a list of paragraphs in the form of dictionaries with, at least, a 'text' entry (the extraction loop below also reads an '_id' entry to skip duplicated paragraphs).

Schematically, the data should look like this:

{

doi1: [{'text': "Some text"}, {'text': "Some text"}, {'text': "Some text"}],
    
doi2: [...],
    
...

}

In [2]:
# Load data: the JSON file is parsed into `corpus_all_paragraphs`.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not UTF-8 on every system (e.g. Windows)
# and can fail on non-ASCII characters in the paragraph text.
with open("data_by_corpus/papers/all_paragraphs.json", 'r', encoding="utf-8") as f:
    corpus_all_paragraphs = json.load(f)

In [3]:
# Load the tokenizer and the sequence-classification model from the Hugging Face Hub.
# NOTE(review): the notebook calls this model MatBERT, but the checkpoint loaded
# here is "textattack/bert-base-uncased-SST-2" -- confirm it is the intended one.
checkpoint = "textattack/bert-base-uncased-SST-2"
token = os.getenv("MATBERT_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_auth_token=token)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, use_auth_token=token)
model = model.to(device)

# Punkt sentence tokenizer configured with a custom list of abbreviations
abbreviation = ['fig', 'al', 'e.g', 'i.e']
punkt_param = PunktParameters()
punkt_param.abbrev_types = set(abbreviation)
sentence_tokenizer = PunktSentenceTokenizer(punkt_param)



pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Here we loop over every paper and paragraph and evaluate our MatBERT model on each sentence as tokenized by `sentence_tokenizer`. If the predicted label of a sentence (the argmax of the model's logits) is 1, then we add that sentence to `extracted_sentences`. The latter is a list of dictionaries, each containing the keys `sentences`, `doi`, and `paragraph`. `sentences` is a list of sentences extracted from `paragraph` of the paper with DOI `doi`.

In [4]:
def _predict_labels(sentences, batch_size=8):
    """Return the predicted class index for each sentence.

    Parameters
    ----------
    sentences : list of str
        Sentences to classify; must be non-empty.
    batch_size : int, optional
        Number of sentences sent to the model per forward pass.

    Returns
    -------
    list of int
        Argmax of the model logits, one entry per input sentence, in order.
    """
    encoded = tokenizer(sentences, padding=True, truncation=True, max_length=512)
    dataset = Dataset.from_dict(encoded)
    dataset.set_format('torch')

    labels = []
    for batch in DataLoader(dataset, batch_size=batch_size):
        batch = {k: v.to(device) for k, v in batch.items() if torch.is_tensor(v)}
        with torch.no_grad():
            logits = model(**batch).logits
        labels.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    return labels


def _dedup_key(para_id):
    """Return a hashable stand-in for a paragraph id.

    The ids come from a JSON file, so they are JSON-serialisable but not
    necessarily hashable (e.g. a nested object); serialising them gives a
    key that can be stored in a set.
    """
    return json.dumps(para_id, sort_keys=True)


extracted_sentences = []
# Keys of the paragraphs already added to `extracted_sentences`. A set gives
# constant-time lookups instead of rebuilding a list of ids for every paragraph.
extracted_keys = set()

# Switch to evaluation mode once, before any forward pass
model.eval()

for doi, paragraphs in corpus_all_paragraphs.items():
    for para in paragraphs:

        # To prevent extracting the same paragraph twice. Paragraphs without
        # an '_id' cannot be deduplicated and are always processed.
        para_id = para.get('_id')
        para_key = _dedup_key(para_id) if para_id is not None else None
        if para_key is not None and para_key in extracted_keys:
            continue

        # Some paragraphs could be empty (or contain only whitespace)
        text = para.get('text')
        if not text or not text.strip():
            continue

        # Get all the sentences in the paragraph
        sentences = sentence_tokenizer.tokenize(text)
        if not sentences:
            continue

        # Keep the sentences the model assigns to label 1
        predictions = _predict_labels(sentences)
        selected = [s for s, label in zip(sentences, predictions) if label == 1]

        # If we have at least one good sentence, add it/them to the list.
        # There is deliberately no `break` here: the remaining paragraphs of
        # the paper must still be evaluated.
        if selected:
            extracted_sentences.append({
                'sentences': selected,
                'doi': doi,
                'paragraph': para,
            })
            if para_key is not None:
                extracted_keys.add(para_key)

with open('extracted_sentences.json', 'w', encoding='utf-8') as f:
    json.dump(extracted_sentences, f, indent=2)

In [5]:
from utils import *

# Training best BERT model
The manually labeled data used for hyperparameter tuning of various BERT models can be found in "data/manually_labelled_sentences.csv". Each sentence has a label of either 1 for extraction or 0 for non-extraction.

In [6]:
# pd.read_csv("data/manually_labelled_sentences.csv", index_col=0)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]