In [71]:
import os
import sys
import json
import argparse
from sklearn.model_selection import KFold

import torch
import torchmetrics
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn.modules.loss import BCELoss
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, AutoConfig, AutoModel, AutoTokenizer, get_linear_schedule_with_warmup


class AltTextSentenceDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """
        Example data entry
        {
          "corpus_id": 14745329,
          "sent_id": 0,
          "text": "A graph of the latencies for each transcript (professional, automatic and crowd).",
          "labels": [
            1,
            0,
            0,
            0
          ]
        }
        """
        current_item = self.data[idx]
        text = current_item['text']
        token_ids = self.tokenizer.encode(text, max_length=512, truncation=True)
        labels = current_item['labels']
        labels_float = [float(l) for l in labels]

        return {
            "text": token_ids,
            "labels": labels,
            "labels_float": labels_float
        }

    @staticmethod
    def collate_fn(data):
        token_ids = [torch.tensor(entry["text"]) for entry in data]
        labels = [torch.tensor(entry["labels"]) for entry in data]
        labels_float = [torch.tensor(entry["labels_float"]) for entry in data]
        token_ids_tensor = torch.nn.utils.rnn.pad_sequence(token_ids, batch_first=True, padding_value=0)
        labels_tensor = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)
        labels_float_tensor = torch.nn.utils.rnn.pad_sequence(labels_float, batch_first=True, padding_value=-100)
        return {
            "input_ids": token_ids_tensor,
            "labels": labels_tensor,
            "labels_float": labels_float_tensor
        }


class DataModule(LightningDataModule):
    def __init__(
            self,
            model_name_or_path: str,
            train_file: str,
            val_file: str,
            pred_file: str,
            max_seq_length: int = 512,
            batch_size: int = 4,
            **kwargs,
    ):
        super().__init__()
        self.train_file = train_file
        self.val_file = val_file
        self.pred_file = pred_file
        self.model_name_or_path = model_name_or_path
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name_or_path, use_fast=True, max_length=max_seq_length
        )

    def setup(self, stage="fit"):
        def load_data(path):
            data = []
            with open(path) as fin:
                for line in fin:
                    data.append(json.loads(line))
            return data

        if stage == 'fit':
            train_data = load_data(self.train_file)
            self.train_dataset = AltTextSentenceDataset(train_data, self.tokenizer)
            val_data = load_data(self.val_file)
            self.val_dataset = AltTextSentenceDataset(val_data, self.tokenizer)
            
        if stage == 'validate':
            val_data = load_data(self.val_file)
            self.val_dataset = AltTextSentenceDataset(val_data, self.tokenizer)

        if stage == 'predict':
            pred_data = load_data(self.pred_file)
            self.pred_dataset = AltTextSentenceDataset(pred_data, self.tokenizer)

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            collate_fn=AltTextSentenceDataset.collate_fn,
            num_workers=1
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            collate_fn=AltTextSentenceDataset.collate_fn,
            num_workers=1
        )

    def predict_dataloader(self):
        return DataLoader(
            self.pred_dataset,
            batch_size=self.batch_size,
            collate_fn=AltTextSentenceDataset.collate_fn,
            num_workers=1
        )

class TransformerModule(LightningModule):
    def __init__(
            self,
            model_name_or_path: str,
            num_labels: int = 4,
            learning_rate: float = 3e-5,
            adam_epsilon: float = 1e-8,
            warmup_steps: int = 0,
            weight_decay: float = 0.0,
            max_seq_length: int = 512,
            batch_size: int = 4,
            **kwargs,
    ):
        super().__init__()
        self.save_hyperparameters()
        self.config = AutoConfig.from_pretrained(model_name_or_path)
        self.model = AutoModel.from_pretrained(model_name_or_path, config=self.config)
        self.classifier = torch.nn.Linear(768, 4)
        self.sigmoid = torch.nn.Sigmoid()
        self.loss_fn = BCELoss()
        self.metric_acc = torchmetrics.Accuracy()
        self.metric_f1 = torchmetrics.F1()
        self.num_labels = num_labels
        self.batch_size = batch_size
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path, use_fast=True, max_length=max_seq_length,
        )

    def forward(self, **inputs):
        # inputs['input_ids'].shape -> [batch_size, max_len]
        output = self.model(inputs["input_ids"])
        # cls_output_state.shape -> [batch_size, 768]
        cls_output_state = output["last_hidden_state"][inputs["input_ids"] == self.tokenizer.cls_token_id]
        # logits.shape -> [batch_size, num_labels] -> [num_labels * batch_size]
        logits = self.classifier(cls_output_state)
        probs = self.sigmoid(logits)
        probs_flat = probs.view(-1)
        # labels_flat.shape -> [num_labels * batch_size]
        labels = inputs["labels"]
        # print("labels",labels)
        if 'labels_float' not in inputs:
            temp_float = [float(l) for l in labels]
            inputs['labels_float']=torch.tensor(temp_float)
            # print("labels_float", inputs["labels_float"])
        labels_float = inputs["labels_float"]
        labels_flat = labels_float.view(-1)
        loss = self.loss_fn(probs_flat, labels_flat)
        return loss, probs, labels

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs[0]
        acc = self.metric_acc(outputs[1].view(-1), outputs[2].view(-1))
        self.log("loss", loss)
        self.log("acc", acc)
        return loss

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(**batch)
        val_loss, probs, labels = outputs
        preds = torch.round(probs)
        self.log("val_loss", val_loss)
        # self.log("acc", acc)
        return {"loss": val_loss, "val_loss": val_loss, "preds": preds, "labels": labels}

    def validation_epoch_end(self, outputs):
        preds = torch.cat([x["preds"] for x in outputs]).detach().cpu()
        labels = torch.cat([x["labels"] for x in outputs]).detach().cpu()
        loss = torch.stack([x["loss"] for x in outputs]).mean()
        self.log("val_loss", loss)
        val_acc = self.metric_acc(preds.view(-1), labels.view(-1))
        val_f1 = self.metric_f1(preds.view(-1), labels.view(-1))
        self.log("val_acc", val_acc, prog_bar=True)
        self.log("val_f1", val_f1, prog_bar=True)
        return loss

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(**batch)
        _, probs, _ = outputs
        preds = torch.round(probs)
        return {"preds": preds}
        
    def setup(self, stage=None) -> None:
        if stage != "fit":
            return
        train_loader = self.train_dataloader()
        tb_size = self.hparams.batch_size * max(1, len(self.trainer.gpus) if self.trainer.gpus else 0)
        ab_size = self.trainer.accumulate_grad_batches * float(self.trainer.max_epochs)
        self.total_steps = (len(train_loader.dataset) // tb_size) // ab_size

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.total_steps,
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]


def train(model, train_file, val_file, pred_file, outdir, logname, device_type, devices):
    dm = DataModule(train_file=train_file, val_file=val_file, pred_file=pred_file, model_name_or_path=model)
    dm.setup(stage="fit")
    model = TransformerModule(warmup_steps=200, model_name_or_path=model)
    logger = TensorBoardLogger(outdir, name=logname)
    print(os.path.join(outdir, 'checkpoints'))
    checkpoint_callback = ModelCheckpoint(
        dirpath=os.path.join(outdir, 'checkpoints'),
        save_top_k=1,
        verbose=True,
        monitor='val_loss',
        mode='min'
    )
    trainer = Trainer(
        accelerator=device_type,
        devices=devices,
        progress_bar_refresh_rate=5,
        max_epochs=15,
        default_root_dir=outdir,
        logger=logger,
        callbacks=[checkpoint_callback]
    )
    trainer.fit(model, dm)

    dm.setup(stage="validate")

    trainer.validate(model, dm.val_dataloader())

    dm.setup(stage="predict")
    predictions = trainer.predict(model, dm.predict_dataloader())
    pred_list = []
    for pred in predictions:
        pred_list.append(pred['preds'].cpu())
    pred_tensor = torch.cat(pred_list, dim = 0)
    return pred_tensor

# def predict(model, model_path, pred_file):
#     model = TransformerModule(model_name_or_path=model).load_from_checkpoint(model_path)
#     dm = DataModule(model_name_or_path=model, train_file=None, val_file=None, pred_file=pred_file)
#     dm.setup(stage="predict")
#     trainer = Trainer(
#         accelerator=device_type,
#         devices=devices,
#         progress_bar_refresh_rate=5,)
#     predictions = trainer.predict(model, dm)
#     pred_list = []
#     for pred in predictions:
#         pred_list.append(pred['preds'])

#     pred_tensor = torch.cat(pred_list, dim = 0)
#     return pred_tensor


## Model trained on all data

In [72]:
# # parser = argparse.ArgumentParser()
# parser.add_argument("--model", type=str, help="Name of model")
# parser.add_argument("--data", type=str, help="Path to data directory")
# parser.add_argument("--outdir", type=str, help="Path to save output")
# parser.add_argument("--preds", type=str, help="Path to process file for predictions")

# args = parser.parse_args()

model_name = 'allenai/scibert_scivocab_uncased'
data_dir = 'all_data/'
out_dir = os.path.join('all_data/', model_name.replace('/', '_'))
print(out_dir)
pred_file = 'only_first.jsonl'

if not os.path.exists(data_dir):
    print('Data path does not exist!')
    sys.exit(-1)
os.makedirs(out_dir, exist_ok=True)

# check if GPUs available
gpu_count = torch.cuda.device_count()
if gpu_count == 0:
    device_type = 'cpu'
    devices = None
else:
    device_type = 'gpu'
    devices = [0]

predictions = torch.zeros(10, 4)
train_file = os.path.join(data_dir, 'train.jsonl')
val_file = os.path.join(data_dir, 'val.jsonl')
print(train_file, val_file)
if not os.path.exists(train_file):
    raise FileNotFoundError(f"{train_file} not found!")
if not os.path.exists(val_file):
    raise FileNotFoundError(f"{val_file} not found!")
out_subdir = out_dir
os.makedirs(out_subdir, exist_ok=True)
logger_name = 'logs'
predictions+=train(model_name, train_file, val_file, pred_file, out_subdir, logger_name, device_type, devices)

all_data/allenai_scibert_scivocab_uncased
all_data/train.jsonl all_data/val.jsonl


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

all_data/allenai_scibert_scivocab_uncased/checkpoints


Validation sanity check: 0it [00:00, ?it/s]

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 419: val_loss reached 0.23826 (best 0.23826), saving model to "all_data/allenai_scibert_scivocab_uncased/checkpoints/epoch=0-step=419-v1.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 839: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 1259: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 1679: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 2099: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 2519: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 2939: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 7, global step 3359: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 8, global step 3779: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 9, global step 4199: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 10, global step 4619: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 11, global step 5039: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 12, global step 5459: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 13, global step 5879: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 14, global step 6299: val_loss was not in top 1
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validating: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 VALIDATE RESULTS
{'val_acc': 0.9213759303092957,
 'val_f1': 0.853881299495697,
 'val_loss': 0.23826251924037933}
--------------------------------------------------------------------------------


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 420it [00:00, ?it/s]

## 5 fold cross validation model

In [None]:
# # parser = argparse.ArgumentParser()
# parser.add_argument("--model", type=str, help="Name of model")
# parser.add_argument("--data", type=str, help="Path to data directory")
# parser.add_argument("--outdir", type=str, help="Path to save output")
# parser.add_argument("--preds", type=str, help="Path to process file for predictions")

# args = parser.parse_args()

model_name = 'allenai/scibert_scivocab_uncased'
data_dir = 'data_splits/'
out_dir = os.path.join('.', model_name.replace('/', '_'))
pred_file = 'only_first.jsonl'

if not os.path.exists(data_dir):
    print('Data path does not exist!')
    sys.exit(-1)
os.makedirs(out_dir, exist_ok=True)

# check if GPUs available
gpu_count = torch.cuda.device_count()
if gpu_count == 0:
    device_type = 'cpu'
    devices = None
else:
    device_type = 'gpu'
    devices = [0]

In [None]:
predictions = torch.zeros(10, 4)
# get folds
for i in range(5):
    print(f'Fold {i}')
    train_file = os.path.join(data_dir, f'{i:02d}', 'train.jsonl')
    val_file = os.path.join(data_dir, f'{i:02d}', 'val.jsonl')
    if not os.path.exists(train_file):
        raise FileNotFoundError(f"{train_file} not found!")
    if not os.path.exists(val_file):
        raise FileNotFoundError(f"{val_file} not found!")
    out_subdir = os.path.join(out_dir, f'fold_{i:02d}')
    os.makedirs(out_subdir, exist_ok=True)
    logger_name = 'logs'
    predictions+=train(model_name, train_file, val_file, pred_file, out_subdir, logger_name, device_type, devices)

In [None]:
# for i in range(5):
#     out_subdir = os.path.join(out_dir, f'fold_{i:02d}', 'checkpoints')
#     filename = os.listdir(out_subdir)[0]
#     print("checkpoint",out_subdir)
#     model_path = out_subdir +'/'+filename
#     model_name = "allenai/scibert_scivocab_uncased"
#     print("model path", model_path)
#     predictions+=predict(model = model_name, model_path = model_path, pred_file = pred_file).cpu()

pred_index = predictions/5
# print(pred_index)
print(f"Total Examples: {pred_index.shape[0]}")
print(f"Level 1 Percentage: {torch.sum(pred_index[:, 0])/pred_index.shape[0]}")
print(f"Level 2 Percentage: {torch.sum(pred_index[:, 1])/pred_index.shape[0]}")
print(f"Level 3 Percentage: {torch.sum(pred_index[:, 2])/pred_index.shape[0]}")
print(f"Level 4 Percentage: {torch.sum(pred_index[:, 3])/pred_index.shape[0]}")
print('done.')

In [4]:
model_name = "allenai/scibert_scivocab_uncased"
out_dir = os.path.join('.', model_name.replace('/', '_'))
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, max_length=512)
sentence = {"corpus_id": 1000, "sent_id": 0, "text": "Increasing from 0% of the population, both plots start at 100% differentiable and gradually fall to 80% differentiable at 75% of the population", "labels": [-1, -1, -1, -1]}
tokenized_input = tokenizer.encode(sentence['text'], max_length=512)
input_ids = torch.tensor([tokenized_input])
print(sentence)
aggregated_predictions = torch.zeros(1, 4) #length of sentence

for i in range(5):
    out_subdir = os.path.join(out_dir, f'fold_{i:02d}', 'checkpoints')
    filename = os.listdir(out_subdir)[0]
    model_path = out_subdir +'/'+filename
    # print("model path", model_path)
    model = TransformerModule.load_from_checkpoint(model_path)
    model.eval() 
    with torch.no_grad():
        all = model(input_ids=input_ids, labels=sentence['labels'])
        # print(all)
        logits = all[1]
        aggregated_predictions += torch.round(logits)

average_predictions = aggregated_predictions / 5

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'corpus_id': 1000, 'sent_id': 0, 'text': 'Increasing from 0% of the population, both plots start at 100% differentiable and gradually fall to 80% differentiable at 75% of the population', 'labels': [-1, -1, -1, -1]}


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(tensor(-0.3669), tensor([[0.1911, 0.6914, 0.6671, 0.0174]]), [-1, -1, -1, -1])


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(tensor(-1.2254), tensor([[0.0707, 0.3046, 0.6819, 0.0205]]), [-1, -1, -1, -1])


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(tensor(0.4609), tensor([[0.1916, 0.6992, 0.8915, 0.0343]]), [-1, -1, -1, -1])


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(tensor(-0.6819), tensor([[0.1409, 0.4799, 0.6503, 0.0339]]), [-1, -1, -1, -1])


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(tensor(-0.1918), tensor([[0.0540, 0.8409, 0.6925, 0.0298]]), [-1, -1, -1, -1])


In [5]:
average_predictions

tensor([[0.0000, 0.6000, 1.0000, 0.0000]])