<a href="https://colab.research.google.com/github/biodatlab/score-claim-extraction/blob/main/Claim_Extraction_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Claim Extraction Training Notebook
This notebook preprocesses the datasets, trains the models, and then evaluates them.

## Setting Up

In [None]:
# Setup cell: installs the packages this notebook uses.
# Restart the runtime after running this cell, because the installs change the numpy version.
# Only transformers is pinned (4.28.0); every other package is installed unpinned.

# Hugging Face libraries (transformers, datasets and evaluate are imported in later cells).
!pip install transformers==4.28.0
!pip install -U sentence-transformers
!pip install datasets
!pip install evaluate
!pip install accelerate -U
# TensorFlow add-ons imported in the imports cell (tensorflow_text, official.nlp).
!pip install -U tensorflow-text
!pip install tf-models-official
# spaCy plus the small English pipeline that is loaded with spacy.load("en_core_web_sm").
!pip install -U spacy
!python -m spacy download en_core_web_sm
# fuzzywuzzy provides fuzz.ratio for the sentence/claim matching;
# python-Levenshtein is presumably installed to speed it up — TODO confirm.
!pip install fuzzywuzzy
!pip install python-Levenshtein
# tabulate renders the final results table.
!pip install tabulate


In [None]:
# Log into hugging face with your token
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
import os
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import matplotlib.pyplot as plt
import spacy
from official.nlp import optimization

# Keep TensorFlow's logger quiet: only ERROR-level messages and above are shown.
tf.get_logger().setLevel('ERROR')

# Fixed seed for NumPy's global random number generator.
np.random.seed(54)

# Small English spaCy pipeline; used later to split abstracts into sentences.
nlp = spacy.load("en_core_web_sm")

In [None]:
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Stand-in for ``locale.getpreferredencoding`` that always reports UTF-8.

    Args:
        do_setlocale: Accepted only so the replacement keeps the standard-library
            signature; callers that pass it (e.g. ``getpreferredencoding(False)``)
            no longer raise ``TypeError``. The value is ignored.

    Returns:
        The string ``"UTF-8"``.
    """
    return "UTF-8"


# Report UTF-8 as the preferred encoding for the rest of the session.
# NOTE(review): presumably a workaround for Colab shell commands that require a
# UTF-8 locale — confirm it is still needed.
locale.getpreferredencoding = _utf8_preferred_encoding

## Type 1 Preprocessing

This part preprocesses Type 1 Data. It is designed for a Pandas Dataframe.
Essentially the dataframe needs 2 columns - 'abstract' and 'claim'

1. abstract (string) : This is the original abstract from the paper
2. claim (string) : Human-coded claim for the abstract. We'll fuzzy compare this to sentences in the abstract to find the claim

If you do not have human-coded data, you can skip this part

In [None]:
# Point this at your own CSV file by replacing 'train_data.csv'.
# Only the two columns used by the rest of the Type 1 pipeline are kept.
data = pd.read_csv('train_data.csv').loc[:, ['abstract', 'claim']]
data.head()

### Fuzzy comparison to match sentences to human-coded claim

In [None]:
from fuzzywuzzy import fuzz

# One dict per row of `data` (keys are the column names 'abstract' and 'claim').
datasets = data.to_dict("records")

# For every record, split the abstract into sentences with spaCy and label a
# sentence 1 when its fuzzy similarity to the human-coded claim is above 60,
# otherwise 0. The abstract is passed through str() before being parsed.
type1_data = [
  {
      "abstract": record["abstract"],
      "claim": record["claim"],
      "annotations": [
        {
          "text": sentence.text,
          "label": 1 if fuzz.ratio(sentence.text, record["claim"]) > 60 else 0,
        }
        for sentence in nlp(str(record["abstract"])).sents
      ],
  }
  for record in datasets
]

### Train-Test Split

In [None]:
from itertools import chain
from sklearn.model_selection import train_test_split

# The split is done per abstract, so all sentences of one abstract end up in the
# same split. dict.fromkeys drops repeated abstracts (keeping first-seen order);
# otherwise the same abstract could be drawn into more than one split.
abstracts = list(dict.fromkeys(row['abstract'] for row in type1_data))

# 80% of the abstracts go to training; the remaining 20% are held out.
train_ids, holdout_ids = train_test_split(abstracts, test_size=0.2, random_state=54)
# Halve the held-out abstracts only. Splitting the full `abstracts` list here
# would put training abstracts into the validation and test sets.
val_ids, test_ids = train_test_split(holdout_ids, test_size=0.5, random_state=54)


def _sentences_for(abstract_ids):
  """Return the flattened sentence annotations of records whose abstract is in ``abstract_ids``."""
  wanted = set(abstract_ids)  # set membership instead of scanning a list per record
  return list(chain.from_iterable(
      record["annotations"] for record in type1_data if record["abstract"] in wanted
  ))


# Each entry is a {"text": ..., "label": ...} dict for one sentence.
type1_train = _sentences_for(train_ids)
type1_val = _sentences_for(val_ids)
type1_test = _sentences_for(test_ids)

### Create datasets.Dataset() object and exporting

In [None]:
from datasets import load_dataset, Dataset

# Wrap each list of sentence records in a Dataset, rebinding the same three names.
type1_train, type1_val, type1_test = (
    Dataset.from_list(rows) for rows in (type1_train, type1_val, type1_test)
)

## Type 2 Data Loading

If your data is already preprocessed, you can use this part of the code. It expects `datasets.Dataset` data stored as CSV files. Each file should have two columns, 'text' and 'label', and is loaded as a single split named 'train'.

- text (string) : A sentence from the abstract.
- label (int) : 1 if the text is a claim, 0 otherwise


In [None]:
# Read the three preprocessed CSV files with the 'csv' loader. No split is
# requested here; the cells below access the rows under the 'train' key.
type2_train, type2_val, type2_test = (
    load_dataset('csv', data_files=csv_path)
    for csv_path in ('type2_train.csv', 'type2_val.csv', 'type2_test.csv')
)

## Merging and Shuffling Datasets

In [None]:
##### Uncomment to concatenate type1 and type2 data
# from datasets import concatenate_datasets

# comb_train = concatenate_datasets([type1_train, type2_train['train']])
# comb_val = concatenate_datasets([type1_val, type2_val['train']])
# comb_test = concatenate_datasets([type1_test, type2_test['train']])

# If you are concatenating, comment these next 3 lines out
# load_dataset() was called without a split, so each type2_* object is a
# dict-like container of splits. Take its 'train' split (as the concatenation
# code above does) so comb_* are plain Dataset objects, which is what the
# Trainer and the evaluator expect.
comb_train = type2_train['train']
comb_val = type2_val['train']
comb_test = type2_test['train']

In [None]:
# Shuffle every split with the same fixed seed, rebinding the same three names.
comb_train, comb_val, comb_test = (
    split.shuffle(seed=54) for split in (comb_train, comb_val, comb_test)
)

## Model

### Model Initiation

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the SciBERT checkpoint. The same checkpoint name is passed to
# BertForSequenceClassification.from_pretrained in the fine-tuning cell, and this
# notebook-level `tokenizer` is the one read by preprocessor().
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_uncased")

In [None]:
from transformers import DataCollatorWithPadding

def preprocessor(batch):
    """Tokenize the 'text' entry of ``batch`` with the notebook-level ``tokenizer``.

    Truncation is enabled; no padding argument is passed here.

    Args:
        batch: Mapping with a 'text' entry (this function is used with
            ``Dataset.map(..., batched=True)`` in the fine-tuning cell).

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True)
    return encoded


def get_collator(tokenizer):
  """Build a ``DataCollatorWithPadding`` bound to ``tokenizer``.

  Args:
      tokenizer: Tokenizer handed to the collator.

  Returns:
      The newly created ``DataCollatorWithPadding`` instance.
  """
  return DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate

# Bundle the four classification metrics so a single compute() call reports all of them.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])


def compute_metrics(eval_pred):
  """Score one evaluation pass.

  Args:
      eval_pred: Pair ``(preds, labels)``; ``preds`` is reduced with an argmax
          over axis 1 to obtain one predicted class id per row.

  Returns:
      The result of ``clf_metrics.compute`` for those predictions and labels.
  """
  scores, references = eval_pred
  predicted_ids = np.argmax(scores, axis=1)
  return clf_metrics.compute(references=references, predictions=predicted_ids)

In [None]:
#@title Select Hyperparameters

# One model is fine-tuned per entry of this list (the fine-tuning cell loops over it).
learning_rates = [2e-05,3e-05] #@param {type:'raw'}
# Passed to TrainingArguments as num_train_epochs.
num_epochs = 6 #@param {type:'integer'}
# Used as both per_device_train_batch_size and per_device_eval_batch_size.
batch_size = 32 #@param {type:'integer'}
# Passed to TrainingArguments as weight_decay.
weight_decay=0.01 #@param {type:'number'}

### Fine-tuning

In [None]:
from transformers import TrainingArguments,Trainer
from transformers import BertForSequenceClassification

# Class-id <-> label-name mappings passed to the model; label2id is reused by
# the evaluation cell as the evaluator's label mapping.
id2label = {0: "Null", 1: "Claim"}
label2id = {"Null": 0, "Claim": 1}

# Tokenization and the collator do not depend on the learning rate, so they are
# prepared once here instead of once per run.
tokenized_train = comb_train.map(preprocessor, batched=True)
tokenized_val = comb_val.map(preprocessor, batched=True)
collator = get_collator(tokenizer)

# Fine-tune one model per learning rate.
for learning_rate in learning_rates:
  # Start every run from the pretrained SciBERT weights.
  model = BertForSequenceClassification.from_pretrained(
      "allenai/scibert_scivocab_uncased",
      num_labels=2,
      id2label=id2label,
      label2id=label2id,
      output_attentions=False,
      output_hidden_states=False,
  )
  # No explicit model.cuda(): the Trainer moves the model to the device chosen
  # by TrainingArguments, so this cell also runs on a CPU-only runtime.

  # The name doubles as the output directory, e.g. "scibert_claim_id_2e-05".
  model_name = f"scibert_claim_id_{learning_rate}"

  training_args = TrainingArguments(
      output_dir=model_name,
      learning_rate=learning_rate,
      per_device_train_batch_size=batch_size,
      per_device_eval_batch_size=batch_size,
      num_train_epochs=num_epochs,
      weight_decay=weight_decay,
      # Evaluate and checkpoint after every epoch; the two strategies must match
      # for load_best_model_at_end to work.
      evaluation_strategy="epoch",
      save_strategy="epoch",
      load_best_model_at_end=True,
      push_to_hub=False,
  )

  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_train,
      eval_dataset=tokenized_val,
      tokenizer=tokenizer,
      data_collator=collator,
      compute_metrics=compute_metrics,
  )

  print(f'|--------------------------Now Training: {model_name} with Learning Rate = {learning_rate}------------------------------|')
  trainer.train()
  # Epoch checkpoints only land in "<model_name>/checkpoint-*". Write the best
  # model (reloaded at the end of training) to "<model_name>/" itself, which is
  # the directory the evaluation cell loads from.
  trainer.save_model(model_name)
  # trainer.push_to_hub()
  print(f'|-----------------------------------------------------------------------------------------------------------------------|')

## Evaluation

In [None]:
from evaluate import evaluator

# Text-classification evaluator; its compute() method is called once per trained
# model in the next cell.
task_evaluator = evaluator("text-classification")

In [None]:
# Change the next line to the model names that you trained.
# The fine-tuning cell names each model f"scibert_claim_id_{learning_rate}",
# so a learning rate of 3e-05 gives "scibert_claim_id_3e-05" (with a hyphen).
models = ["scibert_claim_id_2e-05","scibert_claim_id_3e-05"]

dataset = comb_test
# Label shown in the 'Dataset Name' column of the results table.
dataset_name = "comb_test"
evalres = []

for model_name in models:
  # Training used `model_name` as a relative output directory, so resolve it
  # against the current working directory (this is /content/<model_name> on a
  # default Colab runtime) rather than hard-coding /content.
  model = BertForSequenceClassification.from_pretrained(os.path.abspath(model_name))
  eval_results = task_evaluator.compute(
    model_or_pipeline=model,
    tokenizer=tokenizer,
    data=dataset,
    metric=evaluate.combine(["accuracy", "recall", "precision", "f1"]),
    # Maps the label names stored with the model back to the integer labels in the dataset.
    label_mapping=label2id,
    strategy="simple"
  )
  # Column order must match the table headers printed in the next cell:
  # Model Name, Dataset Name, Accuracy, Precision, Recall, F1 Score.
  evalres.append([
      model_name,
      dataset_name,
      eval_results['accuracy'],
      eval_results['precision'],
      eval_results['recall'],
      eval_results['f1'],
  ])

In [None]:
from tabulate import tabulate
# Render the collected evaluation rows as a GitHub-flavoured markdown table.
print(
    tabulate(
        evalres,
        headers=['Model Name', 'Dataset Name', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
        tablefmt='github',
    )
)