In [1]:
from transformers import pipeline
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

import evaluate
from evaluate import evaluator
# Load datasets
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


# Dataset

ACL-ARC from here: https://github.com/allenai/scicite

In [20]:
train_dataset = load_dataset('json', data_files='acl-arc/train.jsonl', split="train")
valid_dataset = load_dataset('json', data_files='acl-arc/test.jsonl', split="train")

# Helper Functions

In [6]:
def get_classifier(model_checkpoint):
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
    classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)
    return classifier

# Model: MultiCite

In [21]:
multicite = get_classifier('allenai/multicite-multilabel-scibert')

In [8]:
# test 
multicite("We use this as background")

[{'label': 'uses', 'score': 0.9840075373649597}]

In [10]:
predictions = multicite(valid_dataset['text'])

In [11]:
set([valid_dataset[i]['intent'] for i in range(len(valid_dataset))])

{'Background', 'CompareOrContrast', 'Extends', 'Future', 'Motivation', 'Uses'}

In [12]:
accuracy = evaluate.load('accuracy')

references=[valid_dataset[i]['intent'].lower() for i in range(len(valid_dataset))]
predictions=[result['label'] for result in predictions]

# Assuming 'labels_dict' maps unique string labels to integers
labels_dict = {'background': 0, 'uses': 1, 'motivation': 2,
               'extends': 3, 'differences': 4, 'similarities':4,
               'compareorcontrast': 4, 'future_work': 5, 'future': 5,}  

# Convert 'references' and 'predictions' to integer labels
references_int = [labels_dict[label] for label in references]
predictions_int = [labels_dict[pred] for pred in predictions]

acc = accuracy.compute(
    references=references_int, 
    predictions=predictions_int)
print(acc)

{'accuracy': 0.6618705035971223}


# Combined Metric

In [None]:
clf_metrics = evaluate.combine(['accuracy', 'f1', 'precision', 'recall'])
clf_metrics.compute(references=references_int, predictions=predictions_int)

labels_dict = {'background': 0, 'uses': 1, 'motivation': 2,
               'extends': 3, 'differences': 4, 'similarities':4,
               'CompareOrContrast': 4, 'CompareOrContrast':4, 'future_work': 5, 'future': 5,
               'Background':0, 'Extends':3, 'Future':5, 'Motivation':2, 'Uses':1}  

task_evaluator = evaluator(task='text-classification')
acc = task_evaluator.compute(model_or_pipeline = classifier, 
                             data=valid_dataset,
                             metric = "accuracy",  #clf_metrics, 
                             label_mapping = labels_dict,
                             label_column="intent")

# SciBERT

In [22]:
scibert = get_classifier('allenai/scibert_scivocab_uncased')

# test
scibert("We use this as background")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'label': 'LABEL_1', 'score': 0.7157976031303406}]

In [32]:
dataset = load_dataset('yxchar/citation_intent-tlm', split="train")
results = scibert(dataset['text'])
print(results)

[{'label': 'LABEL_1', 'score': 0.5574931502342224}, {'label': 'LABEL_1', 'score': 0.5810647010803223}, {'label': 'LABEL_0', 'score': 0.578150749206543}, {'label': 'LABEL_1', 'score': 0.6656164526939392}, {'label': 'LABEL_1', 'score': 0.5358041524887085}, {'label': 'LABEL_1', 'score': 0.5122213959693909}, {'label': 'LABEL_0', 'score': 0.523943305015564}, {'label': 'LABEL_1', 'score': 0.569559633731842}, {'label': 'LABEL_0', 'score': 0.5583556294441223}, {'label': 'LABEL_0', 'score': 0.5452756881713867}, {'label': 'LABEL_1', 'score': 0.6105943918228149}, {'label': 'LABEL_1', 'score': 0.5139060020446777}, {'label': 'LABEL_1', 'score': 0.5250560641288757}, {'label': 'LABEL_1', 'score': 0.5606197118759155}, {'label': 'LABEL_1', 'score': 0.574708104133606}, {'label': 'LABEL_1', 'score': 0.5272637605667114}, {'label': 'LABEL_1', 'score': 0.5452248454093933}, {'label': 'LABEL_0', 'score': 0.5155790448188782}, {'label': 'LABEL_1', 'score': 0.5063885450363159}, {'label': 'LABEL_1', 'score': 0.55

In [43]:
temp = Counter(dataset['label'])
print(temp)

Counter({0: 867, 1: 317, 2: 305, 4: 76, 3: 63, 5: 60})


In [40]:
clf_metrics = evaluate.combine(['accuracy', 'f1', 'precision', 'recall'])
#clf_metrics.compute(references=references_int, predictions=predictions_int)

labels_dict = {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2,
               'LABEL_3': 3, 'LABEL_4': 4, 'LABEL_5':5,}  

task_evaluator = evaluator(task='text-classification')
acc = task_evaluator.compute(model_or_pipeline = scibert, 
                             data=dataset,
                             metric = evaluate.load("accuracy"), #clf_metrics, 
                             label_mapping = labels_dict,
                             label_column="label")

In [41]:
acc

{'accuracy': 0.31398104265402843,
 'total_time_in_seconds': 56.985338915954344,
 'samples_per_second': 29.62165413264579,
 'latency_in_seconds': 0.03375908703551798}

In [15]:
from collections import Counter
predictions =[result['label'] for result in results]

prediction_counts = Counter(predictions)
print(prediction_counts)

Counter({'LABEL_0': 136, 'LABEL_1': 3})


# Model: yxchar/tlm-citation_intent-small-scale

In [16]:
yxchar = get_classifier('yxchar/tlm-citation_intent-small-scale')
results = yxchar(valid_dataset['text'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at yxchar/tlm-citation_intent-small-scale and are newly initialized: ['bert.pooler.dense.bias', 'classifier.bias', 'bert.pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
predictions =[result['label'] for result in results]

prediction_counts = Counter(predictions)
print(prediction_counts)

Counter({'LABEL_0': 123, 'LABEL_1': 16})


# Model: Roberta

In [18]:
roberta_1 = get_classifier('allenai/dsp_roberta_base_tapt_citation_intent_1688')
results = roberta_1(valid_dataset['text'])

predictions =[result['label'] for result in results]
prediction_counts = Counter(predictions)
print(prediction_counts)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at allenai/dsp_roberta_base_tapt_citation_intent_1688 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({'LABEL_1': 139})


In [19]:
roberta_2 = get_classifier('allenai/dsp_roberta_base_dapt_cs_tapt_citation_intent_1688')
results = roberta_2(valid_dataset['text'])

predictions =[result['label'] for result in results]
prediction_counts = Counter(predictions)
print(prediction_counts)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at allenai/dsp_roberta_base_dapt_cs_tapt_citation_intent_1688 and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Counter({'LABEL_0': 139})
