### Definition Generation Evaluation

In [1]:
import gc
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import json
# model = "llama-2"
model = "vicuna"

# Load the JSON data
with open(f'drive/My Drive/PhD/LexDrafter_Paper/dataset/final_{model}_dataset.json', 'r') as json_file:
    data = json.load(json_file)

In [24]:
# found_records = [item for item in data if item.get("term") == "drive with sinusoidal input current"]
# found_records

[{'term': 'drive with sinusoidal input current',
  'celex_id': '32019R1781',
  'original_definition': 'drive with sinusoidal input current means a VSD with a sinusoidal waveform of the input current, characterised by a Total Harmonic Content below 10 %;',
  'generated_definition': "'drive with sinusoidal input current' means a VSD with a sinusoidal waveform of the input current, characterized by a Total Harmonic Content below 10%.",
  'existing_sentences': {'Article 3': ['‘drive with sinusoidal input current’ means a VSD with a sinusoidal waveform of the input current, characterised by a Total Harmonic Content below 10 %;']},
  'existing_record': ['NEW TERM'],
  'scores': {'Article 3': {'article_score': 1, 'statement_scores': [1]}}}]

#### Dataset creation

In [5]:
# Examples:
# predictions = ["hello there general kenobi", "foo bar foobar"]
# references = [
#    ["hello there general kenobi", "hello there !"],
#    ["foo bar foobar"]
# ]
# Initialize lists to store predictions and references
predictions = []
references = []
# Iterate through each item in the JSON data
for item in data:
  celex_id = item["celex_id"]
  if item["existing_record"] == ["NEW TERM"]:
    original_text = item['original_definition']
    term = item['term']
    new_term = f"'{term}'"
    original_text = original_text.replace(term, new_term)
    generated_text = item['generated_definition']

    if ((generated_text != "") and (generated_text != "NO JSON AS AN OUTPUT OBTAINED")):
      # Add the generated text to predictions list
      predictions.append(generated_text)

      # Add the original text to references list
      references_item = [original_text]  # References for this item
      references.append(references_item)
    # else:
    #   print(f"term is {term} for celex_id: {celex_id}")
    #   input("Press Enter to continue to the next iteration...")

In [7]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━

#### BLEU Score
- Reason for choosing
  
  - Length of the generated definitions matters
  - Precise definition is required


In [8]:
import evaluate
bleu = evaluate.load("bleu")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [9]:
# Compute the BLEU score
bleu4_results = bleu.compute(predictions=predictions, references=references, max_order=4)
print(f"BLEU-4: {bleu4_results['bleu']}")

BLEU-4: 0.1837570668669643


In [10]:
# Compute the BLEU score
bleu3_results = bleu.compute(predictions=predictions, references=references, max_order=3)
print(f"BLEU-3: {bleu3_results['bleu']}")

BLEU-3: 0.21463045062072725


In [11]:
# Compute the BLEU score
blue2_results = bleu.compute(predictions=predictions, references=references, max_order=2)
print(f"BLEU-2: {blue2_results['bleu']}")

BLEU-2: 0.2602939269884561


In [12]:
# Compute the BLEU score
blue1_results = bleu.compute(predictions=predictions, references=references, max_order=1)
print(f"BLEU-1: {blue1_results['bleu']}")

BLEU-1: 0.35012966462552014


#### BERTScore

In [13]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=3.0.0 (from bert_score)
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers>=3.0.0->bert_score)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers>=3.0.0->bert_score)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x

In [14]:
import evaluate
bertscore = evaluate.load("bertscore")

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [15]:
bertscore_results = bertscore.compute(predictions=predictions, references=references, model_type="distilbert-base-uncased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [16]:
# Extract the F1 scores
f1_scores = bertscore_results['f1']

# Calculate the mean of F1 scores
mean_f1 = sum(f1_scores) / len(f1_scores)

In [17]:
print(f"BERTScore: {bertscore_results['f1']}")
print("--------")
print(f"Mean BERTScore: {mean_f1}")

BERTScore: [0.7893481254577637, 0.8395156264305115, 0.7065520286560059, 0.9032556414604187, 0.7933623194694519, 0.8514114618301392, 0.8437336683273315, 0.8123737573623657, 0.869597315788269, 0.8514766097068787, 0.8150394558906555, 0.8129146099090576, 0.8722251653671265, 0.7925906181335449, 0.8105833530426025, 0.9143986701965332, 0.8894253969192505, 0.7788251638412476, 0.8122238516807556, 0.9918109178543091, 0.9909533858299255, 0.981783390045166, 0.9292769432067871, 0.7836194634437561, 0.798564612865448, 0.7462249994277954, 0.855783224105835, 0.8597813248634338, 0.8340144753456116, 0.9780644178390503, 0.9619945883750916, 0.8170099854469299, 0.9654687643051147, 0.819178581237793, 0.7479830384254456, 0.7646585702896118, 0.802691638469696, 0.8391741514205933, 0.8350458741188049, 0.7831259369850159, 0.7674673795700073, 0.74443119764328, 0.8301801681518555, 0.8558171987533569, 0.7622718214988708, 0.7788939476013184, 0.773999810218811, 0.8869292140007019, 0.8441084623336792, 0.740773797035217

#### (NOT USED) BERTScore

In [None]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=3.0.0 (from bert_score)
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m71.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers>=3.0.0->bert_score)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers>=3.0.0->bert_score)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [9

In [None]:
!pip install datasets
from datasets import load_metric

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2.14.

In [None]:
bertscore_metric = load_metric('bertscore')

  bertscore_metric = load_metric('bertscore')


Downloading builder script:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

In [None]:
def bertscore_evaluation(generated_text, actual_text):
  bert_scores = bertscore_metric.compute(predictions=[generated_text], references=[actual_text], lang="en")
  return bert_scores['f1']

In [None]:
original_text = """
"import duty" means customs duty payable on the import of goods;"""



vicuna_text = """
Import duty is a tax or fee charged by a country on goods that are imported into that country. It is typically imposed to raise revenue for the government or to protect domestic industries from foreign competition.
"""

In [None]:
bertscore_value = bertscore_evaluation(vicuna_text, original_text)

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
bertscore_value

[0.8773407340049744]

#### (NOT USED) ROUGE
- Selected: https://github.com/danieldeutsch/sacrerouge/tree/master
- Refering to the paper: https://aclanthology.org/2023.acl-long.107.pdf

In [None]:
!pip install sacrerouge==0.2.5

In [None]:
!pip install urllib3==1.25.10
!apt install libxml-dom-perl

In [None]:
!sacrerouge setup-metric rouge

In [None]:
# import locale
# def getpreferredencoding(do_setlocale = True):
#     return "UTF-8"
# locale.getpreferredencoding = getpreferredencoding

In [None]:
from sacrerouge.metrics import Rouge

def rouge_evaluation(generated_text, actual_text):
  rouge = Rouge(max_ngram=4)
  scores = rouge.score(generated_text, [actual_text])
  return scores

In [None]:
original_text = """
"import duty" means customs duty payable on the import of goods;"""

In [None]:
values = rouge_evaluation(original_text, original_text)
values