### Definition Generation Evaluation

In [1]:
import json
# Which model's generated definitions to evaluate; this value selects the
# input file below. Values used in this notebook: "vicuna" or "llama".
model = "vicuna"
# model = "llama"

# Load the combined definition records for the selected model.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can mis-decode non-ASCII characters in
# the JSON text on systems whose default is not UTF-8.
with open(f'./dataset/{model}_definition_combined.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

#### Dataset creation

In [2]:
# Build the parallel lists consumed by the metric cells:
#   predictions        - generated definitions, one string per kept item
#   references         - one single-element list [original definition] per item
#                        (passed to the BLEU and BERTScore cells)
#   references_bluert  - the same original definitions as a flat list of strings
#                        (passed to the BLEURT cell)
NO_OUTPUT_SENTINEL = 'NO JSON AS AN OUTPUT OBTAINED'

predictions = []
references = []
references_bluert = []

for item in data:
    # Only records marked as new terms take part in the evaluation.
    if item["existing_record"] != ["NEW TERM"]:
        continue

    # Skip items without a usable generation: either an empty string or the
    # sentinel written when no JSON output was obtained.
    generated_text = item['generated_definition']
    if generated_text in ("", NO_OUTPUT_SENTINEL):
        continue

    term = item['term']
    original_text = item['original_definition']

    # Wrap every occurrence of the term in single quotes inside the reference.
    # An empty term is left alone: str.replace with an empty pattern would
    # insert the quotes between every character and corrupt the reference.
    if term:
        original_text = original_text.replace(term, f"'{term}'")

    predictions.append(generated_text)
    references.append([original_text])
    references_bluert.append(original_text)

In [4]:
!pip install evaluate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


#### BLEU Score
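- Reasons for choosing BLEU:
  
  - The length of the generated definition matters, and BLEU's brevity penalty penalizes outputs that are shorter than the reference.
  - A precise definition is required, and BLEU rewards exact n-gram overlap with the reference definition.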


In [5]:
import evaluate

# Metric object used by the BLEU-1 .. BLEU-4 cells.
bleu = evaluate.load(path="bleu")

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# BLEU with n-grams up to order 4.
bleu4_results = bleu.compute(
    predictions=predictions,
    references=references,
    max_order=4,
)
print(f"BLEU-4: {bleu4_results['bleu']}")

BLEU-4: 0.054211222286845544


In [7]:
# BLEU with n-grams up to order 3.
bleu3_results = bleu.compute(
    predictions=predictions,
    references=references,
    max_order=3,
)
print(f"BLEU-3: {bleu3_results['bleu']}")

BLEU-3: 0.08512705418500642


In [8]:
# BLEU with n-grams up to order 2.
bleu2_results = bleu.compute(predictions=predictions, references=references, max_order=2)
# This cell originally bound the result to the misspelled name `blue2_results`;
# that name stays bound so any cell still reading it keeps working.
blue2_results = bleu2_results
print(f"BLEU-2: {bleu2_results['bleu']}")

BLEU-2: 0.1470442594318926


In [9]:
# BLEU with unigrams only (max_order=1).
bleu1_results = bleu.compute(predictions=predictions, references=references, max_order=1)
# This cell originally bound the result to the misspelled name `blue1_results`;
# that name stays bound so any cell still reading it keeps working.
blue1_results = bleu1_results
print(f"BLEU-1: {bleu1_results['bleu']}")

BLEU-1: 0.27602616866006596


#### BERTScore

In [10]:
!pip install bert_score


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [11]:
import evaluate

# Metric object used by the BERTScore cells.
bertscore = evaluate.load(path="bertscore")

In [12]:
# Score every prediction against its reference list with a DistilBERT encoder.
# The result is read through its 'f1', 'precision' and 'recall' entries in
# the cells that follow.
bertscore_results = bertscore.compute(
    predictions=predictions,
    references=references,
    model_type="distilbert-base-uncased",
)

In [13]:
# List of F1 values returned by BERTScore, and their arithmetic mean.
f1_scores = bertscore_results['f1']
mean_f1 = sum(f1_scores) / len(f1_scores)

# Report the individual values, a separator line, then the mean.
print(
    f"BERTScore_F1: {bertscore_results['f1']}",
    "--------",
    f"Mean BERTScore F1: {mean_f1}",
    sep="\n",
)

BERTScore_F1: [0.771040678024292, 0.802032470703125, 0.7775539755821228, 0.7847458720207214, 0.7986399531364441, 0.8132898807525635, 0.7352892756462097, 0.8096631765365601, 0.7902698516845703, 0.8768617510795593, 0.8628918528556824, 0.8763148784637451, 0.7551087737083435, 0.8277158737182617, 0.8119568228721619, 0.8192282915115356, 0.8277431130409241, 0.7887316942214966, 0.8555272817611694, 0.8275467753410339, 0.80133455991745, 0.7832514047622681, 0.896211564540863, 0.8665550947189331, 0.7223899960517883, 0.7245991826057434, 0.8335634469985962, 0.8275464177131653, 0.852604866027832, 0.8663516044616699, 0.7545173764228821, 0.7698800563812256, 0.8558621406555176, 0.8944920897483826, 0.8309657573699951, 0.8162767887115479, 0.8207405805587769, 0.8322715759277344, 0.857930600643158, 0.8114943504333496, 0.871566653251648, 0.8979578614234924, 0.8206650614738464, 0.79401034116745, 0.8262958526611328, 0.8130106329917908, 0.788678765296936, 0.8262680172920227, 0.7483661770820618, 0.71179884672164

In [14]:
# List of precision values returned by BERTScore, and their arithmetic mean.
precision_scores = bertscore_results['precision']
mean_precision = sum(precision_scores) / len(precision_scores)

# Report the individual values, a separator line, then the mean.
print(
    f"BERTScore_precision: {bertscore_results['precision']}",
    "--------",
    f"Mean BERTScore precision: {mean_precision}",
    sep="\n",
)

BERTScore_precision: [0.7887176871299744, 0.8370588421821594, 0.7987850904464722, 0.8010342717170715, 0.7869645953178406, 0.8483554124832153, 0.7378875017166138, 0.7802150845527649, 0.7707170844078064, 0.8836007118225098, 0.8689999580383301, 0.8641723394393921, 0.7800406813621521, 0.8114429712295532, 0.8122598528862, 0.8319946527481079, 0.8309544324874878, 0.786841094493866, 0.863391101360321, 0.8100599050521851, 0.7792434692382812, 0.7784606218338013, 0.8691303730010986, 0.8488653898239136, 0.7064251899719238, 0.764661967754364, 0.8789381980895996, 0.8054271340370178, 0.8960922956466675, 0.8569352626800537, 0.820817232131958, 0.8172978162765503, 0.8808125257492065, 0.8582308888435364, 0.8516297936439514, 0.8643320202827454, 0.8712472319602966, 0.8134428858757019, 0.815291166305542, 0.8049619197845459, 0.8907065987586975, 0.8696653842926025, 0.8090338706970215, 0.8145954608917236, 0.8455902934074402, 0.814582109451294, 0.8142266869544983, 0.7992973327636719, 0.7116180062294006, 0.72386

In [15]:
# List of recall values returned by BERTScore, and their arithmetic mean.
recall_scores = bertscore_results['recall']
mean_recall = sum(recall_scores) / len(recall_scores)

# Report the individual values, a separator line, then the mean.
print(
    f"BERTScore_recall: {bertscore_results['recall']}",
    "--------",
    f"Mean BERTScore recall: {mean_recall}",
    sep="\n",
)

BERTScore_recall: [0.754138708114624, 0.7698196768760681, 0.757422149181366, 0.7691068053245544, 0.8106669187545776, 0.7810081243515015, 0.7327093482017517, 0.8414214253425598, 0.8108405470848083, 0.8702247738838196, 0.856869101524353, 0.8888034820556641, 0.7317212820053101, 0.8446547985076904, 0.8116540312767029, 0.8068478107452393, 0.8245564699172974, 0.7906313538551331, 0.847805380821228, 0.8458054661750793, 0.8247146606445312, 0.7881015539169312, 0.9250347018241882, 0.8849977254867554, 0.7390928864479065, 0.688525378704071, 0.792643666267395, 0.8509148955345154, 0.8131430149078369, 0.8759771585464478, 0.6981275677680969, 0.7276627421379089, 0.8322862982749939, 0.9339526891708374, 0.8112806677818298, 0.7732836604118347, 0.7757689356803894, 0.851992666721344, 0.9052761793136597, 0.8181336522102356, 0.8532319664955139, 0.9281531572341919, 0.8326355218887329, 0.7744399309158325, 0.8078621625900269, 0.8114451766014099, 0.7646851539611816, 0.8551223278045654, 0.789116382598877, 0.7001321

#### BLEURT

In [16]:
!pip install git+https://github.com/google-research/bleurt.git

Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /private/var/folders/rq/lmq0w5vs1_nfq3rrr7wpx3th0000gn/T/pip-req-build-7b_ariu9
  Running command git clone --filter=blob:none --quiet https://github.com/google-research/bleurt.git /private/var/folders/rq/lmq0w5vs1_nfq3rrr7wpx3th0000gn/T/pip-req-build-7b_ariu9
  Resolved https://github.com/google-research/bleurt.git to commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
  Preparing metadata (setup.py) ... [?25ldone
Collecting scipy
  Downloading scipy-1.12.0-cp39-cp39-macosx_12_0_arm64.whl (31.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting tensorflow
  Downloading tensorflow-2.16.1-cp39-cp39-macosx_12_0_arm64.whl (227.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.0/227.0 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[

In [17]:
import evaluate

# Metric object used by the BLEURT cells, loaded with the BLEURT-20 configuration.
bleurt = evaluate.load("bleurt", config_name='BLEURT-20')

Downloading builder script: 100%|██████████| 5.20k/5.20k [00:00<00:00, 17.5MB/s]
Downloading data: 100%|██████████| 2.14G/2.14G [01:19<00:00, 27.0MB/s]


INFO:tensorflow:Reading checkpoint /Users/ashishchouhan/.cache/huggingface/metrics/bleurt/BLEURT-20/downloads/extracted/5e239c156272523c25a71510af4ce974a3b2b17344901cd941228a3ecd2d36d0/BLEURT-20.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint BLEURT-20
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:BLEURT-20
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:... vocab_file:None
INFO:tensorflow:... do_lower_case:None
INFO:tensorflow:... sp_model:sent_piece
INFO:tensorflow:... dynamic_seq_length:True
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Will load model: /Users/ashishchouhan/.cache/huggingface/metrics/bleurt/BLEURT-20/downloads/extracted/5e239c156272523c25a71510af4ce974a3b2b17344901cd941228a3ecd2d36d0/BLEURT-20/sent_piece.model.
INFO:tensorf

INFO:tensorflow:BLEURT initialized.


In [18]:
# Score each prediction against its reference with BLEURT. This call takes the
# flat list of reference strings rather than the list-of-lists used for BLEU.
bleurt_results = bleurt.compute(
    predictions=predictions,
    references=references_bluert,
)

In [19]:
# List of BLEURT scores from the result, and their arithmetic mean.
scores = bleurt_results['scores']
mean_bleurt = sum(scores) / len(scores)

In [20]:
# Report the individual BLEURT scores, a separator line, then the mean.
print(
    f"BLEURT: {bleurt_results['scores']}",
    "--------",
    f"Mean BLEURT: {mean_bleurt}",
    sep="\n",
)

BLEURT: [0.3606106638908386, 0.3854888677597046, 0.401622474193573, 0.6615625619888306, 0.4073513150215149, 0.45376765727996826, 0.38676947355270386, 0.45257604122161865, 0.4630216956138611, 0.691747784614563, 0.5683438181877136, 0.5162121057510376, 0.44876378774642944, 0.6059715151786804, 0.5735872387886047, 0.529415488243103, 0.4726722240447998, 0.4412979483604431, 0.4777666926383972, 0.481040894985199, 0.4596937298774719, 0.3912540674209595, 0.714878261089325, 0.6160227060317993, 0.3207806944847107, 0.4370354413986206, 0.5518521666526794, 0.46938949823379517, 0.6072700619697571, 0.6152352094650269, 0.3979171812534332, 0.45518457889556885, 0.45618778467178345, 0.5681602954864502, 0.5568466782569885, 0.36769235134124756, 0.40374159812927246, 0.4004210829734802, 0.520168662071228, 0.3059939742088318, 0.572227418422699, 0.6877575516700745, 0.6018601059913635, 0.4764571785926819, 0.4242071509361267, 0.567833662033081, 0.45518553256988525, 0.4765969514846802, 0.4106636047363281, 0.2047957