# Requirements

In [None]:
!pip install textdistance nltk rouge werpy bert-score

In [2]:
import textdistance as td
from nltk.translate import meteor_score
from rouge import Rouge
from nltk.translate import bleu_score
import werpy
from bert_score import score

import math
import pandas as pd
import numpy as np
import warnings
pd.set_option("max_colwidth",1000)

In [None]:
def average_lst(lst):
  """Return the arithmetic mean of the numbers in ``lst``.

  Args:
    lst: A sized collection of numbers.

  Returns:
    ``sum(lst) / len(lst)``, or ``nan`` when ``lst`` is empty.
  """
  # An empty collection has no mean; report it as NaN instead of letting the
  # division raise ZeroDivisionError.  len() is used (rather than truthiness)
  # so array-like inputs are handled too.
  if len(lst) == 0:
    return float("nan")
  return sum(lst) / len(lst)

#<font color='red'>Prepare predictions and testcases</font>

In [None]:
# Load the test-case sheet and the prediction sheet from the working directory;
# both workbooks are parsed with the openpyxl engine.
df_test = pd.read_excel("test.xlsx", engine="openpyxl")
df_predict = pd.read_excel("predict.xlsx", engine="openpyxl")

In [None]:
# Replace the sheet headers with fixed names: the leading label columns keep a
# descriptive name and every remaining column is numbered o1, o2, ...
df_test.columns = ['entities', 'value_type'] + [f'o{k}' for k in range(1, len(df_test.columns) - 1)]
df_predict.columns = ['entities'] + [f'o{k}' for k in range(1, len(df_predict.columns))]

In [None]:
# Zero-based index of the output column (o1, o2, ...) to evaluate.
applicant_index = 0

# The entity and value-type labels always sit in the first two columns of the
# test sheet; only the output column moves with applicant_index.  Offsetting
# the label columns as well would select the wrong columns for any
# applicant_index other than 0.
entitiy_list = df_test.iloc[:, 0].tolist()
value_type_list = df_test.iloc[:, 1].tolist()
test_list = df_test.iloc[:, applicant_index + 2].tolist()

# The prediction sheet has a single leading 'entities' column before its outputs.
predict_list = df_predict.iloc[:, applicant_index + 1].tolist()


# <font color='lime'>Metric Doc</font>

## <font color='lime'>Token Based Algorithms</font>

### **Sørensen-Dice similarity :**
The Sørensen–Dice coefficient measures the similarity between two sets or strings. (for each element in the set of words: 1=exact_match , 0=not_exact_match)

**Formula**: 2·|A∩B| / (|A| + |B|)

https://yassineelkhal.medium.com/the-complete-guide-to-string-similarity-algorithms-1290ad07c6b7

## <font color='lime'>Edit Based Algorithms</font>

### **Levenshtein Distance :**
Levenshtein distance measures the minimum number of edit operations (insertions, deletions, or substitutions) required to transform one string into another.

**Applicability**: It works well when comparing strings of different lengths.
### **Jaro Similarity :**
Jaro similarity measures the similarity between two strings based on matching characters and transpositions (swapping of characters) within a certain window.

**Applicability**: Primarily suited for short strings, such as person names.

https://yassineelkhal.medium.com/the-complete-guide-to-string-similarity-algorithms-1290ad07c6b7

## <font color='lime'>WER (Word Error Rate)</font>


WER measures the divergence between a candidate sentence and the reference by counting the minimum number of word-level edits (insertions, deletions, substitutions) needed to transform one into the other. Primarily used in automatic speech recognition tasks.

### **How It Works :**
Compares the candidate and reference sentences word by word.

Calculates the edit distance (Levenshtein distance) normalized by the reference length.

### **Formula :**
WER = (inserted + deleted + substituted)/total words in reference

https://towardsdatascience.com/foundations-of-nlp-explained-bleu-score-and-wer-metrics-1a5ba06d812b


## <font color='lime'>BLEU</font>


BLEU measures the precision of word n-grams (phrases) between a predicted text and a reference text.
It focuses on word overlap and doesn’t consider recall or semantic meaning.

**Precision** => This metric measures the fraction of words in the Predicted Sentence that also occur in the Target Sentence.
### **How is Bleu Score calculated?**
**1)** The first step is to compute Precision scores for 1-grams through 4-grams. (n1, n2, n3, n4)

**2)** Calculating **Geometric Average Precision Scores**. weights = (w1, w2, w3, w4)

    formula = (n1)^w1 . (n2)^w2 . (n3)^w3 . (n4)^w4

**3)** The third step is to compute a **Brevity Penalty**. this penalty penalizes sentences that are too short.

    if (c > r):
      brevity_penalty = 1
    else :
      brevity_penalty = e ^ (1-r/c)
    
    c is predicted length = number of words in the predicted sentence
    r is target length = number of words in the target sentence

this ensures that if you predict very few words, Brevity Penalty will be small.

**4)** Finally, to calculate the Bleu Score, we multiply the Brevity Penalty with the Geometric Average of the Precision Scores.

    bleu_score = (Brevity Penalty)*(Geometric Average Precision Scores)

https://towardsdatascience.com/foundations-of-nlp-explained-bleu-score-and-wer-metrics-1a5ba06d812b
_______________________________________________________________________
_______________________________________________________________________

## <font color='lime'>ROUGE</font>


ROUGE measures recall and precision of word n-grams and longest common subsequences between the predicted and referenced texts.

**Recall** => it quantifies the proportion of words or phrases in the Reference Sentence that also appear in the Predicted Sentence.

**Precision** => This metric measures the number of words or phrases in the Predicted Sentence that also occur in the Reference Sentence.

**F-Measure** => Combines precision and recall to provide a balanced view. It’s often reported as the harmonic mean of precision and recall.

ROUGE is mostly used for assessing automatic summarization and machine translation

### **ROUGE Variants :**
### ROUGE-N:
Measures n-gram overlap (unigram, bigram, trigram, etc.).

### ROUGE-L:
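Measures the longest matching sequence of words using Longest Common Subsequence (LCS). It considers word order. Example: for the reference "the cat sat on the mat" and the prediction "the cat on the mat", the LCS is "the cat on the mat" (5 words).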

**Longest Common Subsequence (LCS)** => ROUGE-L looks at the entire summary without caring about where lines break. It aims to find similar parts between the two versions.

https://medium.com/nlplanet/two-minutes-nlp-learn-the-rouge-metric-by-examples-f179cc285499

## <font color='lime'>METEOR</font>

METEOR modifies the precision and recall computations, replacing them with a weighted F-score based on mapping unigrams and a penalty function for incorrect word order.

### **step 1)** calculate Precision and Recall with matching words.

**Precision** => float(matches_count) / translation_length

**Recall** => float(matches_count) / reference_length

### **step 2)** Calculate weighted F-score using a parameter "alpha"

**Fmean (F-score)** => (precision * recall) / (alpha * precision + (1 - alpha) * recall)

### **step 3)** Calculate penalty using parameters "gamma" and "beta"

**penalty** => gamma * (chunk_count / matches_count) ** beta


## <font color='lime'>BertScore</font>


BERTScore leverages contextual embeddings from pre-trained BERT models to measure token similarity.

### **How It Works :**
BERTScore leverages pre-trained BERT embeddings.

It computes cosine similarity between words in candidate and reference sentences.

The resulting score reflects semantic similarity.

https://haticeozbolat17.medium.com/text-summarization-how-to-calculate-bertscore-771a51022964

#<font color='BLUE'>Exact Match</font>

In [None]:
# position sensitive
def compare_lists_exact_match(entities, value_types, predictions, references):
  """Score each prediction by position-wise token agreement with its reference.

  Both texts are converted with ``str``, lower-cased and split on whitespace.
  A row's score is the number of positions that hold the same token in both
  texts, divided by the token count of the longer text.

  Args:
    entities: Per-row entity labels, copied into the result table.
    value_types: Per-row value-type labels, copied into the result table.
    predictions: Predicted values, one per row.
    references: Expected values, aligned with ``predictions``.

  Returns:
    ``(results, avg)``: the per-row DataFrame and the mean of its
    ``exact_match`` column.
  """
  results = pd.DataFrame(columns = ["entity","value_type","actual","prediction","exact_match"])
  for i in range(len(predictions)):
    my_predict_str = str(predictions[i]).lower().split()
    my_reference_str = str(references[i]).lower().split()
    longest = max(len(my_reference_str), len(my_predict_str))
    if longest == 0:
      # Two texts without any token are identical; without this guard the
      # ratio below would be 0/0 and raise ZeroDivisionError.
      res = 1.0
    else:
      matches = sum(p == r for p, r in zip(my_predict_str, my_reference_str))
      res = matches / longest
    results.loc[len(results)] = [
        entities[i],
        value_types[i],
        my_reference_str,
        my_predict_str,
        res
        ]
  avg = average_lst(results["exact_match"].tolist())
  return results, avg

In [None]:
# position insensitive
def compare_lists_sorensen(entities, value_types, predictions, references):
  """Build a table of ``td.sorensen`` scores computed on word tokens.

  Every value is converted with ``str``, lower-cased and split on whitespace;
  the resulting token lists are what gets compared and stored.

  Returns:
    ``(results, avg)``: the per-row DataFrame and the mean of its
    ``sorensen_score`` column.
  """
  score_column = "sorensen_score"
  results = pd.DataFrame(columns = ["entity","value_type","actual","prediction",score_column])
  for row in range(len(predictions)):
    predicted_tokens = str(predictions[row]).lower().split()
    expected_tokens = str(references[row]).lower().split()
    dice = td.sorensen(predicted_tokens, expected_tokens)
    new_row = [entities[row], value_types[row], expected_tokens, predicted_tokens, dice]
    results.loc[len(results)] = new_row
  return results, average_lst(results[score_column].tolist())

#<font color='yellow'>Text Distance</font>


In [None]:
def compare_lists_jaro(entities, value_types, predictions, references):
  """Build a table of ``td.jaro`` scores between predictions and references.

  Every value is converted with ``str``.  The comparison runs on the
  lower-cased text, while the table keeps the text in its original casing.

  Returns:
    ``(results, avg)``: the per-row DataFrame (scores rounded to 4 decimals)
    and the mean of its ``jaro_score`` column.
  """
  score_column = "jaro_score"
  results = pd.DataFrame(columns = ["entity","value_type","actual","prediction",score_column])
  for row in range(len(predictions)):
    predicted_text = str(predictions[row])
    expected_text = str(references[row])
    similarity = td.jaro(predicted_text.lower(), expected_text.lower())
    new_row = [entities[row], value_types[row], expected_text, predicted_text, round(similarity,4)]
    results.loc[len(results)] = new_row
  return results, average_lst(results[score_column].tolist())

In [None]:
def compare_lists_levenshtein(entities, value_types, predictions, references):
  """Build a table of ``td.levenshtein`` values between predictions and references.

  Every value is converted with ``str``.  The comparison runs on the
  lower-cased text, while the table keeps the text in its original casing.

  Returns:
    ``(results, avg)``: the per-row DataFrame (values rounded to 4 decimals)
    and the mean of its ``levenshtein_score`` column.
  """
  score_column = "levenshtein_score"
  results = pd.DataFrame(columns = ["entity","value_type","actual","prediction",score_column])
  for row in range(len(predictions)):
    predicted_text = str(predictions[row])
    expected_text = str(references[row])
    edits = td.levenshtein(predicted_text.lower(), expected_text.lower())
    new_row = [entities[row], value_types[row], expected_text, predicted_text, round(edits,4)]
    results.loc[len(results)] = new_row
  return results, average_lst(results[score_column].tolist())

#<font color='pink'>Cosine similarity</font>


In [None]:
# def compare_lists_cosine(entities, value_types, predictions, references):
#   results = pd.DataFrame(columns = ["entity","value_type","actual","prediction","cosine_score"])
#   for i in range(len(predictions)):
#     my_predict_str = str(predictions[i])
#     my_reference_str = str(references[i])
#     r = td.cosine(my_predict_str.lower().split(), my_reference_str.lower().split())
#     results.loc[len(results)] = [
#         entities[i],
#         value_types[i],
#         my_reference_str,
#         my_predict_str,
#         round(r,4)
#         ]
#   avg = average_lst(results["cosine_score"].tolist())
#   return results, avg

In [None]:
# res , avg = compare_lists_cosine(predict_list, test_list)
# res

#<font color='LemonChiffon'>WER (Word Error Rate)</font>

In [None]:
def compare_lists_wer(entities, value_types, predictions, references):
  """Build a table of ``werpy.wer`` values, one per prediction/reference pair.

  Every value is converted with ``str`` and lower-cased; the lower-cased text
  is both scored and stored.  ``werpy.wer`` receives the reference first and
  the prediction second.

  Returns:
    ``(results, avg)``: the per-row DataFrame (values rounded to 4 decimals)
    and the mean of its ``wer_score`` column.
  """
  score_column = "wer_score"
  results = pd.DataFrame(columns = ["entity","value_type","actual","prediction",score_column])
  for row in range(len(predictions)):
    predicted_text = str(predictions[row]).lower()
    expected_text = str(references[row]).lower()
    error_rate = werpy.wer(expected_text, predicted_text)
    new_row = [entities[row], value_types[row], expected_text, predicted_text, round(error_rate,4)]
    results.loc[len(results)] = new_row
  return results, average_lst(results[score_column].tolist())

In [None]:
# results,_ = compare_lists_wer(entitiy_list, value_type_list, predict_list, test_list)
# results

### Example :

In [None]:
# reference = "The cat is sleeping on the mat."
# hypothesis = "The cat is playing on mat."

# # Calculate the overall Word Error Rate
# wer_result = werpy.wer(reference, hypothesis)
# print(f"Word Error Rate: {wer_result:.3f}")

#<font color='orange'>BLEU</font>

In [None]:
def calculate_bleu(my_reference_str ,my_predict_str):
  """Compute a sentence-level BLEU score for one tokenised prediction.

  The highest n-gram order is capped at the length of the shorter text (and at
  4), so short texts are not scored on n-grams they cannot contain.

  Args:
    my_reference_str: Reference text as a list of tokens.
    my_predict_str: Predicted text as a list of tokens.

  Returns:
    ``(bleu, bleu_precisions)``.  ``bleu`` is the brevity penalty multiplied
    by the geometric mean of the n-gram precisions in use.  ``bleu_precisions``
    always has four entries (1-gram to 4-gram) rounded to 4 decimals, with
    ``None`` for the orders that were not used.  When either text is empty the
    result is ``(0.0, [None, None, None, None])``.
  """
  max_order = 4
  r = len(my_reference_str)
  c = len(my_predict_str)
  if r == 0 or c == 0:
    # No n-gram can match; returning here also keeps 1/gram and r/c below
    # from dividing by zero.
    print("length zero str")
    return 0.0, [None] * max_order

  gram = min(r, c, max_order)

  # step 1) modified n-gram precisions for the orders in use.
  # sentence_bleu is not used for this: its return value already contains the
  # brevity penalty, which would then be applied twice in step 4.
  precisions = [
      float(bleu_score.modified_precision([my_reference_str], my_predict_str, n))
      for n in range(1, gram + 1)
  ]
  bleu_precisions = [round(p, 4) for p in precisions] + [None] * (max_order - gram)

  # step 2) geometric average: the PRODUCT of precision ** weight
  weight = 1 / gram
  geometric_average_precision_scores = 1.0
  for p in precisions:
    geometric_average_precision_scores *= p ** weight

  # step 3) brevity_penalty: only predictions not longer than the reference are penalised
  brevity_penalty = 1 if c > r else math.exp(1 - (r / c))

  # step 4) bleu_score
  bleu = geometric_average_precision_scores * brevity_penalty
  return bleu, bleu_precisions

In [None]:
from nltk.translate import bleu_score
def compare_lists_bleu(entities, value_types, predictions, references):
  """Build a table of normalised BLEU scores, one per prediction/reference pair.

  Every value is converted with ``str`` and split on whitespace (the
  comparison is case-sensitive).  A row's score is the BLEU of the prediction
  divided by the BLEU the reference obtains against itself.

  Rows that cannot be scored never reuse values from an earlier row:
    * if either text has no tokens, the score is 1.0 when both are empty and
      0.0 otherwise;
    * if ``calculate_bleu`` raises, the error is printed and the score is 0.0.
  In both cases the precision list is ``[None, None, None, None]``.

  Args:
    entities: Per-row entity labels, copied into the result table.
    value_types: Per-row value-type labels, copied into the result table.
    predictions: Predicted values, one per row.
    references: Expected values, aligned with ``predictions``.

  Returns:
    ``(compare_df, avg)``: the per-row DataFrame and the mean of its
    ``bleu_score`` column.
  """
  compare_df = pd.DataFrame(columns = ["entity","value_type","actual","prediction","bleu_score","bleu_precisions"])
  for i in range(len(predictions)):
    my_predict_str = str(predictions[i]).split()
    my_reference_str = str(references[i]).split()

    bleu_ratio = 0.0
    bleu_precisions = [None] * 4
    if not my_predict_str or not my_reference_str:
      # An empty text has no n-grams to compare.
      if my_predict_str == my_reference_str:
        bleu_ratio = 1.0
    else:
      try:
        bleu, bleu_precisions = calculate_bleu(my_reference_str ,my_predict_str)
        best_bleu_possible, _ = calculate_bleu(my_reference_str ,my_reference_str)
        bleu_ratio = round(bleu/best_bleu_possible,4)
      except Exception as err:
        # Best effort: keep evaluating the other rows, but give this row an
        # explicit fallback instead of stale values from a previous iteration.
        print(f"Problem with bleu!!!!!!!!!!!!!! (row {i}: {err!r})")
        bleu_ratio = 0.0
        bleu_precisions = [None] * 4

    row = [entities[i], value_types[i], references[i], predictions[i], bleu_ratio, bleu_precisions]
    compare_df.loc[len(compare_df)] = row

  avg = average_lst(compare_df["bleu_score"].tolist())
  return compare_df, avg


In [None]:
# bleu_df,_ = compare_lists_bleu(entitiy_list, value_type_list, predict_list, test_list)
# bleu_df

### Example :

In [None]:
# from nltk.translate.bleu_score import corpus_bleu
# from nltk.translate import bleu_score
# ref = 'my first correct sentence'
# can = 'my sentence'

# score = bleu_score.corpus_bleu([ref.split()], [can.split()])
# print("a",score)
# score = bleu_score.sentence_bleu([ref.split()], can.split())
# print("b",score)

#<font color='aqua'>ROUGE</font>


In [None]:
# def compare_lists_rouge(entities, value_types, predictions, references):
#   scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
#   results = pd.DataFrame(columns = ["entity","value_type","actual","prediction","rouge_precision","rouge_recall","rouge_f1"])
#   for i in range(len(predictions)):
#     my_predict_str = str(predictions[i])
#     my_reference_str = str(references[i])
#     scores = scorer.score(my_predict_str.lower(), my_reference_str.lower())
#     results.loc[len(results)] = [
#         entities[i],
#         value_types[i],
#         my_reference_str,
#         my_predict_str,
#         round(scores['rougeL'].precision,4),
#         round(scores['rougeL'].recall,4),
#         round(scores['rougeL'].fmeasure,4)
#         ]
#   avg = average_lst(results["rouge_precision"].tolist())
#   return results, avg

In [None]:
def compare_lists_rouge(entities, value_types, predictions, references):
  """Build a table of ROUGE-L recall, precision and F1, one row per pair.

  Every value is converted with ``str`` and lower-cased; the lower-cased text
  is both scored and stored.  ``Rouge.get_scores`` receives the prediction
  first and the reference second.

  Returns:
    ``(results, avg)``: the per-row DataFrame (scores rounded to 4 decimals)
    and the mean of its ``rouge_lcs_f1`` column.
  """
  scorer = Rouge()
  results = pd.DataFrame(columns = ["entity","value_type","actual","prediction","rouge_lcs_recall","rouge_lcs_precision","rouge_lcs_f1"])
  for row in range(len(predictions)):
    predicted_text = str(predictions[row]).lower()
    expected_text = str(references[row]).lower()
    lcs = scorer.get_scores(predicted_text, expected_text)[0]['rouge-l']
    new_row = [
        entities[row],
        value_types[row],
        expected_text,
        predicted_text,
        round(lcs['r'],4),
        round(lcs['p'],4),
        round(lcs['f'],4),
    ]
    results.loc[len(results)] = new_row
  return results, average_lst(results["rouge_lcs_f1"].tolist())

In [None]:
# results, avg = compare_lists_rouge(entitiy_list, value_type_list, predict_list, test_list)
# results

### Example :

In [None]:
# # Candidate and reference sentences
# candidate = "the cat sat on"
# reference = "dog on the mat sat the cat"

# # Initialize ROUGE scorer
# rouge = Rouge()

# # Compute ROUGE scores
# scores = rouge.get_scores(candidate, reference)
# print(scores)
# print(f"ROUGE-1 F1 Score: {scores[0]['rouge-l']}")

#<font color='MediumSeaGreen'>METEOR</font>

In [None]:
import nltk
# NOTE(review): presumably fetched for the METEOR cell below, since NLTK's
# meteor_score matches synonyms through WordNet — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def compare_lists_meteor(entities, value_types, predictions, references):
  """Build a table of normalised METEOR scores, one per prediction/reference pair.

  Every value is converted with ``str``, lower-cased and split on whitespace.
  A row's score is the METEOR of the prediction divided by the METEOR the
  reference obtains against itself (``alpha=0.9`` in both calls).

  If either text has no tokens, METEOR is not computed: the score is 1.0 when
  both texts are empty and 0.0 otherwise.

  Args:
    entities: Per-row entity labels, copied into the result table.
    value_types: Per-row value-type labels, copied into the result table.
    predictions: Predicted values, one per row.
    references: Expected values, aligned with ``predictions``.

  Returns:
    ``(results, avg)``: the per-row DataFrame (scores rounded to 4 decimals)
    and the mean of its ``meteor_f_score`` column.
  """
  results = pd.DataFrame(columns = ["entity","value_type","actual","prediction","meteor_f_score"])
  for i in range(len(predictions)):
    my_predict_str = str(predictions[i]).lower()
    my_reference_str = str(references[i]).lower()
    predict_tokens = my_predict_str.split()
    reference_tokens = my_reference_str.split()

    if not predict_tokens or not reference_tokens:
      meteor_rate = 1.0 if predict_tokens == reference_tokens else 0.0
    else:
      # meteor_score takes the list of references FIRST and the hypothesis
      # second.  The metric is asymmetric (alpha weights recall against
      # precision), so passing them the other way round changes the score.
      meteor = meteor_score.meteor_score([reference_tokens], predict_tokens, alpha=0.9)
      best_meteor = meteor_score.meteor_score([reference_tokens], reference_tokens, alpha=0.9)
      # Guard the normalisation; a None here would make round() raise.
      meteor_rate = meteor/best_meteor if best_meteor != 0 else 0.0

    results.loc[len(results)] = [
        entities[i],
        value_types[i],
        my_reference_str,
        my_predict_str,
        round(meteor_rate,4)
        ]
  avg = average_lst(results["meteor_f_score"].tolist())
  return results, avg

In [None]:
# results, _ = compare_lists_meteor(entitiy_list, value_type_list, predict_list, test_list)
# results

### Example :

In [None]:
# candidate = "the cat sat on the mat"
# reference = "on the mat sat the cat"

# candidate = "3815 brendan lane apt no. 3 north olmsted, oh 44070"
# reference = "3815 brendan lane, apt no 3, north olmsted, oh, 44070"

# candidate = ""
# reference = "1"
# # Calculate METEOR score
# meteor = meteor_score.meteor_score([candidate.split()], reference.split(), alpha=0.9)
# best_meteor = meteor_score.meteor_score([reference.split()], reference.split(), alpha=0.9)
# meteor_rate = None
# if best_meteor != 0:
#   meteor_rate = round(meteor/best_meteor,4)
# print("METEOR Score:",meteor_rate)

#<font color='violet'>BERTscore</font>


In [None]:
def compare_lists_bert_score(entities, value_types, predictions, references):
  """Build a table of BERTScore precision, recall and F1, one row per pair.

  Every value is converted with ``str`` and lower-cased; the lower-cased text
  is both scored and stored.  Scoring uses ``bert-base-uncased`` with
  ``lang="en"``.

  Args:
    entities: Per-row entity labels, copied into the result table.
    value_types: Per-row value-type labels, copied into the result table.
    predictions: Predicted values, one per row.
    references: Expected values, aligned with ``predictions``.

  Returns:
    ``(results, avg)``: the per-row DataFrame (scores rounded to 4 decimals)
    and the mean of its ``bert_f1`` column.
  """
  model = 'bert-base-uncased'
  language = "en"
  results = pd.DataFrame(columns = ["entity","value_type","actual","prediction","bert_precision","bert_recall","bert_f1"])

  row_count = len(predictions)
  predict_strs = [str(predictions[i]).lower() for i in range(row_count)]
  reference_strs = [str(references[i]).lower() for i in range(row_count)]

  if row_count:
    # Score all pairs in ONE call.  Calling score() once per row sets the
    # model up again for every row, which dominates the run time.  The
    # returned tensors hold one value per candidate/reference pair.
    bert_precision, bert_recall, bert_f1 = score(predict_strs, reference_strs, lang=language, model_type=model)
    precision_values = bert_precision.tolist()
    recall_values = bert_recall.tolist()
    f1_values = bert_f1.tolist()

  for i in range(row_count):
    results.loc[len(results)] = [
        entities[i],
        value_types[i],
        reference_strs[i],
        predict_strs[i],
        np.round(precision_values[i],4),
        np.round(recall_values[i],4),
        np.round(f1_values[i],4)
        ]
  avg = average_lst(results["bert_f1"].tolist())
  return results, avg

In [None]:
# results, avg = compare_lists_bert_score(entitiy_list, value_type_list, predict_list, test_list)
# results

### Example:

In [None]:
# candidate = ["the cat sat on the mat"]
# reference = ["on the mat sat the cat"]

# candidate = ["Stars shine, and the sky is covered with a blue blanket."]
# reference = ["Stars shine, and the sky is adorned with a bright color."]

# candidate  = ["The cat is on the mat."]
# reference = ["The dog is at the door."]

# candidate = ["The cat is sleeping."]
# reference = ["The cat rests."]

# # Compute BERTScore
# bert_precision, bert_recall, bert_f1 = score(candidate, reference, lang="en", model_type='bert-base-uncased')
# print(bert_f1.mean().item())
# print(f"BERTScore: {bert_f1.mean().item():.4f}")