In [1]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from bert_score import score
from sentence_transformers import SentenceTransformer, util

# Load the data
df = pd.read_csv("../data/bias.csv")
df = df[["Case", "Summary", "Reference"]]

# Ensure nltk is installed
nltk.download('punkt')

# Define functions for BLEU, ROUGE-L, and BERTScore
def calculate_bleu(reference, summary):
    reference_tokens = nltk.word_tokenize(reference)
    summary_tokens = nltk.word_tokenize(summary)
    return sentence_bleu([reference_tokens], summary_tokens)

def calculate_rouge(reference, summary):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(reference, summary)
    return scores['rougeL'].fmeasure

def calculate_bertscore(reference, summary):
    P, R, F1 = score([summary], [reference], lang="en", verbose=True)
    return F1.item()

# Group by 'Case'
grouped = df.groupby('Case')

# Initialize lists to hold the scores
bleu_scores = []
rouge_scores = []
bert_scores = []

# Process each group
for name, group in grouped:
    references = group[group['Reference'] == 1.0]['Summary']
    if not references.empty:
        reference_text = references.iloc[0]
        for _, row in group.iterrows():
            if row['Reference'] == 0.0:
                summary_text = row['Summary']
                bleu = calculate_bleu(reference_text, summary_text)
                rouge = calculate_rouge(reference_text, summary_text)
                bert = calculate_bertscore(reference_text, summary_text)
                bleu_scores.append(bleu)
                rouge_scores.append(rouge)
                bert_scores.append(bert)
            else:
                bleu_scores.append(None)
                rouge_scores.append(None)
                bert_scores.append(None)
    else:
        bleu_scores.extend([None] * len(group))
        rouge_scores.extend([None] * len(group))
        bert_scores.extend([None] * len(group))

# Add scores to the DataFrame
df['BLEU'] = bleu_scores
df['ROUGE-L'] = rouge_scores
df['BERTScore'] = bert_scores

df

  _torch_pytree._register_pytree_node(
[nltk_data] Downloading package punkt to /Users/ayyub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  _torch_pytree._register_pytree_node(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.34 seconds, 0.75 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.65 seconds, 1.53 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.57 seconds, 1.76 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.73 seconds, 1.37 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.77 seconds, 1.30 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.66 seconds, 1.52 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.69 seconds, 1.44 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.74 seconds, 1.35 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.60 seconds, 1.66 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.74 seconds, 1.35 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.73 seconds, 1.36 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.64 seconds, 1.57 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.77 seconds, 1.30 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.73 seconds, 1.37 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.61 seconds, 1.64 sentences/sec


Unnamed: 0,Case,Summary,Reference,BERTScore
0,2011-10-27 Bias Comm124_Derek_Byrne_IAD_2015-0...,The document discusses the disciplinary charge...,1,
1,2011-10-27 Bias Comm124_Derek_Byrne_IAD_2015-0...,"On April 3, 2015, the San Francisco Police Com...",0,0.871588
2,2011-10-27 Bias Comm124_Derek_Byrne_IAD_2015-0...,Introduction:\nThis report presents the discip...,0,0.884274
3,2011-10-27 Bias Comm124_Derek_Byrne_IAD_2015-0...,The Police Commission of the City and County o...,0,0.842083
4,2017-07-31 Bias Comm118_part_1_Redacted.pdf,"On July 31, 2017, Sergeant Sherry Hicks, a swo...",0,0.8281
5,2017-07-31 Bias Comm118_part_1_Redacted.pdf,The document is a resolution from the San Fran...,1,
6,2017-07-31 Bias Comm118_part_1_Redacted.pdf,"On July 31, 2017, Sergeant Sherry Hicks, Star ...",0,0.845881
7,2017-07-31 Bias Comm118_part_1_Redacted.pdf,"On October 10, 2018, a meeting of the Police C...",0,0.833476
8,2017-07-31 Bias Comm118_part_2_Redacted.pdf,This document discusses the case of Sergeant S...,1,
9,2017-07-31 Bias Comm118_part_2_Redacted.pdf,"On August 9, 2018, the Police Commission initi...",0,0.843347


In [4]:
df = df[["Case", "Reference", "BERTScore"]]

df

Unnamed: 0,Case,Reference,BERTScore
0,2011-10-27 Bias Comm124_Derek_Byrne_IAD_2015-0...,1,
1,2011-10-27 Bias Comm124_Derek_Byrne_IAD_2015-0...,0,0.871588
2,2011-10-27 Bias Comm124_Derek_Byrne_IAD_2015-0...,0,0.884274
3,2011-10-27 Bias Comm124_Derek_Byrne_IAD_2015-0...,0,0.842083
4,2017-07-31 Bias Comm118_part_1_Redacted.pdf,0,0.8281
5,2017-07-31 Bias Comm118_part_1_Redacted.pdf,1,
6,2017-07-31 Bias Comm118_part_1_Redacted.pdf,0,0.845881
7,2017-07-31 Bias Comm118_part_1_Redacted.pdf,0,0.833476
8,2017-07-31 Bias Comm118_part_2_Redacted.pdf,1,
9,2017-07-31 Bias Comm118_part_2_Redacted.pdf,0,0.843347
