In [1]:
from google.colab import drive
drive.mount('/content/drive')

# Install necessary packages
!pip install torch torchvision torchaudio
!pip install transformers
!pip install pandas
!pip install numpy

Mounted at /content/drive
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_c

In [2]:
# Calculate BoolQ BARTScore

import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import torch.nn as nn
import numpy as np
from typing import List

class BARTScorer:
    """Scores target texts by their average token log-likelihood under a BART seq2seq model.

    The source text is fed to the encoder and the target text is supplied as the
    decoder labels. A score is the negated mean negative log-likelihood per target
    token, so scores are at most zero and values closer to zero mean the model
    finds the target more likely given the source.
    """

    def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/bart-large-cnn'):
        """Load the tokenizer and model from `checkpoint` and move the model to `device`.

        Args:
            device: Torch device string the model and the input tensors are placed on.
            max_length: Token limit passed to the tokenizer; longer texts are truncated.
            checkpoint: Name or path given to `from_pretrained` for tokenizer and model.
        """
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss: per-token NLL (no reduction) that ignores padding positions
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        # score() flattens the logits to (tokens, vocab), so dim=1 is the vocabulary axis
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """Load model weights from paraphrase finetuning.

        Args:
            path: State-dict file to load; defaults to 'models/bart.pth' when None.
        """
        if path is None:
            path = 'models/bart.pth'
        # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def score(self, srcs, tgts, batch_size=4):
        """Score a list of (source, target) pairs.

        Args:
            srcs: List of source strings.
            tgts: List of target strings, aligned index-by-index with `srcs`.
            batch_size: Number of pairs tokenized and run through the model together.

        Returns:
            A list with one float per pair, in input order: the negated sum of the
            target's per-token negative log-likelihoods divided by the number of
            target tokens counted by the tokenizer's attention mask.

        Raises:
            RuntimeError: Re-raised unchanged after the offending batch has been
                printed, so the caller sees the real failure.
        """
        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    # Number of non-padding target tokens per example
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    # Flatten to (batch * target_len, vocab) for the token-level loss
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    # Back to (batch, target_len), then average over each target's tokens
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                # Show which batch failed, then let the original error propagate.
                # (The previous handler used the never-imported `traceback` module and
                # `exit(0)`, which hid the real error behind a NameError.)
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                raise
        return score_list

# Initialize BARTScorer on the GPU when one is available, otherwise fall back to the CPU
# (much slower, but the cell no longer fails on a runtime without CUDA)
bart_scorer = BARTScorer(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    checkpoint='facebook/bart-large-cnn',
)

# Function to calculate scores
def calculate_scores(file_path, orig_col, aave_col, value_col, output_file, scorer=None):
    """Score AAVE and VALUE texts against their originals and save the two averages.

    Reads the CSV at `file_path`, scores each row's `aave_col` and `value_col`
    text against the `orig_col` text, prints every score as it is computed, and
    writes the two mean scores to `output_file`.

    Args:
        file_path: CSV file to read.
        orig_col: Column holding the original text (used as the scoring source).
        aave_col: Column holding the AAVE text (scored as a target).
        value_col: Column holding the VALUE text (scored as a target).
        output_file: Text file the two averages are written to; missing parent
            directories are created.
        scorer: Object with a `score(srcs, tgts, batch_size=...)` method that
            returns a list of floats. Defaults to the module-level `bart_scorer`.

    Raises:
        KeyError: If one of the requested columns is not in the CSV.
    """
    import os  # local import: the cell's import block does not include os

    if scorer is None:
        scorer = bart_scorer

    # Load CSV file
    data = pd.read_csv(file_path)

    # Keep only rows where all three texts are present: pandas reads empty cells
    # as NaN (a float, not text), which would abort the run part-way through.
    complete = data.dropna(subset=[orig_col, aave_col, value_col])
    skipped = len(data) - len(complete)
    if skipped:
        print(f"Skipping {skipped} row(s) with missing text.")

    # Extract relevant columns as plain strings
    originals = complete[orig_col].astype(str).tolist()
    aave_texts = complete[aave_col].astype(str).tolist()
    value_texts = complete[value_col].astype(str).tolist()

    # Create the output directory up front so a bad path fails before the slow scoring loop
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Lists to store individual scores
    aave_scores = []
    value_scores = []

    # Calculate BARTScore for each row and print progress
    for row_number, (orig, aave, value) in enumerate(zip(originals, aave_texts, value_texts), start=1):
        aave_score = scorer.score([orig], [aave], batch_size=1)[0]
        value_score = scorer.score([orig], [value], batch_size=1)[0]

        aave_scores.append(aave_score)
        value_scores.append(value_score)

        print(f"Processed row {row_number} AAVE ({aave_score})")
        print(f"Processed row {row_number} VALUE ({value_score})")

    # Calculate average scores (NaN when there was nothing to score)
    average_aave_score = np.mean(aave_scores) if aave_scores else float('nan')
    average_value_score = np.mean(value_scores) if value_scores else float('nan')

    # Save results to a text file
    with open(output_file, 'w') as f:
        f.write(f'Average BARTScore for AAVE - {average_aave_score}\n')
        f.write(f'Average BARTScore for VALUE - {average_value_score}\n')

    print(f"Scores calculated and saved to {output_file}.")

# BoolQ passages: compare the AAVE and VALUE passages with the original passages
calculate_scores(
    '/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/BoolQ/VALUE BoolQ Passages.csv',  # file_path
    'Original Passage',  # orig_col
    'AAVE Passage',  # aave_col
    'VALUE Passage',  # value_col
    '/content/drive/MyDrive/Algoverse/New Results/BARTScores/BoolQ/BoolQ Passage BARTScores.txt',  # output_file
)

# BoolQ questions: the same comparison for the question texts
calculate_scores(
    '/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/BoolQ/VALUE BoolQ Questions.csv',  # file_path
    'Original Question',  # orig_col
    'AAVE Question',  # aave_col
    'VALUE Question',  # value_col
    '/content/drive/MyDrive/Algoverse/New Results/BARTScores/BoolQ/BoolQ Questions BARTScore.txt',  # output_file
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Processed 1 row AAVE (-2.85600209236145)
Processed 1 row VALUE (-2.167194128036499)
Processed 1 row AAVE (-1.3561393022537231)
Processed 1 row VALUE (-1.8297725915908813)
Processed 1 row AAVE (-1.2961024045944214)
Processed 1 row VALUE (-1.6753343343734741)
Processed 1 row AAVE (-0.9106414914131165)
Processed 1 row VALUE (-1.8315473794937134)
Processed 1 row AAVE (-1.882464051246643)
Processed 1 row VALUE (-1.004600167274475)
Processed 1 row AAVE (-0.772283136844635)
Processed 1 row VALUE (-1.1539973020553589)
Processed 1 row AAVE (-1.9952627420425415)
Processed 1 row VALUE (-0.945895254611969)
Processed 1 row AAVE (-1.5907400846481323)
Processed 1 row VALUE (-1.0941766500473022)
Processed 1 row AAVE (-0.996207058429718)
Processed 1 row VALUE (-2.0195372104644775)
Processed 1 row AAVE (-0.9260028600692749)
Processed 1 row VALUE (-0.9531416893005371)
Processed 1 row AAVE (-1.8296102285385132)
Processed 1 row VALUE (-1.2267353534698486)
Processed 1 row AAVE (-1.379623532295227)
Processed

In [3]:
# Calculate COPA BARTScore

import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import torch.nn as nn
import numpy as np
from typing import List

class BARTScorer:
    """Scores target texts by their average token log-likelihood under a BART seq2seq model.

    The source text is fed to the encoder and the target text is supplied as the
    decoder labels. A score is the negated mean negative log-likelihood per target
    token, so scores are at most zero and values closer to zero mean the model
    finds the target more likely given the source.
    """

    def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/bart-large-cnn'):
        """Load the tokenizer and model from `checkpoint` and move the model to `device`.

        Args:
            device: Torch device string the model and the input tensors are placed on.
            max_length: Token limit passed to the tokenizer; longer texts are truncated.
            checkpoint: Name or path given to `from_pretrained` for tokenizer and model.
        """
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss: per-token NLL (no reduction) that ignores padding positions
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        # score() flattens the logits to (tokens, vocab), so dim=1 is the vocabulary axis
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """Load model weights from paraphrase finetuning.

        Args:
            path: State-dict file to load; defaults to 'models/bart.pth' when None.
        """
        if path is None:
            path = 'models/bart.pth'
        # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def score(self, srcs, tgts, batch_size=4):
        """Score a list of (source, target) pairs.

        Args:
            srcs: List of source strings.
            tgts: List of target strings, aligned index-by-index with `srcs`.
            batch_size: Number of pairs tokenized and run through the model together.

        Returns:
            A list with one float per pair, in input order: the negated sum of the
            target's per-token negative log-likelihoods divided by the number of
            target tokens counted by the tokenizer's attention mask.

        Raises:
            RuntimeError: Re-raised unchanged after the offending batch has been
                printed, so the caller sees the real failure.
        """
        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    # Number of non-padding target tokens per example
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    # Flatten to (batch * target_len, vocab) for the token-level loss
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    # Back to (batch, target_len), then average over each target's tokens
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                # Show which batch failed, then let the original error propagate.
                # (The previous handler used the never-imported `traceback` module and
                # `exit(0)`, which hid the real error behind a NameError.)
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                raise
        return score_list

# Initialize BARTScorer on the GPU when one is available, otherwise fall back to the CPU
# (much slower, but the cell no longer fails on a runtime without CUDA)
bart_scorer = BARTScorer(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    checkpoint='facebook/bart-large-cnn',
)

# Function to calculate scores
def calculate_scores(file_path, orig_col, aave_col, value_col, output_file, scorer=None):
    """Score AAVE and VALUE texts against their originals and save the two averages.

    Reads the CSV at `file_path`, scores each row's `aave_col` and `value_col`
    text against the `orig_col` text, prints every score as it is computed, and
    writes the two mean scores to `output_file`.

    Args:
        file_path: CSV file to read.
        orig_col: Column holding the original text (used as the scoring source).
        aave_col: Column holding the AAVE text (scored as a target).
        value_col: Column holding the VALUE text (scored as a target).
        output_file: Text file the two averages are written to; missing parent
            directories are created.
        scorer: Object with a `score(srcs, tgts, batch_size=...)` method that
            returns a list of floats. Defaults to the module-level `bart_scorer`.

    Raises:
        KeyError: If one of the requested columns is not in the CSV.
    """
    import os  # local import: the cell's import block does not include os

    if scorer is None:
        scorer = bart_scorer

    # Load CSV file
    data = pd.read_csv(file_path)

    # Keep only rows where all three texts are present: pandas reads empty cells
    # as NaN (a float, not text), which would abort the run part-way through.
    complete = data.dropna(subset=[orig_col, aave_col, value_col])
    skipped = len(data) - len(complete)
    if skipped:
        print(f"Skipping {skipped} row(s) with missing text.")

    # Extract relevant columns as plain strings
    originals = complete[orig_col].astype(str).tolist()
    aave_texts = complete[aave_col].astype(str).tolist()
    value_texts = complete[value_col].astype(str).tolist()

    # Create the output directory up front so a bad path fails before the slow scoring loop
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Lists to store individual scores
    aave_scores = []
    value_scores = []

    # Calculate BARTScore for each row and print progress
    for row_number, (orig, aave, value) in enumerate(zip(originals, aave_texts, value_texts), start=1):
        aave_score = scorer.score([orig], [aave], batch_size=1)[0]
        value_score = scorer.score([orig], [value], batch_size=1)[0]

        aave_scores.append(aave_score)
        value_scores.append(value_score)

        print(f"Processed row {row_number} AAVE ({aave_score})")
        print(f"Processed row {row_number} VALUE ({value_score})")

    # Calculate average scores (NaN when there was nothing to score)
    average_aave_score = np.mean(aave_scores) if aave_scores else float('nan')
    average_value_score = np.mean(value_scores) if value_scores else float('nan')

    # Save results to a text file
    with open(output_file, 'w') as f:
        f.write(f'Average BARTScore for AAVE - {average_aave_score}\n')
        f.write(f'Average BARTScore for VALUE - {average_value_score}\n')

    print(f"Scores calculated and saved to {output_file}.")

# COPA choice 1: compare the AAVE and VALUE versions with the original choice text
calculate_scores(
    '/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/COPA/VALUE COPA Choice 1.csv',  # file_path
    'Original Choice 1',  # orig_col
    'AAVE Choice 1',  # aave_col
    'VALUE Choice 1',  # value_col
    '/content/drive/MyDrive/Algoverse/New Results/BARTScores/COPA/COPA Choice 1 BARTScores.txt',  # output_file
)

# COPA choice 2: the same comparison for the second choice
calculate_scores(
    '/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/COPA/VALUE COPA Choice 2.csv',  # file_path
    'Original Choice 2',  # orig_col
    'AAVE Choice 2',  # aave_col
    'VALUE Choice 2',  # value_col
    '/content/drive/MyDrive/Algoverse/New Results/BARTScores/COPA/COPA Choice 2 BARTScores.txt',  # output_file
)

# COPA premise: the same comparison for the premise text
calculate_scores(
    '/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/COPA/VALUE COPA Premise.csv',  # file_path
    'Original Premise',  # orig_col
    'AAVE Premise',  # aave_col
    'VALUE Premise',  # value_col
    '/content/drive/MyDrive/Algoverse/New Results/BARTScores/COPA/COPA Premise BARTScores.txt',  # output_file
)

Processed row 1 AAVE (-2.199928045272827)
Processed row 1 VALUE (-2.7404425144195557)
Processed row 2 AAVE (-0.49116766452789307)
Processed row 2 VALUE (-3.3421857357025146)
Processed row 3 AAVE (-1.369706153869629)
Processed row 3 VALUE (-4.724113464355469)
Processed row 4 AAVE (-1.0999774932861328)
Processed row 4 VALUE (-1.0902539491653442)
Processed row 5 AAVE (-2.793205499649048)
Processed row 5 VALUE (-2.712872266769409)
Processed row 6 AAVE (-0.5299605131149292)
Processed row 6 VALUE (-2.680142402648926)
Processed row 7 AAVE (-3.7252204418182373)
Processed row 7 VALUE (-2.736171007156372)
Processed row 8 AAVE (-0.41634872555732727)
Processed row 8 VALUE (-2.044572114944458)
Processed row 9 AAVE (-0.3562026619911194)
Processed row 9 VALUE (-3.651721239089966)
Processed row 10 AAVE (-0.9357743263244629)
Processed row 10 VALUE (-2.7153162956237793)
Processed row 11 AAVE (-1.1978027820587158)
Processed row 11 VALUE (-3.951671838760376)
Processed row 12 AAVE (-2.3226068019866943)
Pro

In [4]:
# Calculate CoLa BARTScore

import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import torch.nn as nn
import numpy as np
from typing import List

class BARTScorer:
    """Scores target texts by their average token log-likelihood under a BART seq2seq model.

    The source text is fed to the encoder and the target text is supplied as the
    decoder labels. A score is the negated mean negative log-likelihood per target
    token, so scores are at most zero and values closer to zero mean the model
    finds the target more likely given the source.
    """

    def __init__(self, device='cuda', max_length=1024, checkpoint='facebook/bart-large-cnn'):
        """Load the tokenizer and model from `checkpoint` and move the model to `device`.

        Args:
            device: Torch device string the model and the input tensors are placed on.
            max_length: Token limit passed to the tokenizer; longer texts are truncated.
            checkpoint: Name or path given to `from_pretrained` for tokenizer and model.
        """
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss: per-token NLL (no reduction) that ignores padding positions
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        # score() flattens the logits to (tokens, vocab), so dim=1 is the vocabulary axis
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """Load model weights from paraphrase finetuning.

        Args:
            path: State-dict file to load; defaults to 'models/bart.pth' when None.
        """
        if path is None:
            path = 'models/bart.pth'
        # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def score(self, srcs, tgts, batch_size=4):
        """Score a list of (source, target) pairs.

        Args:
            srcs: List of source strings.
            tgts: List of target strings, aligned index-by-index with `srcs`.
            batch_size: Number of pairs tokenized and run through the model together.

        Returns:
            A list with one float per pair, in input order: the negated sum of the
            target's per-token negative log-likelihoods divided by the number of
            target tokens counted by the tokenizer's attention mask.

        Raises:
            RuntimeError: Re-raised unchanged after the offending batch has been
                printed, so the caller sees the real failure.
        """
        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    # Number of non-padding target tokens per example
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    # Flatten to (batch * target_len, vocab) for the token-level loss
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    # Back to (batch, target_len), then average over each target's tokens
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                # Show which batch failed, then let the original error propagate.
                # (The previous handler used the never-imported `traceback` module and
                # `exit(0)`, which hid the real error behind a NameError.)
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                raise
        return score_list

# Initialize BARTScorer on the GPU when one is available, otherwise fall back to the CPU
# (much slower, but the cell no longer fails on a runtime without CUDA)
bart_scorer = BARTScorer(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    checkpoint='facebook/bart-large-cnn',
)

# Function to calculate scores
def calculate_scores(file_path, orig_col, aave_col, value_col, output_file, scorer=None):
    """Score AAVE and VALUE texts against their originals and save the two averages.

    Reads the CSV at `file_path`, scores each row's `aave_col` and `value_col`
    text against the `orig_col` text, prints every score as it is computed, and
    writes the two mean scores to `output_file`.

    Args:
        file_path: CSV file to read.
        orig_col: Column holding the original text (used as the scoring source).
        aave_col: Column holding the AAVE text (scored as a target).
        value_col: Column holding the VALUE text (scored as a target).
        output_file: Text file the two averages are written to; missing parent
            directories are created.
        scorer: Object with a `score(srcs, tgts, batch_size=...)` method that
            returns a list of floats. Defaults to the module-level `bart_scorer`.

    Raises:
        KeyError: If one of the requested columns is not in the CSV.
    """
    import os  # local import: the cell's import block does not include os

    if scorer is None:
        scorer = bart_scorer

    # Load CSV file
    data = pd.read_csv(file_path)

    # Keep only rows where all three texts are present: pandas reads empty cells
    # as NaN (a float, not text), which would abort the run part-way through.
    complete = data.dropna(subset=[orig_col, aave_col, value_col])
    skipped = len(data) - len(complete)
    if skipped:
        print(f"Skipping {skipped} row(s) with missing text.")

    # Extract relevant columns as plain strings
    originals = complete[orig_col].astype(str).tolist()
    aave_texts = complete[aave_col].astype(str).tolist()
    value_texts = complete[value_col].astype(str).tolist()

    # Create the output directory up front so a bad path fails before the slow scoring loop
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Lists to store individual scores
    aave_scores = []
    value_scores = []

    # Calculate BARTScore for each row and print progress
    for row_number, (orig, aave, value) in enumerate(zip(originals, aave_texts, value_texts), start=1):
        aave_score = scorer.score([orig], [aave], batch_size=1)[0]
        value_score = scorer.score([orig], [value], batch_size=1)[0]

        aave_scores.append(aave_score)
        value_scores.append(value_score)

        print(f"Processed row {row_number} AAVE ({aave_score})")
        print(f"Processed row {row_number} VALUE ({value_score})")

    # Calculate average scores (NaN when there was nothing to score)
    average_aave_score = np.mean(aave_scores) if aave_scores else float('nan')
    average_value_score = np.mean(value_scores) if value_scores else float('nan')

    # Save results to a text file
    with open(output_file, 'w') as f:
        f.write(f'Average BARTScore for AAVE - {average_aave_score}\n')
        f.write(f'Average BARTScore for VALUE - {average_value_score}\n')

    print(f"Scores calculated and saved to {output_file}.")

# CoLA sentences: compare the AAVE and VALUE sentences with the original sentences
calculate_scores(
    '/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/CoLa/VALUE CoLA Sentences.csv',  # file_path
    'Original Sentence',  # orig_col
    'AAVE Sentence',  # aave_col
    'VALUE Sentence',  # value_col
    '/content/drive/MyDrive/Algoverse/New Results/BARTScores/CoLa/CoLA Sentences BARTScore.txt',  # output_file
)

Processed row 1 AAVE (-3.264983654022217)
Processed row 1 VALUE (-2.140676736831665)
Processed row 2 AAVE (-1.787852168083191)
Processed row 2 VALUE (-1.9354026317596436)
Processed row 3 AAVE (-1.72491455078125)
Processed row 3 VALUE (-1.49046790599823)
Processed row 4 AAVE (-1.993460774421692)
Processed row 4 VALUE (-5.168051719665527)
Processed row 5 AAVE (-1.8621495962142944)
Processed row 5 VALUE (-1.2571572065353394)
Processed row 6 AAVE (-2.4705440998077393)
Processed row 6 VALUE (-3.6694610118865967)
Processed row 7 AAVE (-2.1518948078155518)
Processed row 7 VALUE (-2.479916572570801)
Processed row 8 AAVE (-2.0987601280212402)
Processed row 8 VALUE (-3.3421988487243652)
Processed row 9 AAVE (-0.33924365043640137)
Processed row 9 VALUE (-2.6478207111358643)
Processed row 10 AAVE (-2.011014938354492)
Processed row 10 VALUE (-3.1627233028411865)
Processed row 11 AAVE (-2.6099493503570557)
Processed row 11 VALUE (-1.3162412643432617)
Processed row 12 AAVE (-2.9712975025177)
Processe

In [None]:
# Calculate BARTScore for SST-2

import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import torch.nn as nn
import numpy as np
from typing import List

class BARTScorer:
    """Scores target texts by their average token log-likelihood under a BART seq2seq model.

    The source text is fed to the encoder and the target text is supplied as the
    decoder labels. A score is the negated mean negative log-likelihood per target
    token, so scores are at most zero and values closer to zero mean the model
    finds the target more likely given the source.
    """

    def __init__(self, device='cuda', max_length=1024, checkpoint='facebook/bart-large-cnn'):
        """Load the tokenizer and model from `checkpoint` and move the model to `device`.

        Args:
            device: Torch device string the model and the input tensors are placed on.
            max_length: Token limit passed to the tokenizer; longer texts are truncated.
            checkpoint: Name or path given to `from_pretrained` for tokenizer and model.
        """
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss: per-token NLL (no reduction) that ignores padding positions
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        # score() flattens the logits to (tokens, vocab), so dim=1 is the vocabulary axis
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """Load model weights from paraphrase finetuning.

        Args:
            path: State-dict file to load; defaults to 'models/bart.pth' when None.
        """
        if path is None:
            path = 'models/bart.pth'
        # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def score(self, srcs, tgts, batch_size=4):
        """Score a list of (source, target) pairs.

        Args:
            srcs: List of source strings.
            tgts: List of target strings, aligned index-by-index with `srcs`.
            batch_size: Number of pairs tokenized and run through the model together.

        Returns:
            A list with one float per pair, in input order: the negated sum of the
            target's per-token negative log-likelihoods divided by the number of
            target tokens counted by the tokenizer's attention mask.

        Raises:
            RuntimeError: Re-raised unchanged after the offending batch has been
                printed, so the caller sees the real failure.
        """
        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    # Number of non-padding target tokens per example
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    # Flatten to (batch * target_len, vocab) for the token-level loss
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    # Back to (batch, target_len), then average over each target's tokens
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                # Show which batch failed, then let the original error propagate.
                # (The previous handler used the never-imported `traceback` module and
                # `exit(0)`, which hid the real error behind a NameError.)
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                raise
        return score_list

# Initialize BARTScorer on the GPU when one is available, otherwise fall back to the CPU
# (much slower, but the cell no longer fails on a runtime without CUDA)
bart_scorer = BARTScorer(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    checkpoint='facebook/bart-large-cnn',
)

# Function to calculate scores
def calculate_scores(file_path, orig_col, aave_col, value_col, output_file, scorer=None):
    """Score AAVE and VALUE texts against their originals and save the two averages.

    Reads the CSV at `file_path`, scores each row's `aave_col` and `value_col`
    text against the `orig_col` text, prints every score as it is computed, and
    writes the two mean scores to `output_file`.

    Args:
        file_path: CSV file to read.
        orig_col: Column holding the original text (used as the scoring source).
        aave_col: Column holding the AAVE text (scored as a target).
        value_col: Column holding the VALUE text (scored as a target).
        output_file: Text file the two averages are written to; missing parent
            directories are created.
        scorer: Object with a `score(srcs, tgts, batch_size=...)` method that
            returns a list of floats. Defaults to the module-level `bart_scorer`.

    Raises:
        KeyError: If one of the requested columns is not in the CSV.
    """
    import os  # local import: the cell's import block does not include os

    if scorer is None:
        scorer = bart_scorer

    # Load CSV file
    data = pd.read_csv(file_path)

    # Keep only rows where all three texts are present: pandas reads empty cells
    # as NaN (a float, not text), which would abort the run part-way through.
    complete = data.dropna(subset=[orig_col, aave_col, value_col])
    skipped = len(data) - len(complete)
    if skipped:
        print(f"Skipping {skipped} row(s) with missing text.")

    # Extract relevant columns as plain strings
    originals = complete[orig_col].astype(str).tolist()
    aave_texts = complete[aave_col].astype(str).tolist()
    value_texts = complete[value_col].astype(str).tolist()

    # Create the output directory up front so a bad path fails before the slow scoring loop
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Lists to store individual scores
    aave_scores = []
    value_scores = []

    # Calculate BARTScore for each row and print progress
    for row_number, (orig, aave, value) in enumerate(zip(originals, aave_texts, value_texts), start=1):
        aave_score = scorer.score([orig], [aave], batch_size=1)[0]
        value_score = scorer.score([orig], [value], batch_size=1)[0]

        aave_scores.append(aave_score)
        value_scores.append(value_score)

        print(f"Processed row {row_number} AAVE ({aave_score})")
        print(f"Processed row {row_number} VALUE ({value_score})")

    # Calculate average scores (NaN when there was nothing to score)
    average_aave_score = np.mean(aave_scores) if aave_scores else float('nan')
    average_value_score = np.mean(value_scores) if value_scores else float('nan')

    # Save results to a text file
    with open(output_file, 'w') as f:
        f.write(f'Average BARTScore for AAVE - {average_aave_score}\n')
        f.write(f'Average BARTScore for VALUE - {average_value_score}\n')

    print(f"Scores calculated and saved to {output_file}.")

# SST-2 sentences: compare the AAVE and VALUE sentences with the original sentences
calculate_scores(
    '/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/SST-2/VALUE SST-2 Sentences.csv',  # file_path
    'Original Sentence',  # orig_col
    'AAVE Sentence',  # aave_col
    'VALUE Sentence',  # value_col
    '/content/drive/MyDrive/Algoverse/New Results/BARTScores/SST-2/SST-2 Sentences BARTScore.txt',  # output_file
)

In [6]:
# Calculate BARTScore for MultiRC

import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import torch.nn as nn
import numpy as np
from typing import List

class BARTScorer:
    """Scores target texts by their average token log-likelihood under a BART seq2seq model.

    The source text is fed to the encoder and the target text is supplied as the
    decoder labels. A score is the negated mean negative log-likelihood per target
    token, so scores are at most zero and values closer to zero mean the model
    finds the target more likely given the source.
    """

    def __init__(self, device='cuda', max_length=1024, checkpoint='facebook/bart-large-cnn'):
        """Load the tokenizer and model from `checkpoint` and move the model to `device`.

        Args:
            device: Torch device string the model and the input tensors are placed on.
            max_length: Token limit passed to the tokenizer; longer texts are truncated.
            checkpoint: Name or path given to `from_pretrained` for tokenizer and model.
        """
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss: per-token NLL (no reduction) that ignores padding positions
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        # score() flattens the logits to (tokens, vocab), so dim=1 is the vocabulary axis
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """Load model weights from paraphrase finetuning.

        Args:
            path: State-dict file to load; defaults to 'models/bart.pth' when None.
        """
        if path is None:
            path = 'models/bart.pth'
        # NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def score(self, srcs, tgts, batch_size=4):
        """Score a list of (source, target) pairs.

        Args:
            srcs: List of source strings.
            tgts: List of target strings, aligned index-by-index with `srcs`.
            batch_size: Number of pairs tokenized and run through the model together.

        Returns:
            A list with one float per pair, in input order: the negated sum of the
            target's per-token negative log-likelihoods divided by the number of
            target tokens counted by the tokenizer's attention mask.

        Raises:
            RuntimeError: Re-raised unchanged after the offending batch has been
                printed, so the caller sees the real failure.
        """
        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    # Number of non-padding target tokens per example
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    # Flatten to (batch * target_len, vocab) for the token-level loss
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    # Back to (batch, target_len), then average over each target's tokens
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                # Show which batch failed, then let the original error propagate.
                # (The previous handler used the never-imported `traceback` module and
                # `exit(0)`, which hid the real error behind a NameError.)
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                raise
        return score_list

# Initialize BARTScorer on the GPU when one is available, otherwise fall back to the CPU
# (much slower, but the cell no longer fails on a runtime without CUDA)
bart_scorer = BARTScorer(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    checkpoint='facebook/bart-large-cnn',
)

# Function to calculate scores
def calculate_scores(file_path, orig_col, aave_col, value_col, output_file, scorer=None):
    """Score AAVE and VALUE texts against their originals and save the two averages.

    Reads the CSV at `file_path`, scores each row's `aave_col` and `value_col`
    text against the `orig_col` text, prints every score as it is computed, and
    writes the two mean scores to `output_file`.

    Args:
        file_path: CSV file to read.
        orig_col: Column holding the original text (used as the scoring source).
        aave_col: Column holding the AAVE text (scored as a target).
        value_col: Column holding the VALUE text (scored as a target).
        output_file: Text file the two averages are written to; missing parent
            directories are created.
        scorer: Object with a `score(srcs, tgts, batch_size=...)` method that
            returns a list of floats. Defaults to the module-level `bart_scorer`.

    Raises:
        KeyError: If one of the requested columns is not in the CSV.
    """
    import os  # local import: the cell's import block does not include os

    if scorer is None:
        scorer = bart_scorer

    # Load CSV file
    data = pd.read_csv(file_path)

    # Keep only rows where all three texts are present: pandas reads empty cells
    # as NaN (a float, not text), which would abort the run part-way through.
    complete = data.dropna(subset=[orig_col, aave_col, value_col])
    skipped = len(data) - len(complete)
    if skipped:
        print(f"Skipping {skipped} row(s) with missing text.")

    # Extract relevant columns as plain strings
    originals = complete[orig_col].astype(str).tolist()
    aave_texts = complete[aave_col].astype(str).tolist()
    value_texts = complete[value_col].astype(str).tolist()

    # Create the output directory up front so a bad path fails before the slow scoring loop
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Lists to store individual scores
    aave_scores = []
    value_scores = []

    # Calculate BARTScore for each row and print progress
    for row_number, (orig, aave, value) in enumerate(zip(originals, aave_texts, value_texts), start=1):
        aave_score = scorer.score([orig], [aave], batch_size=1)[0]
        value_score = scorer.score([orig], [value], batch_size=1)[0]

        aave_scores.append(aave_score)
        value_scores.append(value_score)

        print(f"Processed row {row_number} AAVE ({aave_score})")
        print(f"Processed row {row_number} VALUE ({value_score})")

    # Calculate average scores (NaN when there was nothing to score)
    average_aave_score = np.mean(aave_scores) if aave_scores else float('nan')
    average_value_score = np.mean(value_scores) if value_scores else float('nan')

    # Save results to a text file
    with open(output_file, 'w') as f:
        f.write(f'Average BARTScore for AAVE - {average_aave_score}\n')
        f.write(f'Average BARTScore for VALUE - {average_value_score}\n')

    print(f"Scores calculated and saved to {output_file}.")

# MultiRC paragraphs: compare the AAVE and VALUE paragraphs with the original paragraphs
calculate_scores(
    '/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/MultiRC/VALUE MultiRC Paragraph.csv',  # file_path
    'Original Paragraph',  # orig_col
    'AAVE Paragraph',  # aave_col
    'VALUE Paragraph',  # value_col
    '/content/drive/MyDrive/Algoverse/New Results/BARTScores/MultiRC/MultiRC Paragraph BARTScore.txt',  # output_file
)

# MultiRC questions: the same comparison for the question texts
calculate_scores(
    '/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/MultiRC/VALUE MultiRC Question.csv',  # file_path
    'Original Question',  # orig_col
    'AAVE Question',  # aave_col
    'VALUE Question',  # value_col
    '/content/drive/MyDrive/Algoverse/New Results/BARTScores/MultiRC/MultiRC Question BARTScore.txt',  # output_file
)

Processed row 1 AAVE (-1.5288745164871216)
Processed row 1 VALUE (-1.8607029914855957)
Processed row 2 AAVE (-1.2540303468704224)
Processed row 2 VALUE (-2.3254199028015137)
Processed row 3 AAVE (-1.328806757926941)
Processed row 3 VALUE (-1.5158790349960327)
Processed row 4 AAVE (-2.1360535621643066)
Processed row 4 VALUE (-1.7295653820037842)
Processed row 5 AAVE (-1.7731480598449707)
Processed row 5 VALUE (-1.7956572771072388)
Processed row 6 AAVE (-1.708693504333496)
Processed row 6 VALUE (-1.3175506591796875)
Processed row 7 AAVE (-3.4496231079101562)
Processed row 7 VALUE (-1.7944916486740112)
Scores calculated and saved to /content/drive/MyDrive/Algoverse/New Results/BARTScores/MultiRC/MultiRC Paragraph BARTScore.txt.
Processed row 1 AAVE (-1.0562183856964111)
Processed row 1 VALUE (-2.2469518184661865)
Processed row 2 AAVE (-2.0235087871551514)
Processed row 2 VALUE (-2.757584810256958)
Processed row 3 AAVE (-1.5039598941802979)
Processed row 3 VALUE (-1.5725202560424805)
Proce

In [7]:
# Calculate BARTScore for WSC

import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import torch
import torch.nn as nn
import numpy as np
from typing import List

class BARTScorer:
    """Scores (source, target) text pairs with a BART conditional-generation model.

    The score of a pair is the negated token-level NLL of the target given the
    source, summed over the target tokens and divided by the number of
    non-padding target tokens. Scores are therefore <= 0 and higher is better.
    """

    def __init__(self, device='cuda', max_length=1024, checkpoint='facebook/bart-large-cnn'):
        """Load the tokenizer and model for ``checkpoint`` and move the model to ``device``.

        Args:
            device: Torch device string the model and input tensors are moved to.
            max_length: Maximum token length; longer inputs are truncated.
            checkpoint: Name or path passed to ``from_pretrained``.
        """
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss: per-token NLL (no reduction); padding positions contribute 0.
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        # dim=1 is the vocabulary axis once the logits are flattened to (tokens, vocab).
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """ Load model from paraphrase finetuning

        Args:
            path: State-dict file to load; defaults to ``'models/bart.pth'``.
        """
        if path is None:
            path = 'models/bart.pth'
        # NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def score(self, srcs, tgts, batch_size=4):
        """ Score a batch of examples

        Args:
            srcs: List of source texts (fed to the encoder).
            tgts: List of target texts, aligned index-by-index with ``srcs``.
            batch_size: Number of pairs pushed through the model at once.

        Returns:
            A list of floats, one score per pair, in input order.

        Raises:
            RuntimeError: Re-raised unchanged if tokenization/model execution
                fails for a batch, after printing the offending batch.
        """
        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    # Number of non-padding target tokens per example (normalizer).
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    # Flatten to (batch * tgt_len, vocab) so NLLLoss sees one row per token.
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    # Back to (batch, tgt_len), then length-normalize per example.
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                # Show which batch failed, then propagate the original error.
                # The previous handler called an un-imported `traceback` module
                # (masking the real error with a NameError) and then exit(0),
                # which signals success; re-raising keeps the real traceback.
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                raise
        return score_list

# Build the module-level scorer instance that calculate_scores uses for every row
bart_scorer = BARTScorer(
    checkpoint='facebook/bart-large-cnn',
    device='cuda',
)

# Function to calculate scores
def calculate_scores(file_path, orig_col, aave_col, value_col, output_file, scorer=None):
    """Score AAVE and VALUE texts against their originals and save the averages.

    Each row of the CSV is scored twice (original -> AAVE, original -> VALUE)
    with ``scorer.score``; per-row scores are printed as progress and the two
    mean scores are written to ``output_file``.

    Args:
        file_path: Path of the CSV file to read.
        orig_col: Column holding the original text (used as the source).
        aave_col: Column holding the AAVE text (scored as a target).
        value_col: Column holding the VALUE text (scored as a target).
        output_file: Text file the two averages are written to (overwritten).
        scorer: Object exposing ``score(srcs, tgts, batch_size=...)``. When
            omitted, the notebook-level ``bart_scorer`` is used, so existing
            calls behave as before.

    Returns:
        None. Results are printed and written to ``output_file``.
    """
    import os  # local import: the cell's import block does not provide it

    # Resolve the scorer explicitly instead of silently relying on a global.
    if scorer is None:
        scorer = bart_scorer

    # Load CSV file
    data = pd.read_csv(file_path)

    # Extract relevant columns
    originals = data[orig_col].tolist()
    aave_texts = data[aave_col].tolist()
    value_texts = data[value_col].tolist()

    # Create the destination folder *before* the slow scoring loop so that a
    # missing directory cannot throw away a finished run at write time.
    output_dir = os.path.dirname(output_file)
    if output_dir:  # empty when output_file is a bare file name
        os.makedirs(output_dir, exist_ok=True)

    # Lists to store individual scores
    aave_scores = []
    value_scores = []

    # Calculate BARTScore for each row and print progress
    for row_number, (orig, aave, value) in enumerate(zip(originals, aave_texts, value_texts), start=1):
        aave_score = scorer.score([orig], [aave], batch_size=1)[0]
        value_score = scorer.score([orig], [value], batch_size=1)[0]

        aave_scores.append(aave_score)
        value_scores.append(value_score)

        print(f"Processed row {row_number} AAVE ({aave_score})")
        print(f"Processed row {row_number} VALUE ({value_score})")

    # Calculate average scores
    average_aave_score = np.mean(aave_scores)
    average_value_score = np.mean(value_scores)

    # Save results to a text file
    with open(output_file, 'w') as f:
        f.write(f'Average BARTScore for AAVE - {average_aave_score}\n')
        f.write(f'Average BARTScore for VALUE - {average_value_score}\n')

    print(f"Scores calculated and saved to {output_file}.")

# Score the WSC paragraph file and write its average BARTScores
calculate_scores(**{
    'file_path': '/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/WSC/Filtered WSC Paragraphs.csv',
    'orig_col': 'Original Paragraph',
    'aave_col': 'AAVE Paragraph',
    'value_col': 'VALUE Paragraph',
    'output_file': '/content/drive/MyDrive/Algoverse/New Results/BARTScores/WSC/WSC Paragraph BARTScore.txt',
})

Processed row 1 AAVE (-2.3062005043029785)
Processed row 1 VALUE (-2.343722343444824)
Processed row 2 AAVE (-2.601876735687256)
Processed row 2 VALUE (-1.782159447669983)
Processed row 3 AAVE (-1.996494174003601)
Processed row 3 VALUE (-3.6474337577819824)
Processed row 4 AAVE (-1.5172765254974365)
Processed row 4 VALUE (-2.508758068084717)
Processed row 5 AAVE (-2.1244678497314453)
Processed row 5 VALUE (-3.5550537109375)
Processed row 6 AAVE (-2.8356475830078125)
Processed row 6 VALUE (-3.3382790088653564)
Processed row 7 AAVE (-3.2370338439941406)
Processed row 7 VALUE (-1.6622427701950073)
Processed row 8 AAVE (-2.154174327850342)
Processed row 8 VALUE (-2.6848835945129395)
Processed row 9 AAVE (-2.2396440505981445)
Processed row 9 VALUE (-4.5162506103515625)
Processed row 10 AAVE (-1.2401682138442993)
Processed row 10 VALUE (-1.9747599363327026)
Processed row 11 AAVE (-1.922599196434021)
Processed row 11 VALUE (-2.69136381149292)
Processed row 12 AAVE (-1.5904383659362793)
Process