In [None]:
# Mount Google Drive at /content/drive; the data and metric paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

# Notebook shell escape: install the third-party packages this notebook imports
# (protobuf is pinned to 3.20.0).
!pip install --quiet nltk rouge-score transformers torch torchvision torchaudio pandas numpy tqdm protobuf==3.20.0

import os
import re
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
import torch
import torch.nn as nn
from transformers import BartTokenizer, BartForConditionalGeneration
from tqdm import tqdm
from rouge_score import rouge_scorer
import IPython

# word_tokenize needs the Punkt tokenizer data. Recent NLTK releases load it from
# the 'punkt_tab' resource while older ones use 'punkt'; fetch both so that
# word_tokenize does not raise LookupError on either.
for _punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(_punkt_resource)

def lexical_diversity(text):
    """Return the type-token ratio of ``text``.

    The text is split with NLTK's ``word_tokenize``; the result is the number
    of distinct tokens divided by the total number of tokens, or 0 when the
    text yields no tokens.
    """
    tokens = word_tokenize(text)
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

def process_lexical_diversity(p, c, o):
    """Score column ``c`` of the CSV at ``p`` and write the results into ``o``.

    Two files are written, both named after the input file (without its
    extension):

    * ``<name>_lexical_diversity.csv`` - the input rows plus a
      ``LexicalDiversity (<c>)`` column;
    * ``<name>_average_lexical_diversity.txt`` - the mean of that column.

    Cells that are not strings (e.g. NaN for empty cells) score 0.
    """
    frame = pd.read_csv(p)
    cells = tqdm(frame[c].items(), total=len(frame), desc="")
    scores = [
        lexical_diversity(cell) if isinstance(cell, str) else 0
        for _, cell in cells
    ]
    frame[f"LexicalDiversity ({c})"] = scores

    os.makedirs(o, exist_ok=True)
    stem, _ext = os.path.splitext(os.path.basename(p))
    frame.to_csv(os.path.join(o, f"{stem}_lexical_diversity.csv"), index=False)

    # np.mean of an empty list would give NaN (with a warning); report 0 instead.
    if len(scores):
        mean_score = np.mean(scores)
    else:
        mean_score = 0
    summary_path = os.path.join(o, f"{stem}_average_lexical_diversity.txt")
    with open(summary_path, "w") as handle:
        handle.write(f"Average Lexical Diversity: {mean_score}\n")

class BARTScorer:
    """Score target texts by their log-likelihood under a BART model given a source."""

    def __init__(self, d='cuda', m=1024, c='facebook/bart-large-cnn'):
        """Load the tokenizer and model.

        d: torch device that the model and every input batch are moved to.
        m: ``max_length`` used when tokenizing (longer inputs are truncated).
        c: checkpoint name handed to ``from_pretrained``.
        """
        self.d = d
        self.m = m
        self.tok = BartTokenizer.from_pretrained(c)
        self.mod = BartForConditionalGeneration.from_pretrained(c)
        self.mod.eval()
        self.mod.to(d)
        # Per-token negative log-likelihood; padding positions contribute 0.
        self.lf = nn.NLLLoss(
            reduction='none',
            ignore_index=self.mod.config.pad_token_id,
        )
        self.ls = nn.LogSoftmax(dim=1)

    def _encode(self, texts):
        """Tokenize ``texts`` and return ``(input_ids, attention_mask)`` on ``self.d``."""
        encoded = self.tok(
            texts,
            max_length=self.m,
            truncation=True,
            padding=True,
            return_tensors='pt',
        )
        return encoded['input_ids'].to(self.d), encoded['attention_mask'].to(self.d)

    def score(self, srcs, tgts, b=4):
        """Return one score per ``(srcs[i], tgts[i])`` pair, processed ``b`` at a time.

        Each score is the negated summed token NLL of the target divided by the
        target's attention-mask length, so values closer to 0 mean the target
        is more likely given the source. An empty ``srcs`` yields ``[]``.
        """
        results = []
        for start in range(0, len(srcs), b):
            stop = start + b
            with torch.no_grad():
                src_ids, src_mask = self._encode(srcs[start:stop])
                tgt_ids, tgt_mask = self._encode(tgts[start:stop])
                tgt_len = tgt_mask.sum(dim=1)
                output = self.mod(
                    input_ids=src_ids,
                    attention_mask=src_mask,
                    labels=tgt_ids,
                )
                # Flatten to (batch * seq, vocab) so NLLLoss sees one row per token.
                flat_logits = output.logits.view(-1, self.mod.config.vocab_size)
                token_nll = self.lf(self.ls(flat_logits), tgt_ids.view(-1))
                seq_nll = token_nll.view(tgt_ids.shape[0], -1).sum(dim=1) / tgt_len
                results.extend(-value for value in seq_nll.tolist())
        return results

# Module-level scorer used by process_bartscore; runs on CUDA when available, else CPU.
bsc = BARTScorer('cuda' if torch.cuda.is_available() else 'cpu')

def process_bartscore(p, c, o):
    """Score column ``c`` against ``Original`` with BARTScore and save the results.

    Reads the CSV at ``p``. If it has no ``Original`` column or no ``c``
    column, returns without writing anything. Otherwise rows with a missing
    value in either column are dropped, every remaining pair is scored with
    the module-level ``bsc`` scorer (``Original`` as source, ``c`` as target),
    and two files named after the input are written into ``o``:
    ``<name>_bartscore.csv`` (rows plus a ``BARTScore (<c>)`` column) and
    ``<name>_bartscore.txt`` (the mean score, NaN when no rows remain).
    """
    frame = pd.read_csv(p)
    if any(name not in frame.columns for name in ('Original', c)):
        return

    frame = frame.dropna(subset=['Original', c])
    sources = frame['Original'].astype(str).tolist()
    candidates = frame[c].astype(str).tolist()

    pairs = tqdm(zip(sources, candidates), total=len(sources), desc="")
    scores = [bsc.score([src], [cand], 1)[0] for src, cand in pairs]
    frame[f"BARTScore ({c})"] = scores

    os.makedirs(o, exist_ok=True)
    stem, _ext = os.path.splitext(os.path.basename(p))
    frame.to_csv(os.path.join(o, f"{stem}_bartscore.csv"), index=False)

    if len(scores):
        mean_score = np.mean(scores)
    else:
        mean_score = float('nan')
    with open(os.path.join(o, f"{stem}_bartscore.txt"), "w") as handle:
        handle.write(f"Average BARTScore: {mean_score}\n")

_ROUGE_TYPES = ("rouge1", "rouge2", "rougeL")
_rouge_scorer_cache = None


def _get_rouge_scorer():
    """Return the shared RougeScorer, building it on first use."""
    global _rouge_scorer_cache
    if _rouge_scorer_cache is None:
        _rouge_scorer_cache = rouge_scorer.RougeScorer(
            list(_ROUGE_TYPES), use_stemmer=True
        )
    return _rouge_scorer_cache


def single_rouge(r, h):
    """Return the mean ROUGE-1/ROUGE-2/ROUGE-L recall of ``h`` against ``r``.

    r: reference text (passed to the scorer as the target).
    h: hypothesis text (passed to the scorer as the prediction).

    Returns 0 when either value is falsy (empty string, ``None``, 0) or
    missing according to ``pd.isna``. Any other non-string value is converted
    with ``str()`` before scoring, so a numeric CSV cell no longer makes the
    scorer raise.
    """
    if not r or not h or pd.isna(r) or pd.isna(h):
        return 0
    # Reuse one scorer: constructing it (and its stemmer) per row is wasted work.
    sc = _get_rouge_scorer().score(str(r), str(h))
    return sum(sc[name].recall for name in _ROUGE_TYPES) / 3

def process_rouge(p, c, o):
    """Score column ``c`` against ``Original`` with ``single_rouge`` and save the results.

    Reads the CSV at ``p``. If it has no ``Original`` column or no ``c``
    column, returns without writing anything. Otherwise every row is scored
    and two files named after the input are written into ``o``:
    ``<name>_rouge.csv`` (rows plus a ``ROUGE (<c>)`` column) and
    ``<name>_average_rouge.txt`` (the mean score with four decimals, 0 when
    the file has no rows).
    """
    frame = pd.read_csv(p)
    if any(name not in frame.columns for name in ('Original', c)):
        return

    rows = tqdm(frame.iterrows(), total=len(frame), desc="")
    scores = [single_rouge(row['Original'], row[c]) for _, row in rows]
    frame[f"ROUGE ({c})"] = scores

    os.makedirs(o, exist_ok=True)
    stem, _ext = os.path.splitext(os.path.basename(p))
    frame.to_csv(os.path.join(o, f"{stem}_rouge.csv"), index=False)

    if len(scores):
        mean_score = np.mean(scores)
    else:
        mean_score = 0
    with open(os.path.join(o, f"{stem}_average_rouge.txt"), "w") as handle:
        handle.write(f"Average ROUGE: {mean_score:.4f}\n")

def main():
    """Compute BARTScore, lexical diversity and ROUGE for every dialect, dataset and mode.

    For each dialect and aligned CSV, and for each translation column in
    ``modes``, the three ``process_*`` functions are run and write into
    ``<base_metrics>/<metric>/<mode>/<dialect>/<dataset>``. A failure in one
    metric/file is reported on stdout and does not stop the remaining work.
    When running inside an IPython kernel, the kernel is asked to shut down
    with ``restart=True`` once everything has been processed.
    """
    dialects = ["AAVE", "IndE", "JamE", "CollSgE"]
    main_files = [
        "aligned_svamp700.csv", "aligned_mbpp374.csv", "aligned_logic_bench_yn500.csv",
        "aligned_logic_bench_mcq480.csv", "aligned_humaneval164.csv", "aligned_gsm8k1000.csv",
        "aligned_folio1000.csv",
    ]
    glue_files = [
        "aligned_wsc_659.csv", "aligned_sst-2_1000.csv", "aligned_multirc_1000.csv",
        "aligned_copa_500.csv", "aligned_boolq_1000.csv",
    ]
    base_root = "/content/drive/MyDrive/!!Multi-AAVENUE/Aligned Translations"
    base_metrics = "/content/drive/MyDrive/!!Multi-AAVENUE/Metrics"
    # Output folder label -> CSV column holding that system's translation.
    modes = {"GPT 4o": "Filtered GPT 4o", "Multi-VALUE": "Filtered Multi-VALUE"}
    # Output folder label -> function that computes and saves the metric.
    metrics = (
        ("BARTScore", process_bartscore),
        ("Lexical Diversity", process_lexical_diversity),
        ("ROUGE Score", process_rouge),
    )

    def ds_name(f):
        """Strip the 'aligned_' prefix, '.csv' suffix and trailing size digits."""
        if f.startswith("aligned_"):
            f = f[8:]
        if f.endswith(".csv"):
            f = f[:-4]
        # e.g. 'svamp700' -> 'svamp', 'wsc_659' -> 'wsc'
        return re.sub(r'(\_\d+|\d+)$', '', f)

    for dialect in dialects:
        files = [os.path.join(base_root, dialect, x) for x in main_files]
        files += [os.path.join(base_root, dialect, "GLUE + SuperGLUE", x) for x in glue_files]
        for cpath in files:
            dataset = ds_name(os.path.basename(cpath))
            for mode_label, col_name in modes.items():
                for metric_label, compute in metrics:
                    out_dir = os.path.join(base_metrics, metric_label, mode_label, dialect, dataset)
                    # Keep going after a failure, but report it: a bare
                    # `except: pass` hid every error and also trapped
                    # KeyboardInterrupt, so the run could not be interrupted.
                    try:
                        compute(cpath, col_name, out_dir)
                    except Exception as e:
                        print(f"Error computing {metric_label} for {cpath} ({mode_label}): {e}")

    # NOTE(review): do_shutdown(True) requests a kernel restart, presumably to
    # release the loaded model's memory; confirm this is intended.
    # Guarded so the function also finishes cleanly outside an IPython kernel.
    shell = IPython.get_ipython()
    kernel = getattr(shell, "kernel", None)
    if kernel is not None:
        kernel.do_shutdown(True)

# Run the whole metric pipeline when this cell/file is executed as the main module.
if __name__=="__main__":
    main()

In [None]:
## Standalone lexical diversity computation, because the cell above computes it incorrectly

from nltk.tokenize import word_tokenize
from nltk import download
import pandas as pd
import os
import numpy as np
from tqdm import tqdm

# Fetch the 'punkt_tab' tokenizer data used by word_tokenize in recent NLTK releases.
download('punkt_tab')

def lexical_diversity(text):
    """Return unique tokens / total tokens for ``text`` (0 when there are no tokens).

    Tokens are produced by NLTK's ``word_tokenize``.
    """
    words = word_tokenize(text)
    if not words:
        return 0
    return len(set(words)) / len(words)

def process_lexical_diversity(file_path, column_name, output_dir):
    """Score ``column_name`` of the CSV at ``file_path`` and save the results.

    Two files named after the input file (without its extension) are written
    into ``output_dir``:

    * ``<name>_lexical_diversity.csv`` - the input rows plus a
      ``Lexical Diversity (<column_name>)`` column;
    * ``<name>_average_lexical_diversity.txt`` - the mean score, four decimals.

    Non-string cells (e.g. NaN) are scored as the empty string.
    """
    frame = pd.read_csv(file_path)

    progress = tqdm(frame.iterrows(), total=len(frame), desc=f"Processing {column_name}")
    scores = []
    for _, record in progress:
        cell = record[column_name]
        scores.append(lexical_diversity(cell if isinstance(cell, str) else ""))

    frame[f"Lexical Diversity ({column_name})"] = scores

    os.makedirs(output_dir, exist_ok=True)
    stem, _ext = os.path.splitext(os.path.basename(file_path))
    frame.to_csv(os.path.join(output_dir, f"{stem}_lexical_diversity.csv"), index=False)

    # An empty file has no scores to average; report 0 rather than NaN.
    if scores:
        mean_score = np.mean(scores)
    else:
        mean_score = 0
    summary_path = os.path.join(output_dir, f"{stem}_average_lexical_diversity.txt")
    with open(summary_path, "w") as handle:
        handle.write(f"Average Lexical Diversity: {mean_score:.4f}\n")

    print(f"Saved results for {file_path} to {output_dir}")

def main():
    """Compute lexical diversity for every dialect, dataset and translation mode.

    Results go to ``<output_base>/<mode>/<dialect>/<dataset>``. The dataset
    name is the file name without its ``aligned_`` prefix, ``.csv`` suffix and
    trailing size digits (``aligned_logic_bench_yn500.csv`` ->
    ``logic_bench_yn``). Taking only the second ``_``-separated field mapped
    both logic_bench files to ``logic`` and left ``svamp700`` with its digits.
    Errors for a single file/mode are printed and do not stop the run.
    """
    # Local import: this cell does not import `re` at the top.
    import re

    dialects = ["AAVE", "IndE", "JamE", "CollSgE"]
    main_files = [
        "aligned_svamp700.csv", "aligned_mbpp374.csv", "aligned_logic_bench_yn500.csv",
        "aligned_logic_bench_mcq480.csv", "aligned_humaneval164.csv", "aligned_gsm8k1000.csv",
        "aligned_folio1000.csv",
    ]
    glue_files = [
        "aligned_wsc_659.csv", "aligned_sst-2_1000.csv", "aligned_multirc_1000.csv",
        "aligned_copa_500.csv", "aligned_boolq_1000.csv",
    ]

    base_path = "/content/drive/MyDrive/!!Multi-AAVENUE/Aligned Translations"
    output_base = "/content/drive/MyDrive/!!Multi-AAVENUE/Metrics/Lexical Diversity"

    # Output folder label -> CSV column holding that system's translation.
    modes = {
        "GPT 4o": "Filtered GPT 4o",
        "Multi-VALUE": "Filtered Multi-VALUE",
    }

    def dataset_name(file_name):
        """Derive the dataset folder name from a (possibly nested) file name."""
        stem = os.path.basename(file_name)
        if stem.startswith("aligned_"):
            stem = stem[len("aligned_"):]
        if stem.endswith(".csv"):
            stem = stem[:-len(".csv")]
        # Drop the trailing example count, with or without a leading underscore.
        return re.sub(r'(_\d+|\d+)$', '', stem)

    files = main_files + [os.path.join("GLUE + SuperGLUE", file) for file in glue_files]

    for dialect in dialects:
        for file_name in files:
            file_path = os.path.join(base_path, dialect, file_name)
            dataset = dataset_name(file_name)

            for mode, column_name in modes.items():
                output_dir = os.path.join(output_base, mode, dialect, dataset)

                try:
                    process_lexical_diversity(file_path, column_name, output_dir)
                except Exception as e:
                    print(f"Error processing {file_path} for {mode}: {e}")

# Run the lexical diversity pass when this cell/file is executed as the main module.
if __name__ == "__main__":
    main()