In [1]:
!pip install pandas numpy sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [4]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer, util
from collections import Counter

# Evaluation functions
def lcs(X, Y):
    """Return the length of the longest common subsequence of two sequences.

    Elements are compared with ``==``; the sequences may be strings, lists of
    tokens, or any other sized iterables.

    Args:
        X: First sequence.
        Y: Second sequence.

    Returns:
        int: Number of elements in the longest subsequence shared by X and Y
        (0 when either sequence is empty).
    """
    width = len(Y)
    # previous_row[j] holds the LCS length of the X-prefix processed so far
    # against the first j elements of Y.
    previous_row = [0] * (width + 1)
    for x_item in X:
        current_row = [0]
        for col, y_item in enumerate(Y, start=1):
            if x_item == y_item:
                # Matching elements extend the diagonal solution by one.
                current_row.append(previous_row[col - 1] + 1)
            else:
                # Otherwise keep the better of "drop x_item" / "drop y_item".
                current_row.append(max(previous_row[col], current_row[col - 1]))
        previous_row = current_row
    return previous_row[width]

def compute_rouge_l(candidate, reference):
    """Compute an LCS-based (ROUGE-L style) F1 score between two texts.

    Both inputs are converted to strings (missing values become the empty
    string) and split on whitespace; the comparison is case-sensitive.

    Args:
        candidate: Generated text (any value; NaN/None is treated as "").
        reference: Reference text (any value; NaN/None is treated as "").

    Returns:
        The harmonic mean of LCS precision (LCS length / candidate tokens) and
        LCS recall (LCS length / reference tokens), or 0 when the two texts
        share no tokens in order.
    """
    hyp_tokens = ("" if pd.isna(candidate) else str(candidate)).split()
    ref_tokens = ("" if pd.isna(reference) else str(reference)).split()

    common = lcs(hyp_tokens, ref_tokens)
    if common == 0:
        # Covers empty inputs as well: precision and recall are both zero.
        return 0

    precision = common / len(hyp_tokens)
    recall = common / len(ref_tokens)
    return (2 * precision * recall) / (precision + recall)

def compute_sts_score(text1, text2, model, device):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text (any value; NaN/None is treated as "").
        text2: Second text (any value; NaN/None is treated as "").
        model: Object exposing ``encode(text, convert_to_tensor=..., device=...)``;
            ``process_csv`` passes a ``SentenceTransformer`` instance.
        device: Device identifier forwarded to ``model.encode``.

    Returns:
        The similarity as a Python scalar (``.item()`` of the 1x1 result of
        ``util.pytorch_cos_sim``).
    """
    left = "" if pd.isna(text1) else str(text1)
    right = "" if pd.isna(text2) else str(text2)

    # Each text is encoded with its own call, first text first.
    left_vec, right_vec = (
        model.encode(sentence, convert_to_tensor=True, device=device)
        for sentence in (left, right)
    )
    return util.pytorch_cos_sim(left_vec, right_vec).item()

def compute_accuracy(text1, text2):
    """Exact-match score between two values after string conversion.

    Missing values (NaN/None) are treated as the empty string, so two missing
    values count as a match. The comparison is case- and whitespace-sensitive.

    Args:
        text1: First value.
        text2: Second value.

    Returns:
        int: 1 if the string forms are identical, otherwise 0.
    """
    left = "" if pd.isna(text1) else str(text1)
    right = "" if pd.isna(text2) else str(text2)
    return int(left == right)

def compute_soft_f1(candidate, reference):
    """Compute a bag-of-words F1 score between two texts.

    Both inputs are converted to strings (missing values become the empty
    string) and split on whitespace. Token order is ignored; repeated tokens
    are matched at most as often as they occur in both texts.

    Args:
        candidate: Generated text (any value; NaN/None is treated as "").
        reference: Reference text (any value; NaN/None is treated as "").

    Returns:
        The harmonic mean of token precision and token recall, or 0 when the
        texts share no tokens.
    """
    cand_tokens = ("" if pd.isna(candidate) else str(candidate)).split()
    ref_tokens = ("" if pd.isna(reference) else str(reference)).split()

    # Multiset intersection keeps, per token, the smaller of the two counts.
    shared = Counter(cand_tokens) & Counter(ref_tokens)
    overlap = sum(shared.values())
    if overlap == 0:
        # Covers empty inputs as well: precision and recall are both zero.
        return 0

    precision = overlap / len(cand_tokens)
    recall = overlap / len(ref_tokens)
    return (2 * precision * recall) / (precision + recall)

# Main processing function
def process_csv(input_file, output_file, first_data_row=2):
    """Score LLM responses against golden answers and save the result as CSV.

    The input CSV must contain the columns ``'LLM Response'`` and
    ``'Golden Answer:'``. Rows are grouped into fixed task ranges, each scored
    with its own set of metrics. Scores are written to four new columns
    (``'Rouge-L'``, ``'STS'``, ``'Accuracy'``, ``'Soft-F1'``); a column stays
    NaN for rows whose task does not use that metric.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path the scored CSV is written to (index not included).
        first_data_row: Row number, in the numbering used by the task ranges,
            of the first data row in the file. The ranges run from 2 to 1001,
            i.e. spreadsheet row numbers where row 1 is the header, so the
            default is 2. Pass 1 to get the previous mapping, which shifted
            every range down by one row (the first data row was never scored
            and each task included the first row of the next task).

    Raises:
        KeyError: If a required column is missing from the input file.
    """
    response_col = 'LLM Response'
    golden_col = 'Golden Answer:'

    # Load the CSV file
    df = pd.read_csv(input_file)

    # Fail fast, before the (large) model is loaded, if the file is unusable.
    missing = [col for col in (response_col, golden_col) if col not in df.columns]
    if missing:
        raise KeyError(f"Input file is missing required column(s): {missing}")

    # Detect device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Initialize SentenceTransformer model on the detected device
    model = SentenceTransformer('stsb-roberta-large', device=device)

    # Task ranges as (first row, last row, metrics); both row bounds inclusive.
    tasks = [
        (2, 51, ['Rouge-L', 'STS']),
        (52, 101, ['Accuracy']),
        (102, 151, ['Rouge-L', 'STS']),
        (152, 201, ['Rouge-L', 'STS']),
        (202, 251, ['Rouge-L', 'STS']),
        (252, 301, ['Accuracy']),
        (302, 351, ['Rouge-L', 'STS']),
        (352, 401, ['Soft-F1']),
        (402, 451, ['Rouge-L', 'STS']),
        (452, 501, ['Accuracy']),
        (502, 551, ['Rouge-L', 'STS']),
        (552, 601, ['Soft-F1']),
        (602, 651, ['Rouge-L', 'STS']),
        (652, 701, ['Rouge-L', 'STS']),
        (702, 751, ['Rouge-L', 'STS']),
        (752, 801, ['Rouge-L', 'STS']),
        (802, 851, ['Rouge-L', 'STS']),
        (852, 901, ['Accuracy']),
        (902, 951, ['Accuracy']),
        (952, 1001, ['Rouge-L', 'STS'])
    ]

    # Metric name -> scoring function taking (candidate, reference).
    scorers = {
        'Rouge-L': compute_rouge_l,
        'STS': lambda candidate, reference: compute_sts_score(candidate, reference, model, device),
        'Accuracy': compute_accuracy,
        'Soft-F1': compute_soft_f1,
    }

    # Initialize score columns
    for eval_type in scorers:
        df[eval_type] = np.nan

    n_rows = len(df)

    # Process each task range
    for task_index, (start_row, end_row, evaluators) in enumerate(tasks, start=1):
        # Convert inclusive row numbers to a half-open positional slice,
        # clamped to the rows that actually exist in the file.
        start_pos = max(start_row - first_data_row, 0)
        stop_pos = min(end_row - first_data_row + 1, n_rows)

        if start_pos >= stop_pos:
            print(f"Task {task_index} skipped: Rows {start_row} to {end_row} are not present in the input file")
            continue

        expected_rows = end_row - start_row + 1
        if stop_pos - start_pos < expected_rows:
            print(f"Warning: Task {task_index} covers only {stop_pos - start_pos} of the {expected_rows} expected rows")

        # The same positional selection is used for reading and writing so the
        # scores always land on the rows they were computed from.
        row_labels = df.index[start_pos:stop_pos]
        candidates = df[response_col].iloc[start_pos:stop_pos]
        references = df[golden_col].iloc[start_pos:stop_pos]

        for eval_type in evaluators:
            scorer = scorers[eval_type]
            scores = [scorer(cand, ref) for cand, ref in zip(candidates, references)]
            df.loc[row_labels, eval_type] = np.asarray(scores, dtype=float)

        print(f"Task {task_index} completed: Rows {start_row} to {end_row} evaluated with {evaluators}")

    # Save the results to a new CSV file
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Usage: score one response file and write the scored copy next to it.
# NOTE(review): the paths look like a Colab Google Drive mount - confirm the
# drive is mounted before running this cell.
input_file = '/content/drive/MyDrive/GT/CS 7650 NLP/Deepseek Good Responses.csv'  # Update with your input file
output_file = '/content/drive/MyDrive/GT/CS 7650 NLP/Deepseek Good Responses Metric Evals.csv'  # process_csv writes the scored CSV to this path
process_csv(input_file, output_file)

Using device: cuda
Task 1 completed: Rows 2 to 51 evaluated with ['Rouge-L', 'STS']
Task 2 completed: Rows 52 to 101 evaluated with ['Accuracy']
Task 3 completed: Rows 102 to 151 evaluated with ['Rouge-L', 'STS']
Task 4 completed: Rows 152 to 201 evaluated with ['Rouge-L', 'STS']
Task 5 completed: Rows 202 to 251 evaluated with ['Rouge-L', 'STS']
Task 6 completed: Rows 252 to 301 evaluated with ['Accuracy']
Task 7 completed: Rows 302 to 351 evaluated with ['Rouge-L', 'STS']
Task 8 completed: Rows 352 to 401 evaluated with ['Soft-F1']
Task 9 completed: Rows 402 to 451 evaluated with ['Rouge-L', 'STS']
Task 10 completed: Rows 452 to 501 evaluated with ['Accuracy']
Task 11 completed: Rows 502 to 551 evaluated with ['Rouge-L', 'STS']
Task 12 completed: Rows 552 to 601 evaluated with ['Soft-F1']
Task 13 completed: Rows 602 to 651 evaluated with ['Rouge-L', 'STS']
Task 14 completed: Rows 652 to 701 evaluated with ['Rouge-L', 'STS']
Task 15 completed: Rows 702 to 751 evaluated with ['Rouge-L'