In [1]:
!pip install nltk



In [2]:
# Mount Google Drive at /content/drive; every dataset path used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import os

# Function to calculate BLEU score for a pair of sentences
def calculate_bleu(reference, hypothesis) -> float:
    """Return the smoothed sentence-level BLEU score of ``hypothesis`` against ``reference``.

    Both texts are tokenised by splitting on whitespace and scored with
    NLTK's ``sentence_bleu`` using ``SmoothingFunction().method1``.

    Args:
        reference: The reference text. Non-string values are converted with
            ``str()`` before tokenising.
        hypothesis: The text being scored. Non-string values are converted
            with ``str()`` before tokenising.

    Returns:
        The BLEU score as a float. ``0.0`` is returned when either value is
        missing (``pd.isna``) or contains no tokens.
    """
    if pd.isna(reference) or pd.isna(hypothesis):
        return 0.0

    # A CSV cell is not guaranteed to be a string (a column holding only
    # numbers is parsed as int/float), so coerce before calling .split().
    reference_tokens = str(reference).split()
    hypothesis_tokens = str(hypothesis).split()

    # Nothing to compare: score 0.0 without calling into NLTK.
    if not reference_tokens or not hypothesis_tokens:
        return 0.0

    smoothing_function = SmoothingFunction().method1
    score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function)
    # Normalise to float so every branch returns the same type.
    return float(score)

# Function to process CSV datasets
def process_csv(file_path, columns, new_columns, output_directory, output_file_name, filtered_file_name, bleu_threshold=0.7):
    """Score text column pairs of a CSV with BLEU and write full and filtered copies.

    For every ``(reference, hypothesis)`` column pair, a new column holding the
    per-row result of ``calculate_bleu`` is added. The augmented table is saved
    to ``output_directory/output_file_name``; the rows where at least one of the
    new score columns is below ``bleu_threshold`` are saved to
    ``output_directory/filtered_file_name``.

    Args:
        file_path: Path of the CSV file to read.
        columns: Sequence of ``(reference_column, hypothesis_column)`` pairs.
        new_columns: Names of the score columns to create, one per pair and in
            the same order as ``columns``.
        output_directory: Directory for both output files (created if missing).
        output_file_name: File name of the full scored CSV.
        filtered_file_name: File name of the filtered CSV.
        bleu_threshold: Rows with any score strictly below this value go into
            the filtered file. Defaults to 0.7.

    Raises:
        ValueError: If ``new_columns`` has fewer entries than ``columns``.
        KeyError: If a column named in ``columns`` is not present in the CSV.
    """
    # Fail before any work is done rather than after scoring a whole pair.
    if len(new_columns) < len(columns):
        raise ValueError(
            f"Expected a score column name for each of the {len(columns)} column pairs, "
            f"got {len(new_columns)}"
        )

    # Load dataset
    df = pd.read_csv(file_path)

    # Validate column names
    for ref_col, hyp_col in columns:
        if ref_col not in df.columns or hyp_col not in df.columns:
            raise KeyError(f"Missing columns: {ref_col} or {hyp_col} in {file_path}")

    # Calculate BLEU scores; walking the two columns in lockstep avoids
    # building a Series object for every row as iterrows() does.
    for (ref_col, hyp_col), score_col in zip(columns, new_columns):
        df[score_col] = [
            calculate_bleu(reference_text, hypothesis_text)
            for reference_text, hypothesis_text in zip(df[ref_col], df[hyp_col])
        ]

    # Save full dataset
    os.makedirs(output_directory, exist_ok=True)
    full_output_path = os.path.join(output_directory, output_file_name)
    df.to_csv(full_output_path, index=False)

    # Keep rows where any score falls below the threshold. list() makes the
    # column selection work even when new_columns is passed as a tuple.
    filtered_df = df[df[list(new_columns)].lt(bleu_threshold).any(axis=1)]
    filtered_output_path = os.path.join(output_directory, filtered_file_name)
    filtered_df.to_csv(filtered_output_path, index=False)

    print(f"Processed: {file_path}")
    print(f"Full dataset saved to: {full_output_path}")
    print(f"Filtered dataset saved to: {filtered_output_path}")

# Locations of the IndE inputs and of the BLEU outputs on the mounted Drive.
base_path = '/content/drive/MyDrive/!!Multi-AAVENUE/Translated Datasets/GPT 4o/IndE'
_output_root = "/content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Datasets/IndE"

# One row per dataset: (name, number shown in parentheses in the input file
# name, source text column). The number is presumably the sample count.
_dataset_specs = [
    ("SVAMP", 700, "Original"),
    ("MBPP", 374, "Original"),
    ("HumanEVAL", 164, "Prompt"),
    ("GSM8K", 1000, "Original"),
    ("FOLIO", 1000, "Premises"),
]

# Expand each spec into the keyword arguments process_csv expects. The
# translated column is named "IndE (<source column>)" and the score column
# "BLEU Score <source column>"; output file names use the lower-cased
# dataset name.
datasets = {
    name: {
        "file_path": f"{base_path}/{name}({file_tag})_IndE.csv",
        "columns": [(source_column, f"IndE ({source_column})")],
        "new_columns": [f"BLEU Score {source_column}"],
        "output_directory": f"{_output_root}/{name}",
        "output_file_name": f"{name.lower()}_bleu_scores.csv",
        "filtered_file_name": f"{name.lower()}_filtered_bleu_scores.csv",
    }
    for name, file_tag, source_column in _dataset_specs
}

# Run every dataset through the scorer; the keys of each entry match the
# parameter names of process_csv, so the entry is unpacked directly.
for dataset_name, info in datasets.items():
    process_csv(**info)

print("Processing completed.")


Processed: /content/drive/MyDrive/!!Multi-AAVENUE/Translated Datasets/GPT 4o/IndE/SVAMP(700)_IndE.csv
Full dataset saved to: /content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Datasets/IndE/SVAMP/svamp_bleu_scores.csv
Filtered dataset saved to: /content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Datasets/IndE/SVAMP/svamp_filtered_bleu_scores.csv
Processed: /content/drive/MyDrive/!!Multi-AAVENUE/Translated Datasets/GPT 4o/IndE/MBPP(374)_IndE.csv
Full dataset saved to: /content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Datasets/IndE/MBPP/mbpp_bleu_scores.csv
Filtered dataset saved to: /content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Datasets/IndE/MBPP/mbpp_filtered_bleu_scores.csv
Processed: /content/drive/MyDrive/!!Multi-AAVENUE/Translated Datasets/GPT 4o/IndE/HumanEVAL(164)_IndE.csv
Full dataset saved to: /content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Datasets/IndE/HumanEVAL/humaneval_bleu_scores.csv
Filtered dataset saved to: /content/drive/MyDrive/!!Multi-AAVENUE/BLEU Score Datasets/