In [None]:
# Install NLTK, which provides the BLEU implementation imported further down.
!pip install nltk



In [None]:
# Mount Google Drive at /content/drive (Colab-only); the dataset paths used
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import os

# Function to calculate BLEU score for a pair of sentences
def calculate_bleu(reference, hypothesis):
    """Return the smoothed sentence-level BLEU score of *hypothesis* against *reference*.

    Both texts are tokenized on whitespace and scored with NLTK's
    ``sentence_bleu`` using ``SmoothingFunction().method1``.

    Args:
        reference: Reference text. Non-string values (for example a number
            parsed from a CSV cell) are converted with ``str`` before
            tokenizing.
        hypothesis: Candidate text to score; converted the same way.

    Returns:
        The BLEU score, or 0.0 when either input is missing (``None``/NaN)
        or contains no tokens.
    """
    # Missing cells cannot be compared; score them as a complete mismatch.
    if pd.isna(reference) or pd.isna(hypothesis):
        return 0.0

    # str() guards against non-string cells, which have no .split() method.
    reference_tokens = str(reference).split()
    hypothesis_tokens = str(hypothesis).split()

    # With no tokens on one side there is nothing to match, so skip the
    # scorer and report zero directly.
    if not reference_tokens or not hypothesis_tokens:
        return 0.0

    # Smoothing handles the case where an n-gram precision is zero.
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=SmoothingFunction().method1,
    )

# Define a function to process each dataset and calculate BLEU scores
def process_and_calculate_bleu(file_path, columns, new_columns, output_directory, output_file_name):
    """Score one translated dataset with BLEU and write the results to disk.

    Reads the CSV at ``file_path``, computes a per-row BLEU score for every
    (reference column, hypothesis column) pair, stores each list of scores
    under the corresponding name in ``new_columns``, saves the augmented
    table as CSV, and finally writes the threshold summary through
    ``calculate_and_save_percentages``.

    Args:
        file_path: Path of the input CSV.
        columns: Sequence of ``(reference_column, hypothesis_column)`` pairs.
        new_columns: Score column names; ``new_columns[i]`` receives the
            scores for ``columns[i]``.
        output_directory: Directory for the outputs; created if missing.
        output_file_name: File name of the scored CSV inside
            ``output_directory``.
    """
    df = pd.read_csv(file_path)

    for i, (ref_col, hyp_col) in enumerate(columns):
        # Walk the two columns in lockstep instead of using iterrows(),
        # which builds a whole Series per row just to read two cells.
        df[new_columns[i]] = [
            calculate_bleu(reference_text, hypothesis_text)
            for reference_text, hypothesis_text in zip(df[ref_col], df[hyp_col])
        ]

    # Ensure the output directory exists before writing anything into it.
    os.makedirs(output_directory, exist_ok=True)

    output_file_path = os.path.join(output_directory, output_file_name)
    df.to_csv(output_file_path, index=False)
    print(f"BLEU scores calculated and saved to {output_file_path}")

    # Summarize how many rows fall below each BLEU threshold.
    calculate_and_save_percentages(df, new_columns, output_directory)

def calculate_and_save_percentages(df, new_columns, output_directory):
    """Write the share of rows scoring below each BLEU threshold to a text file.

    For every column named in ``new_columns`` the percentage of rows whose
    value is strictly below 0.7, 0.5 and 0.3 is computed and written to
    ``percentages.txt`` inside ``output_directory``.

    Args:
        df: DataFrame holding the score columns.
        new_columns: Names of the score columns to summarize.
        output_directory: Directory for ``percentages.txt``; created if it
            does not exist so the function can be called on its own.
    """
    thresholds = [0.7, 0.5, 0.3]
    total_count = len(df)

    percentage_results = {}
    for col in new_columns:
        scores = df[col]
        column_results = {}
        for threshold in thresholds:
            if total_count:
                count_below_threshold = (scores < threshold).sum()
                column_results[threshold] = (count_below_threshold / total_count) * 100
            else:
                # An empty table has no rows below any threshold; report 0%
                # instead of dividing by zero and writing "nan%".
                column_results[threshold] = 0.0
        percentage_results[col] = column_results

    os.makedirs(output_directory, exist_ok=True)
    output_txt_path = os.path.join(output_directory, 'percentages.txt')

    # Explicit encoding so column names with non-ASCII characters are written
    # the same way regardless of the platform's default locale.
    with open(output_txt_path, 'w', encoding='utf-8') as f:
        for col in new_columns:
            f.write(f"{col} BLEU Score Percentages:\n")
            for threshold in thresholds:
                percentage = percentage_results[col][threshold]
                f.write(f"  - Below {threshold:.1f}: {percentage:.2f}%\n")
            f.write("\n")

    print(f"Percentage scores calculated and saved to {output_txt_path}")

# Registry of translated benchmark CSVs to score. Each entry names the input
# file, the (reference column, hypothesis column) pairs to compare, the score
# column written for each pair, and where the results are saved.
_RESULTS_ROOT = '/content/drive/MyDrive/Algoverse/New Results'
_TRANSLATIONS_DIR = f'{_RESULTS_ROOT}/GPT 4o mini Translations'
_BLEU_DIR = f'{_RESULTS_ROOT}/BLEU Scores'

datasets = dict(
    BoolQ=dict(
        file_path=f'{_TRANSLATIONS_DIR}/Translated BoolQ.csv',
        columns=[('SAE Passage', 'AAVE Passage'), ('SAE Question', 'AAVE Question')],
        new_columns=['BLEU Score Passage', 'BLEU Score Question'],
        output_directory=f'{_BLEU_DIR}/BoolQ',
        output_file_name='boolq_bleu_scores.csv',
    ),
    COPA=dict(
        file_path=f'{_TRANSLATIONS_DIR}/Translated Copa.csv',
        columns=[
            ('Premise', 'Translated Premise'),
            ('Choice 1', 'Translated Choice 1'),
            ('Choice 2', 'Translated Choice 2'),
        ],
        new_columns=['BLEU Score Premise', 'BLEU Score Choice 1', 'BLEU Score Choice 2'],
        output_directory=f'{_BLEU_DIR}/COPA',
        output_file_name='copa_bleu_scores.csv',
    ),
    SST2=dict(
        file_path=f'{_TRANSLATIONS_DIR}/Translated SST-2.csv',
        columns=[('Original Sentence', 'Translated Sentence')],
        new_columns=['BLEU Score Sentence'],
        output_directory=f'{_BLEU_DIR}/SST-2',
        output_file_name='sst2_bleu_scores.csv',
    ),
    WSC=dict(
        file_path=f'{_TRANSLATIONS_DIR}/Translated WSC.csv',
        columns=[('Original Paragraph', 'AAVE Paragraph')],
        new_columns=['BLEU Score Paragraph'],
        output_directory=f'{_BLEU_DIR}/WSC',
        output_file_name='wsc_bleu_scores.csv',
    ),
    MultiRC=dict(
        file_path=f'{_TRANSLATIONS_DIR}/Translated MultiRC.csv',
        columns=[
            ('Paragraph', 'Translated Paragraph'),
            ('Question', 'Translated Question'),
            ('Answer Choice', 'Translated Answer Choice'),
        ],
        new_columns=['BLEU Score Paragraph', 'BLEU Score Question', 'BLEU Score Answer Choice'],
        output_directory=f'{_BLEU_DIR}/MultiRC',
        output_file_name='multirc_bleu_scores.csv',
    ),
)

# Process each dataset and calculate BLEU scores and percentages
for dataset_name in datasets:
    info = datasets[dataset_name]
    # Pass each registry field to the pipeline by its parameter name.
    process_and_calculate_bleu(
        file_path=info['file_path'],
        columns=info['columns'],
        new_columns=info['new_columns'],
        output_directory=info['output_directory'],
        output_file_name=info['output_file_name'],
    )

print("BLEU score calculations and percentage calculations completed for all datasets.")

BLEU scores calculated and saved to /content/drive/MyDrive/Algoverse/New Results/BLEU Scores/BoolQ/boolq_bleu_scores.csv
Percentage scores calculated and saved to /content/drive/MyDrive/Algoverse/New Results/BLEU Scores/BoolQ/percentages.txt
BLEU scores calculated and saved to /content/drive/MyDrive/Algoverse/New Results/BLEU Scores/COPA/copa_bleu_scores.csv
Percentage scores calculated and saved to /content/drive/MyDrive/Algoverse/New Results/BLEU Scores/COPA/percentages.txt
BLEU scores calculated and saved to /content/drive/MyDrive/Algoverse/New Results/BLEU Scores/SST-2/sst2_bleu_scores.csv
Percentage scores calculated and saved to /content/drive/MyDrive/Algoverse/New Results/BLEU Scores/SST-2/percentages.txt
BLEU scores calculated and saved to /content/drive/MyDrive/Algoverse/New Results/BLEU Scores/WSC/wsc_bleu_scores.csv
Percentage scores calculated and saved to /content/drive/MyDrive/Algoverse/New Results/BLEU Scores/WSC/percentages.txt
BLEU scores calculated and saved to /conte