In [None]:
!pip install openai==0.28
!pip install pandas
!pip install google-generativeai

In [None]:
# Mount Google Drive at /content/drive; the dataset and result paths used by the
# later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Cell 1: Perform Translation Comparisons for GPT-4o, GPT-4o Mini, GPT-4 Turbo, Gemini 1.5 Flash, and Gemini 1.5 Pro

import pandas as pd
import openai
import os
import time
import re
import google.generativeai as genai

# Set up your API keys
# NOTE(review): both keys are blank placeholders and must be filled in before this
# cell will work; avoid committing real keys with the notebook.
openai.api_key = ''  # OpenAI API key for GPT models
GOOGLE_API_KEY = ''  # Google API key for Gemini models
genai.configure(api_key=GOOGLE_API_KEY)  # hand the Google key to the Gemini client

# Function to compare translations using a specified GPT model
def compare_translations_gpt(model_name, text1, text2):
    """Ask an OpenAI chat model which of two translations better reflects AAVE.

    This function makes a single attempt; retrying is left to the caller.

    Args:
        model_name: OpenAI chat model identifier passed as ``model``.
        text1: Text shown to the model as "Translation 1".
        text2: Text shown to the model as "Translation 2".

    Returns:
        One of 'Translation 1', 'Translation 2' or 'About the same' when the
        reply contains one of those labels. Returns "Comparison failed" when
        the API call raises or the reply contains none of the labels.
    """
    try:
        response = openai.ChatCompletion.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are an expert in AAVE (African American Vernacular English). Compare the following two translations and indicate which one is more accurate or if they are about the same."},
                {"role": "user", "content": f"Translation 1: {text1}\nTranslation 2: {text2}\nWhich translation is more accurate in reflecting AAVE? Respond with 'Translation 1', 'Translation 2', or 'About the same'. You must choose one, don't say neither. Do not provide any additional text or explanation."}
            ],
            # In openai 0.28 `timeout` only bounds the library's internal
            # "model warming up" retry loop; `request_timeout` is what limits
            # the HTTP request itself, so both are set.
            timeout=10,
            request_timeout=10,
        )
        reply = response['choices'][0]['message']['content'].strip()
    except openai.error.OpenAIError as e:
        print(f"API error during comparison: {e}")
        return "Comparison failed"
    except Exception as e:
        print(f"Unexpected error: {e}")
        return "Comparison failed"

    # Normalise the reply to a bare label (same rule as the Gemini helper) so
    # that answers such as "Translation 1." are not miscounted downstream.
    match = re.search(r'(Translation 1|Translation 2|About the same)', reply)
    if match:
        return match.group()
    print(f"Unexpected response for translations: '{reply}'")
    return "Comparison failed"

# Function to compare translations using Gemini models
def compare_translations_gemini(model_name, text1, text2):
    """Ask a Gemini model which of two translations better reflects AAVE.

    Args:
        model_name: Name handed to ``genai.GenerativeModel``.
        text1: Text shown to the model as "Translation 1".
        text2: Text shown to the model as "Translation 2".

    Returns:
        The first of 'Translation 1', 'Translation 2' or 'About the same'
        found in the reply, or "Comparison failed" when none is found or the
        request raises.
    """
    gemini = genai.GenerativeModel(model_name)

    # The prompt is three pieces joined with no separator: instructions, the
    # two candidate texts, then the question.
    prompt = "".join([
        "You are an expert in AAVE (African American Vernacular English). Compare the following two translations and indicate which one is more accurate or if they are about the same.",
        f"\n\nTranslation 1: {text1}\nTranslation 2: {text2}\n",
        "Which translation is more accurate in reflecting AAVE? Respond with 'Translation 1', 'Translation 2', or 'About the same'. You must choose one, don't say neither. Do not provide any additional text or explanation.",
    ])

    try:
        reply = gemini.generate_content(prompt)
        # Text of the first part of the first candidate
        answer = reply.candidates[0].content.parts[0].text.strip()

        # Keep only the first recognised label, ignoring any extra wording
        found = re.search(r'(Translation 1|Translation 2|About the same)', answer)
        if not found:
            print(f"Unexpected response for translations: '{answer}'")
            return "Comparison failed"
        return found.group()
    except Exception as e:
        print(f"Error processing translations: {e}")
        return "Comparison failed"

# Function to process GPT datasets and save comparison results
def process_gpt_dataset(model_name, main_directory, dataset_name, translation_files):
    """Compare every translation pair in a dataset with a GPT model and save CSVs.

    For each input file, every row is sent to ``compare_translations_gpt`` (up
    to three attempts per row) and the answers are written next to the original
    columns in ``<main_directory>/<dataset_name>/<dataset_name>_comparison_results_<i>.csv``,
    where ``i`` is the position of the file in ``translation_files``.

    Args:
        model_name: OpenAI chat model to query.
        main_directory: Root output directory for this model.
        dataset_name: Dataset label; used for the sub-directory and file names.
        translation_files: Iterable of ``(file_path, value_col, aave_col)``
            tuples. The ``aave_col`` text is sent as "Translation 1" and the
            ``value_col`` text as "Translation 2".

    Returns:
        None. Files that cannot be loaded, or that lack one of the named
        columns, are reported and skipped.
    """
    max_attempts = 3
    print(f"Processing {dataset_name} dataset with {model_name}")

    for file_index, (file_path, value_col, aave_col) in enumerate(translation_files):
        # Load translations
        try:
            translations = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading file {file_path}: {e}")
            continue

        # A wrong column name would otherwise raise KeyError on the first row
        # and abort the whole run; skip the file like any other load problem.
        missing_columns = [col for col in (value_col, aave_col) if col not in translations.columns]
        if missing_columns:
            print(f"Error loading file {file_path}: missing columns {missing_columns}")
            continue

        # Compare translations with retries
        comparison_results = []
        for index, row in translations.iterrows():
            result = "Comparison failed"
            for attempt in range(max_attempts):
                result = compare_translations_gpt(model_name, row[aave_col], row[value_col])
                if result != "Comparison failed":
                    break
                if attempt < max_attempts - 1:
                    time.sleep(1)  # Wait a second before retrying

            if result == "Comparison failed":
                print(f"Failed to process row {index + 1} after 3 attempts")
            else:
                translation_result = result.replace('Translation 1', 'AAVE').replace('Translation 2', 'VALUE')
                print(f"Processed row {index + 1}: {translation_result}")
            # Exactly one entry per row, so results stay aligned with the input
            comparison_results.append(result)

        # Work on an explicit copy so adding the column never writes through a
        # slice of the loaded frame (avoids pandas' SettingWithCopyWarning).
        results_frame = translations.copy()
        results_frame[f"{value_col} Comparison Result"] = comparison_results

        # Ensure the main directory and dataset subdirectory exist
        dataset_directory = os.path.join(main_directory, dataset_name)
        os.makedirs(dataset_directory, exist_ok=True)

        # Save the results to a separate CSV file for each translation file
        output_file_name = f'{dataset_name}_comparison_results_{file_index}.csv'
        output_file_path = os.path.join(dataset_directory, output_file_name)
        results_frame.to_csv(output_file_path, index=False)
        print(f"{dataset_name} results for file {file_index + 1} saved to CSV file at {output_file_path}")

# Function to process Gemini datasets and save comparison results
def process_gemini_dataset(model_name, main_directory, dataset_name, translation_files):
    """Compare every translation pair in a dataset with a Gemini model and save CSVs.

    For each input file, every row is sent once to
    ``compare_translations_gemini`` and the answers are written next to the
    original columns in
    ``<main_directory>/<dataset_name>/<dataset_name>_comparison_results_<i>.csv``,
    where ``i`` is the position of the file in ``translation_files``.

    Args:
        model_name: Gemini model to query.
        main_directory: Root output directory for this model.
        dataset_name: Dataset label; used for the sub-directory and file names.
        translation_files: Iterable of ``(file_path, value_col, aave_col)``
            tuples. The ``aave_col`` text is sent as "Translation 1" and the
            ``value_col`` text as "Translation 2".

    Returns:
        None. Files that cannot be loaded, or that lack one of the named
        columns, are reported and skipped.
    """
    print(f"Processing {dataset_name} dataset with {model_name}")

    for file_index, (file_path, value_col, aave_col) in enumerate(translation_files):
        # Load translations
        try:
            translations = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading file {file_path}: {e}")
            continue

        # A wrong column name would otherwise raise KeyError on the first row
        # and abort the whole run; skip the file like any other load problem.
        missing_columns = [col for col in (value_col, aave_col) if col not in translations.columns]
        if missing_columns:
            print(f"Error loading file {file_path}: missing columns {missing_columns}")
            continue

        # Compare translations (single attempt per row)
        comparison_results = []
        for index, row in translations.iterrows():
            result = compare_translations_gemini(model_name, row[aave_col], row[value_col])
            if result != "Comparison failed":
                translation_result = result.replace('Translation 1', 'AAVE').replace('Translation 2', 'VALUE')
                print(f"Processed row {index + 1}: {translation_result}")
            # Failures are recorded too, so results stay aligned with the input rows
            comparison_results.append(result)

        # Work on an explicit copy so adding the column never writes through a
        # slice of the loaded frame (avoids pandas' SettingWithCopyWarning).
        results_frame = translations.copy()
        results_frame[f"{value_col} Comparison Result"] = comparison_results

        # Ensure the main directory and dataset subdirectory exist
        dataset_directory = os.path.join(main_directory, dataset_name)
        os.makedirs(dataset_directory, exist_ok=True)

        # Save the results to a separate CSV file for each translation file
        output_file_name = f'{dataset_name}_comparison_results_{file_index}.csv'
        output_file_path = os.path.join(dataset_directory, output_file_name)
        results_frame.to_csv(output_file_path, index=False)
        print(f"{dataset_name} results for file {file_index + 1} saved to CSV file at {output_file_path}")

# Dataset information
# Maps each dataset name to the CSV files to compare. Every entry in
# 'translation_files' is a (file_path, value_col, aave_col) tuple, unpacked in
# that order by the process_*_dataset functions; the position of a tuple in the
# list becomes the index in the output file name.
datasets = {
    'BoolQ': {
        'translation_files': [
            ('/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/BoolQ/aligned_passages.csv', 'VALUE Passage', 'Translated Passage'),
            ('/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/BoolQ/aligned_questions.csv', 'VALUE Question', 'Translated Question')
        ]
    },
    'COPA': {
        'translation_files': [
            ('/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/COPA/aligned_choices.csv', 'VALUE Choice 1', 'AAVE Choice 1'),
            ('/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/COPA/aligned_choices_2.csv', 'VALUE Choice 2', 'AAVE Choice 2'),
            ('/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/COPA/aligned_premises.csv', 'VALUE Premise', 'Translated Premise')
        ]
    },
    'MultiRC': {
        'translation_files': [
            ('/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/MultiRC/aligned_paragraphs.csv', 'VALUE Paragraph', 'AAVE Paragraph'),
            ('/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/MultiRC/aligned_questions.csv', 'VALUE Question', 'AAVE Question')
        ]
    },
    'SST-2': {
        'translation_files': [
            ('/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/SST-2/aligned_sentences.csv', 'VALUE Sentence', 'AAVE Sentence')
        ]
    },
    'WSC': {
        'translation_files': [
            ('/content/drive/MyDrive/Algoverse/New Results/VALUE Translations/WSC/aligned_paragraphs.csv', 'VALUE Paragraph', 'AAVE Paragraph')
        ]
    }
}

# Run every model over every dataset: all GPT models first, then all Gemini models
gpt_models = ['gpt-4o', 'gpt-4o-mini', 'gpt-4-turbo']
gemini_models = ['gemini-1.5-flash', 'gemini-1.5-pro']

# Each model family is paired with the function that knows how to query it.
for process_dataset, model_names in (
    (process_gpt_dataset, gpt_models),
    (process_gemini_dataset, gemini_models),
):
    for model_name in model_names:
        # Results for each model go into their own directory
        main_directory = f'/content/drive/MyDrive/Algoverse/New Results/Comparison Scores/{model_name}'
        for dataset_name, info in datasets.items():
            process_dataset(model_name, main_directory, dataset_name, info['translation_files'])

print("All dataset comparisons completed for all models.")

In [None]:
import pandas as pd
import os

# Define a function to calculate percentages
def calculate_percentages(comparison_results):
    """Return the share of AAVE, VALUE and "about the same" verdicts.

    'Translation 1' is the AAVE translation and 'Translation 2' is the VALUE
    translation. Rows recorded as "Comparison failed" carry no verdict, so they
    are left out of both the counts and the denominator instead of being
    reported as "About the same". Any other remaining label counts as
    "About the same".

    Args:
        comparison_results: Iterable of verdict labels; surrounding whitespace
            is ignored and non-string values are converted with ``str``.

    Returns:
        Tuple ``(aave_percentage, value_percentage, about_same_percentage)``,
        each in the range 0-100. All three are 0 when there is no scored row.
    """
    labels = [str(result).strip() for result in comparison_results]
    scored = [label for label in labels if label != "Comparison failed"]

    total = len(scored)
    if total == 0:
        return 0, 0, 0

    aave_count = scored.count('Translation 1')
    value_count = scored.count('Translation 2')
    about_same_count = total - (aave_count + value_count)

    aave_percentage = (aave_count / total) * 100
    value_percentage = (value_count / total) * 100
    about_same_percentage = (about_same_count / total) * 100

    return aave_percentage, value_percentage, about_same_percentage

# Define a function to process the results and save percentages to a text file
def process_results_and_save_txt(model_name, dataset_name, file_paths):
    """Summarise comparison CSVs for one model/dataset pair into a text report.

    The last column of each CSV is taken to hold the comparison verdicts. Its
    non-null values are passed to ``calculate_percentages`` and the resulting
    percentages are written to ``<dataset_name>_percentages.txt`` inside the
    model's dataset directory on Drive.

    Args:
        model_name: Model whose results are summarised (part of the output path).
        dataset_name: Dataset label (part of the output path and file name).
        file_paths: Comparison-result CSV paths to summarise.

    Returns:
        None. Files that fail to process are reported and left out of the report.
    """
    summaries = []

    # One summary row per readable comparison file
    for csv_path in file_paths:
        try:
            frame = pd.read_csv(csv_path)
            # Assume the comparison results are in the last column
            result_column = frame.columns[-1]
            verdicts = frame[result_column].dropna().tolist()
            aave_pct, value_pct, same_pct = calculate_percentages(verdicts)
            summaries.append((result_column, aave_pct, value_pct, same_pct))
        except Exception as e:
            print(f"Error processing file {csv_path}: {e}")

    # Where the report goes
    main_directory = f'/content/drive/MyDrive/Algoverse/New Results/Comparison Scores/{model_name}/{dataset_name}'
    os.makedirs(main_directory, exist_ok=True)
    output_file_path = os.path.join(main_directory, f'{dataset_name}_percentages.txt')

    # One four-line section per summarised column, separated by a blank line
    with open(output_file_path, 'w') as file:
        for result_column, aave_pct, value_pct, same_pct in summaries:
            section = (
                f"{result_column} Comparison Scores:\n"
                f"AAVE (Translation 1): {aave_pct:.2f}%\n"
                f"VALUE (Translation 2): {value_pct:.2f}%\n"
                f"About the same: {same_pct:.2f}%\n\n"
            )
            file.write(section)

    print(f"Percentages saved to text file at {output_file_path}")

# Paths for the comparison results for each model and dataset
# Result files are named '<dataset>_comparison_results_<index>.csv', one per
# translation file of the dataset. BoolQ and MultiRC each have two translation
# files (index 0 and 1), so both must be listed or their question-level results
# are never summarised.
_COMPARISON_SCORES_ROOT = '/content/drive/MyDrive/Algoverse/New Results/Comparison Scores'
_COMPARISON_MODELS = ['gpt-4o', 'gpt-4o-mini', 'gpt-4-turbo', 'gemini-1.5-flash', 'gemini-1.5-pro']
# Number of comparison-result CSVs written per dataset
_RESULT_FILE_COUNTS = {'BoolQ': 2, 'COPA': 3, 'MultiRC': 2, 'SST-2': 1, 'WSC': 1}

# Shape: {model_name: {dataset_name: [csv_path, ...]}}
comparison_paths = {
    model: {
        dataset: [
            f'{_COMPARISON_SCORES_ROOT}/{model}/{dataset}/{dataset}_comparison_results_{index}.csv'
            for index in range(file_count)
        ]
        for dataset, file_count in _RESULT_FILE_COUNTS.items()
    }
    for model in _COMPARISON_MODELS
}

# Process and save percentages for each model and dataset
# The per-model mapping is deliberately not named `datasets`: that name is the
# module-level dataset table used by the comparison loops, and rebinding it here
# would break those loops if they were run again in the same session.
for model_name, dataset_paths in comparison_paths.items():
    for dataset_name, file_paths in dataset_paths.items():
        process_results_and_save_txt(model_name, dataset_name, file_paths)

print("Percentage calculations completed for all models and datasets.")