# Criteria

In [None]:
import torch
import torch.nn.functional as F
import random
import re
from transformers import AutoTokenizer, AutoModel

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import ast
import json
from collections import defaultdict

In [None]:
# Load the questions table from the Kaggle input CSV, decoding it as UTF-8.
df = pd.read_csv("/kaggle/input/questions/questions.csv", encoding="utf-8")

In [None]:
# Load the Phi-3 question dump. The encoding is given explicitly (matching the
# UTF-8 CSV read above) so decoding does not depend on the platform default.
with open('/kaggle/input/phi-3-dataset/questions.json', 'r', encoding='utf-8') as file:
    data_phi = json.load(file)

In [None]:
# Parse every element of data_phi with ast.literal_eval.
# NOTE(review): presumably each element is the string repr of a question dict
# (later cells index the results with keys such as 'correct_option') - confirm.
phi_questions = [ast.literal_eval(q) for q in data_phi]

In [None]:
# Answer keys that are excluded from the Phi analysis.
# NOTE(review): presumably because they do not name a single option A-D.
unwanted_options = ['A, B', 'E']

# Keep only the Phi questions whose answer key is not one of the labels above.
phi_questions = [
    entry
    for entry in phi_questions
    if entry['correct_option'] not in unwanted_options
]

In [None]:
# Show the loaded questions DataFrame as the cell output.
df

In [None]:
# Number of Phi questions left after the unwanted answer keys were filtered out.
len(phi_questions)

In [None]:
# One record per CSV row: folder, sheet content and the (possibly parsed) question.
questions_with_folder = []
for _, record in df.iterrows():
    raw_question = record['question']
    try:
        # Prefer a structured value when the cell holds valid JSON ...
        parsed_question = json.loads(raw_question)
    except json.JSONDecodeError:
        # ... otherwise keep the raw cell value untouched.
        parsed_question = raw_question

    questions_with_folder.append(
        {
            "folder": record['folder'],
            "content": record['content'],
            "question": parsed_question,
        }
    )

In [None]:
# Copy folder and sheet content onto each Phi question, using the question's
# 'index' field as the position of its source row in questions_with_folder.
for entry in phi_questions:
    source_row = questions_with_folder[entry['index']]
    entry['folder'] = source_row['folder']
    entry['content'] = source_row['content']

In [None]:
# Build the list of GPT question dicts, each tagged with its folder.
questions = []
for item in questions_with_folder:
    question_data = item['question']
    if isinstance(question_data, str):
        # The cell was not valid JSON, so it is still a string holding a
        # Python-literal dict; parse it here.
        question_data = ast.literal_eval(question_data)
    else:
        # Already parsed by json.loads upstream. ast.literal_eval would raise on
        # a dict, so just copy it to avoid mutating the questions_with_folder
        # entry when the folder is attached below.
        question_data = dict(question_data)
    question_data['folder'] = item['folder']
    questions.append(question_data)

In [None]:
def remove_double_bracketed_text(text):
    """Return *text* with every ``{{...}}`` span removed and outer whitespace stripped.

    Matching is non-greedy and spans newlines, so each ``{{`` is paired with
    the nearest following ``}}``.
    """
    without_placeholders = re.sub(r"\{\{.*?\}\}", "", text, flags=re.DOTALL)
    return without_placeholders.strip()

In [None]:
# Sheet contents with every {{...}} span stripped, in questions_with_folder order.
cleaned_txt_contents = [
    remove_double_bracketed_text(entry['content'])
    for entry in questions_with_folder
]

# Length

In [None]:
def calculate_option_lengths(questions: list) -> pd.DataFrame:
    """Tabulate character lengths of each question and its four options.

    Args:
        questions: Dict-like question records. Missing 'question',
            'option_a'..'option_d' or 'correct_option' keys default to ''.

    Returns:
        A DataFrame with one row per record and the columns 'question',
        'question_length', 'option_a_length' .. 'option_d_length' and
        'correct_option'.
    """
    option_keys = ('option_a', 'option_b', 'option_c', 'option_d')
    rows = []

    for entry in questions:
        text = entry.get('question', '')
        row = {'question': text, 'question_length': len(text)}
        for key in option_keys:
            row[f'{key}_length'] = len(entry.get(key, ''))
        row['correct_option'] = entry.get('correct_option', '')
        rows.append(row)

    return pd.DataFrame(rows)

In [None]:
# Length table for the `questions` list (named "gpt" here).
df_lengths_gpt = calculate_option_lengths(questions)

In [None]:
# Length table for the Phi questions.
df_lengths_phi = calculate_option_lengths(phi_questions)

In [None]:
# Side-by-side histograms of question length: GPT on the left, Phi on the right.
plt.figure(figsize=(14, 6))

question_length_panels = [
    (df_lengths_gpt, "skyblue", "Distribution of GPT Question Lengths"),
    (df_lengths_phi, "salmon", "Distribution of Phi Question Lengths"),
]

for position, (lengths_df, fill_color, heading) in enumerate(question_length_panels, start=1):
    plt.subplot(1, 2, position)
    sns.histplot(lengths_df['question_length'], bins=20, color=fill_color, edgecolor="black")
    plt.title(heading)
    plt.xlabel("Question Length")
    plt.ylabel("Frequency")

plt.tight_layout()
plt.show()

In [None]:
# Length columns to plot, one histogram per option (GPT table).
options = ['option_a_length', 'option_b_length', 'option_c_length', 'option_d_length']

plt.figure(figsize=(16, 8))

# Fill a 2x2 grid, one panel per option column.
for position, column in enumerate(options, start=1):
    panel_title = column.capitalize().replace('_', ' ')
    plt.subplot(2, 2, position)
    sns.histplot(df_lengths_gpt[column], bins=20, color="skyblue", edgecolor="black")
    plt.title(f"Distribution of {panel_title}")
    plt.xlabel("Option Length")
    plt.ylabel("Frequency")

plt.tight_layout()
plt.show()

In [None]:
# Length columns to plot, one histogram per option (Phi table).
options = ['option_a_length', 'option_b_length', 'option_c_length', 'option_d_length']

plt.figure(figsize=(16, 8))

# Fill a 2x2 grid, one panel per option column.
for position, column in enumerate(options, start=1):
    panel_title = column.capitalize().replace('_', ' ')
    plt.subplot(2, 2, position)
    sns.histplot(df_lengths_phi[column], bins=20, color="skyblue", edgecolor="black")
    plt.title(f"Distribution of {panel_title}")
    plt.xlabel("Option Length")
    plt.ylabel("Frequency")

plt.tight_layout()
plt.show()

In [None]:
# Per-question spread between the longest and the shortest option (GPT table).
df_lengths_gpt['length_diff'] = df_lengths_gpt.apply(
    lambda row: (
        max(row[f'option_{letter}_length'] for letter in 'abcd')
        - min(row[f'option_{letter}_length'] for letter in 'abcd')
    ),
    axis=1,
)

# Histogram of that spread.
plt.figure(figsize=(10, 6))
sns.histplot(df_lengths_gpt['length_diff'], bins=20, color="skyblue", edgecolor="black")
plt.title("GPT4, Distribution of Length Differences Between Longest and Shortest Options")
plt.xlabel("Length Difference (Longest - Shortest)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Per-question spread between the longest and the shortest option (Phi table).
df_lengths_phi['length_diff'] = df_lengths_phi.apply(
    lambda row: (
        max(row[f'option_{letter}_length'] for letter in 'abcd')
        - min(row[f'option_{letter}_length'] for letter in 'abcd')
    ),
    axis=1,
)

# Histogram of that spread.
plt.figure(figsize=(10, 6))
sns.histplot(df_lengths_phi['length_diff'], bins=20, color="skyblue", edgecolor="black")
plt.title("Phi3, Distribution of Length Differences Between Longest and Shortest Options")
plt.xlabel("Length Difference (Longest - Shortest)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Normalise answer keys such as "option_b" to the bare letter. Stripping the
# "option_" prefix covers all four options, not just b and c, so the
# f"option_{...}_length" lookup in the following cell cannot hit a key like
# "option_option_a_length". Bracket access is used for the column assignment.
df_lengths_gpt['correct_option'] = df_lengths_gpt['correct_option'].str.replace(
    "option_", "", regex=False
)

In [None]:
def _correct_length(row):
    """Return the length column value that belongs to the row's correct option."""
    column = f"option_{row['correct_option'].lower()}_length"
    return row[column]


# Length of the correct option for every GPT question.
df_lengths_gpt['correct_option_length'] = df_lengths_gpt.apply(_correct_length, axis=1)

In [None]:
# Drop rows whose answer key is not a single option. The explicit .copy() makes
# the result an independent frame, so the column assignments in later cells
# write to it directly rather than to a slice of the original
# (avoids SettingWithCopyWarning).
df_lengths_phi = df_lengths_phi[~df_lengths_phi['correct_option'].isin(['A, B', 'E'])].copy()

In [None]:
def _correct_length(row):
    """Return the length column value that belongs to the row's correct option."""
    column = f"option_{row['correct_option'].lower()}_length"
    return row[column]


# Length of the correct option for every Phi question.
df_lengths_phi['correct_option_length'] = df_lengths_phi.apply(_correct_length, axis=1)

In [None]:
def analyze_correct_vs_incorrect_option_length(df):
    """Compare the correct option's length with the mean length of the other three.

    Adds (or overwrites) two columns on *df* in place:
    'correct_option_length' and 'avg_incorrect_option_length'.

    Args:
        df: Frame with 'option_a_length'..'option_d_length' and a
            'correct_option' column holding the option letter (any case).

    Returns:
        A tuple ``(correlation, percentage)``: the correlation between the two
        new columns, and the percentage of rows in which the correct option is
        strictly longer than the average incorrect option.
    """
    length_columns = [
        'option_a_length',
        'option_b_length',
        'option_c_length',
        'option_d_length',
    ]

    def correct_length(row):
        return row[f"option_{row['correct_option'].lower()}_length"]

    def mean_incorrect_length(row):
        remaining = [row[column] for column in length_columns]
        # Removing by value drops one entry equal to the correct length; the
        # multiset of what remains is the same whichever equal entry goes.
        remaining.remove(row['correct_option_length'])
        return sum(remaining) / len(remaining)

    # (Re)compute the correct option length, then the distractor average.
    df['correct_option_length'] = df.apply(correct_length, axis=1)
    df['avg_incorrect_option_length'] = df.apply(mean_incorrect_length, axis=1)

    correct_lengths = df['correct_option_length']
    incorrect_means = df['avg_incorrect_option_length']

    length_correlation = correct_lengths.corr(incorrect_means)

    # Share of rows where the correct option beats the distractor average, in percent.
    correct_longer_proportion = (correct_lengths > incorrect_means).mean() * 100

    return length_correlation, correct_longer_proportion

In [None]:
# Gpt 4
# Run the correct-vs-incorrect length analysis on the GPT table and print both metrics.
length_correlation, correct_longer_proportion = analyze_correct_vs_incorrect_option_length(df_lengths_gpt)


print("Correlation between correct and average incorrect option lengths:", length_correlation)
print("Proportion of times the correct option is longer than the average incorrect options:", correct_longer_proportion, "%")

In [None]:
"Proportion of times the correct option is longer than the min/max incorrect options"

In [None]:
#phi 3 
# Run the same analysis on the Phi table; this overwrites the two result variables.
length_correlation, correct_longer_proportion = analyze_correct_vs_incorrect_option_length(df_lengths_phi)


print("Correlation between correct and average incorrect option lengths:", length_correlation)
print("Proportion of times the correct option is longer than the average incorrect options:", correct_longer_proportion, "%")

# Question check

In [None]:
def is_question(sentence):
    """Return True when *sentence* looks like a question.

    After surrounding whitespace is stripped, a sentence qualifies if it ends
    with '?' or starts (case-insensitively) with one of a fixed set of
    question words followed by a word boundary.
    """
    text = sentence.strip()
    leading_question_word = r"^(who|what|where|when|why|how|does|should|do|did|could|will|would)\b"
    return (
        text.endswith("?")
        or re.match(leading_question_word, text, re.IGNORECASE) is not None
    )

In [None]:
# Tally how many Phi question texts pass / fail the is_question heuristic.
question_like_total = sum(1 for entry in phi_questions if is_question(entry['question']))
question_count = {
    'True': question_like_total,
    'False': len(phi_questions) - question_like_total,
}

question_count

# Starts with negation

In [None]:
def starts_with_negation(sentence):
    """Return True when *sentence* begins with a negation word.

    Surrounding whitespace is ignored and matching is case-insensitive; the
    negation word must be followed by a word boundary.
    """
    leading_negation = r"^(not|no|don't|doesn't|isn't|aren't|wasn't|weren't|won't|can't|couldn't|shouldn't|wouldn't|didn't|haven't|hasn't|hadn't|mustn't)\b"
    return re.match(leading_negation, sentence.strip(), re.IGNORECASE) is not None

In [None]:
# Tally how many Phi question texts do / do not start with a negation word.
negation_count = {'True': 0, 'False': 0}

for entry in phi_questions:
    # starts_with_negation yields a bool whose str() is exactly 'True' or 'False'.
    negation_count[str(bool(starts_with_negation(entry['question'])))] += 1

negation_count

# Model

In [None]:
# Use the GPU when torch reports CUDA as available, otherwise run on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Load the tokenizer and encoder for the named checkpoint and move the encoder
# to the selected device.
model_name = "BAAI/bge-base-en-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# Relevance to lisa sheet

In [None]:
def generate_embeddings(texts, model, tokenizer, device, batch_size=8):
    """Embed *texts* in batches and return L2-normalised vectors.

    Args:
        texts: Sliceable sequence of strings.
        model: Callable that takes the tokenizer output as keyword arguments
            and returns an object exposing ``last_hidden_state``.
        tokenizer: Callable producing padded/truncated "pt" tensors
            (max_length 512) whose result supports ``.to(device)``.
        device: Device the tokenized batch is moved to.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A NumPy array with one row per text, the per-batch results
        concatenated along axis 0.
    """
    batch_vectors = []

    for start in range(0, len(texts), batch_size):
        encoded = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        ).to(device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state

        # Sequence position 0 is taken as the sentence vector (the CLS token,
        # per the original comment), then scaled to unit L2 norm.
        sentence_vectors = F.normalize(hidden_states[:, 0, :], p=2, dim=1)
        batch_vectors.append(sentence_vectors.cpu().numpy())

    return np.concatenate(batch_vectors, axis=0)

In [None]:
%%time
# Embed the cleaned sheet texts ...
lisa_embeddings = generate_embeddings(cleaned_txt_contents, model, tokenizer, device)

# ... and the question text of every entry in `questions`.
question_texts = [entry['question'] for entry in questions]
question_embeddings = generate_embeddings(question_texts, model, tokenizer, device)

In [None]:
# Embed the sheet text attached to each Phi question. The {{...}} spans are
# stripped first, exactly as for the GPT path (cleaned_txt_contents), so both
# models are scored against the same form of the source text.
lisa_texts = [remove_double_bracketed_text(q['content']) for q in phi_questions]
lisa_embeddings_phi = generate_embeddings(lisa_texts, model, tokenizer, device)

# Embed the Phi question texts (this rebinds question_texts).
question_texts = [q['question'] for q in phi_questions]
question_embeddings_phi = generate_embeddings(question_texts, model, tokenizer, device)

In [None]:
def to_tensor(embedding):
    """Return a torch tensor copy of a NumPy array; pass anything else through unchanged."""
    return torch.tensor(embedding) if isinstance(embedding, np.ndarray) else embedding

def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two tensors, reduced over dim 0, as a Python float."""
    similarity = F.cosine_similarity(embedding1, embedding2, dim=0)
    return similarity.item()

In [None]:
# Group the Phi embeddings by folder. Nothing else in this notebook defines
# lisa_folder_embeddings / question_folder_embeddings, although this cell and
# the following one read them, so they are built here from the Phi embeddings,
# mirroring how the GPT4o per-folder dictionaries are constructed.
lisa_folder_embeddings = defaultdict(list)
question_folder_embeddings = defaultdict(list)
for item, lisa_embedding, question_embedding in zip(
    phi_questions, lisa_embeddings_phi, question_embeddings_phi
):
    folder = item['folder']
    lisa_folder_embeddings[folder].append(to_tensor(lisa_embedding))
    question_folder_embeddings[folder].append(to_tensor(question_embedding))

question_folder_embeddings

In [None]:
# Only the first six folders are analysed, so they fit the 2x3 plot grid below.
selected_folders = list(question_folder_embeddings.keys())[:6]  # Define selected folders

# Per-folder list of sheet/question cosine similarities.
folder_direct_similarities = {}

for folder in selected_folders:
    lisa_embeds = lisa_folder_embeddings[folder]
    question_embeds = question_folder_embeddings[folder]

    # Pairing by position only makes sense when both lists have the same length.
    if len(lisa_embeds) != len(question_embeds):
        print(f"Folder {folder} has mismatched embedding counts.")
        continue

    folder_direct_similarities[folder] = [
        cosine_similarity(lisa, question)
        for lisa, question in zip(lisa_embeds, question_embeds)
    ]

# Plot cosine similarity distributions for selected folders
plt.figure(figsize=(14, 8))

for i, folder in enumerate(selected_folders, 1):
    # A folder reported as mismatched above has no similarities; skip its panel
    # instead of failing with a KeyError.
    if folder not in folder_direct_similarities:
        continue
    plt.subplot(2, 3, i)  # Create a 2x3 grid for subplots
    sns.histplot(folder_direct_similarities[folder], bins=20, color="skyblue", edgecolor="black")
    plt.title(f"Phi-3.5, calculate relevance in Folder: {folder}")
    plt.xlabel("Cosine Similarity")
    plt.ylabel("Frequency")

plt.tight_layout()
plt.savefig("Phi-3.5_calculate_relevance.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Group the GPT4o sheet embeddings by folder.
lisa_folder_embeddings_gpt4o = defaultdict(list)
for item, embedding in zip(questions_with_folder, lisa_embeddings):
    folder = item['folder']
    lisa_folder_embeddings_gpt4o[folder].append(to_tensor(embedding))

# Group the GPT4o question embeddings by folder.
question_folder_embeddings_gpt4o = defaultdict(list)
for item, embedding in zip(questions, question_embeddings):
    folder = item['folder']
    question_folder_embeddings_gpt4o[folder].append(to_tensor(embedding))

# Calculate cosine similarity for matching indices for the same folders as phi3.5
folder_direct_similarities_gpt4o = {}

for folder in selected_folders:  # Reuse the selected folders from phi3.5
    lisa_embeds = lisa_folder_embeddings_gpt4o[folder]
    question_embeds = question_folder_embeddings_gpt4o[folder]

    # Pairing by position only makes sense when both lists have the same length.
    if len(lisa_embeds) != len(question_embeds):
        print(f"Folder {folder} has mismatched embedding counts.")
        continue

    folder_direct_similarities_gpt4o[folder] = [
        cosine_similarity(lisa, question)
        for lisa, question in zip(lisa_embeds, question_embeds)
    ]

# Plot cosine similarity distributions for selected folders (GPT4o)
plt.figure(figsize=(14, 8))

for i, folder in enumerate(selected_folders, 1):
    # A folder reported as mismatched above has no similarities; skip its panel
    # instead of failing with a KeyError.
    if folder not in folder_direct_similarities_gpt4o:
        continue
    plt.subplot(2, 3, i)  # Create a 2x3 grid for subplots
    sns.histplot(folder_direct_similarities_gpt4o[folder], bins=20, color="lightcoral", edgecolor="black")
    plt.title(f"GPT4o, calculate relevance in Folder: {folder}")
    plt.xlabel("Cosine Similarity")
    plt.ylabel("Frequency")

plt.tight_layout()
plt.savefig("GPT4o_calculate_relevance.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Check the relevance of each question to its .txt file, without grouping by folder

In [None]:
# Calculate ordered cosine similarities
def calculate_cosine_similarities_ordered(lisa_embeddings, question_embeddings):
    """Return the cosine similarity of each sheet embedding with the question at the same position.

    Both inputs are converted with ``torch.tensor``; pairs are formed with
    ``zip``, so the result has as many entries as the shorter input.
    """
    lisa_rows = torch.tensor(lisa_embeddings)
    question_rows = torch.tensor(question_embeddings)

    similarities = []
    for lisa_vector, question_vector in zip(lisa_rows, question_rows):
        similarities.append(cosine_similarity(lisa_vector, question_vector))
    return similarities


def calculate_cosine_similarities_random(lisa_embeddings, question_embeddings):
    """Return cosine similarities between sheet embeddings and randomly permuted questions.

    Both inputs are converted with ``torch.tensor``. The question rows are
    reordered with ``torch.randperm`` and then paired positionally with the
    sheet rows, so results vary between calls unless the torch RNG is seeded.
    """
    lisa_rows = torch.tensor(lisa_embeddings)
    question_rows = torch.tensor(question_embeddings)

    # Random permutation of the question row indices.
    permutation = torch.randperm(len(question_rows))
    shuffled_question_rows = question_rows[permutation]

    similarities = []
    for lisa_vector, question_vector in zip(lisa_rows, shuffled_question_rows):
        similarities.append(cosine_similarity(lisa_vector, question_vector))
    return similarities

In [None]:
# Similarities for the embeddings built from `questions`: position-aligned
# pairs, plus a randomly shuffled pairing for comparison.
ordered_similarities = calculate_cosine_similarities_ordered(lisa_embeddings, question_embeddings)
random_similarities = calculate_cosine_similarities_random(lisa_embeddings, question_embeddings)

In [None]:
def plot_similarity_distributions(ordered_similarities, random_similarities):
    """Show two side-by-side histograms: aligned similarities (left) and shuffled ones (right)."""
    panels = (
        (ordered_similarities, 'blue', 'Cosine Similarity Distribution - Input sheet'),
        (random_similarities, 'orange', 'Cosine Similarity Distribution - Random'),
    )

    plt.figure(figsize=(12, 6))

    for slot, (values, fill_color, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.hist(values, bins=20, color=fill_color, alpha=0.7, edgecolor='black')
        plt.title(heading)
        plt.xlabel('Cosine Similarity')
        plt.ylabel('Frequency')

    plt.tight_layout()
    plt.show()

In [None]:
#GPT4
# Plot the aligned vs. shuffled similarity histograms computed in the previous cell.
plot_similarity_distributions(ordered_similarities, random_similarities)

In [None]:
# PHI-3
# Same aligned vs. shuffled comparison, computed from the Phi embeddings and plotted.
ordered_similarities_phi = calculate_cosine_similarities_ordered(lisa_embeddings_phi, question_embeddings_phi)
random_similarities_phi = calculate_cosine_similarities_random(lisa_embeddings_phi, question_embeddings_phi)

plot_similarity_distributions(ordered_similarities_phi, random_similarities_phi)

In [None]:
def cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two tensors as a Python float.

    Each tensor gets a leading batch dimension of size 1 before the
    comparison, which uses ``F.cosine_similarity`` with its default ``dim``.
    """
    first_batched = torch.unsqueeze(embedding1, 0)
    second_batched = torch.unsqueeze(embedding2, 0)
    return F.cosine_similarity(first_batched, second_batched).item()

In [None]:
def calculate_pairwise_option_similarities(questions, model, tokenizer, device="cuda"):
    """Compute the cosine similarity of every pair of answer options per question.

    Args:
        questions: Iterable of dicts with 'question', 'option_a'..'option_d'
            and 'correct_option' keys.
        model: Encoder passed through to ``generate_embeddings``.
        tokenizer: Tokenizer passed through to ``generate_embeddings``.
        device: Device for the tokenized inputs. The default "cuda" falls
            back to "cpu" when CUDA is not available.

    Returns:
        A list with one dict per question holding 'question',
        'similarities' (keys "a_b", "a_c", "a_d", "b_c", "b_d", "c_d") and
        'correct_option'.
    """
    from itertools import combinations

    # Callers rely on the default device; without this fallback a CPU-only
    # host would fail when the inputs are moved to "cuda".
    if device == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    option_letters = ("a", "b", "c", "d")
    results = []

    for question_data in questions:
        # Flat list of the four option strings, in a-d order.
        options = [question_data[f"option_{letter}"] for letter in option_letters]

        embeddings = generate_embeddings(options, model, tokenizer, device)

        # Convert each embedding to a tensor once rather than once per pair.
        vectors = {
            letter: torch.tensor(vector)
            for letter, vector in zip(option_letters, embeddings)
        }

        # combinations() yields the six unordered pairs in a_b .. c_d order.
        similarities = {
            f"{first}_{second}": cosine_similarity(vectors[first], vectors[second])
            for first, second in combinations(option_letters, 2)
        }

        results.append({
            "question": question_data['question'],
            "similarities": similarities,
            "correct_option": question_data['correct_option']
        })

    return results

In [None]:
# Pairwise option similarities for every entry in `questions` (default device argument).
option_similarities = calculate_pairwise_option_similarities(questions, model, tokenizer)

In [None]:
# Pairwise option similarities for the Phi questions (default device argument).
option_similarities_phi = calculate_pairwise_option_similarities(phi_questions, model, tokenizer)

In [None]:
def plot_similarity_histogram(results):
    """Plot one histogram of all option-pair similarities found in *results*.

    Args:
        results: Iterable of dicts whose "similarities" value maps pair
            labels to similarity scores.
    """
    # Flatten every question's pair scores into a single list.
    all_scores = [
        score
        for result in results
        for score in result["similarities"].values()
    ]

    plt.figure(figsize=(10, 6))
    plt.hist(all_scores, bins=20, color='skyblue', edgecolor='black')
    plt.title("Distribution of Option Pair Similarities Across Questions")
    plt.xlabel("Cosine Similarity")
    plt.ylabel("Frequency")
    plt.show()

# GPT4


In [None]:
# Histogram of the option-pair similarities computed from `questions`.
plot_similarity_histogram(option_similarities)

In [None]:
#PHI3
# Histogram of the option-pair similarities computed from the Phi questions.
plot_similarity_histogram(option_similarities_phi)

In [None]:
def plot_ambiguity_similarity_boxplot(results, model_name):
    """
    Draw a box plot of ambiguity (1 - cosine similarity) per option pair and save it.

    Parameters:
    - results: Iterable of dictionaries whose "similarities" value maps a
      pair label (e.g. "a_b") to a cosine similarity.
    - model_name: Name of the model (e.g., "GPT4o", "Phi3.5"); used in the
      plot title and in the output file name ``ambiguity_{model_name}.png``.
    """
    pair_labels = []
    ambiguities = []

    # One row per (question, pair), with similarity turned into ambiguity.
    for result in results:
        for pair, similarity in result["similarities"].items():
            pair_labels.append(pair)
            ambiguities.append(1 - similarity)

    df = pd.DataFrame({"Pair": pair_labels, "Ambiguity": ambiguities})

    # Boxes are ordered by median ambiguity, highest first.
    pair_medians = df.groupby("Pair")["Ambiguity"].median()
    median_sorted_pairs = pair_medians.sort_values(ascending=False).index

    plt.figure(figsize=(12, 6))
    sns.boxplot(x="Pair", y="Ambiguity", data=df, order=median_sorted_pairs)
    plt.title(f"Ambiguity (1 - Cosine Similarity) by Option Pair ({model_name})")
    plt.xlabel("Option Pair")
    plt.ylabel("Ambiguity (1 - Cosine Similarity)")
    plt.tight_layout()
    plt.savefig(f"ambiguity_{model_name}.png", dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# # GPT4
# Ambiguity box plot for the `questions` results; saved as ambiguity_GPT4o.png.
plot_ambiguity_similarity_boxplot(option_similarities, "GPT4o")

In [None]:
# PHI3
# Ambiguity box plot for the Phi results; saved as ambiguity_Phi3.5.png.
plot_ambiguity_similarity_boxplot(option_similarities_phi, "Phi3.5")

In [None]:
def calculate_correct_incorrect_similarities(questions, model, tokenizer, device="cuda"):
    """Compute the cosine similarity between the correct option and each incorrect one.

    Args:
        questions: Iterable of dicts with 'question', 'option_a'..'option_d'
            and 'correct_option' keys. The answer key may be a bare letter
            (any case) or carry an "option_" prefix (e.g. "option_b").
        model: Encoder passed through to ``generate_embeddings``.
        tokenizer: Tokenizer passed through to ``generate_embeddings``.
        device: Device for the tokenized inputs. The default "cuda" falls
            back to "cpu" when CUDA is not available.

    Returns:
        A list with one dict per usable question holding 'question',
        'similarities' (keys "<correct>_<other>", correct letter first) and
        the original 'correct_option' value. Questions whose answer key does
        not resolve to one of a-d are skipped.
    """
    # Callers rely on the default device; without this fallback a CPU-only
    # host would fail when the inputs are moved to "cuda".
    if device == "cuda" and not torch.cuda.is_available():
        device = "cpu"

    option_letters = ("a", "b", "c", "d")
    prefix = "option_"
    results = []

    for question_data in questions:
        # Normalise keys such as "option_b" to "b" instead of dropping the
        # question, matching how the length analysis treats these keys.
        correct_opt = question_data['correct_option'].strip().lower()
        if correct_opt.startswith(prefix):
            correct_opt = correct_opt[len(prefix):]

        # Resolve the key before embedding, so unusable questions cost no
        # model calls and cannot fail with a KeyError below.
        if correct_opt not in option_letters:
            continue

        options = {
            letter: question_data[f"option_{letter}"] for letter in option_letters
        }

        embeddings = generate_embeddings(list(options.values()), model, tokenizer, device)
        # Map option letters to tensors, converting each embedding only once.
        vectors = {
            letter: torch.tensor(vector)
            for letter, vector in zip(options.keys(), embeddings)
        }
        correct_vector = vectors[correct_opt]

        # Correct option always comes first in the pair label.
        correct_incorrect_similarities = {
            f"{correct_opt}_{opt}": cosine_similarity(correct_vector, vector)
            for opt, vector in vectors.items()
            if opt != correct_opt
        }

        results.append({
            "question": question_data['question'],
            "similarities": correct_incorrect_similarities,
            "correct_option": question_data['correct_option']
        })

    return results


# Correct-vs-incorrect option similarities for `questions` (default device argument).
correct_incorrect_similarities = calculate_correct_incorrect_similarities(questions, model, tokenizer)

In [None]:
def plot_correct_incorrect_similarity_boxplot(results):
    """Draw a box plot of correct-vs-incorrect option similarity per pair label.

    Args:
        results: Iterable of dicts whose "similarities" value maps a pair
            label to a cosine similarity.
    """
    pair_labels = []
    scores = []
    for result in results:
        for pair, similarity in result["similarities"].items():
            pair_labels.append(pair)
            scores.append(similarity)

    df = pd.DataFrame({"Pair": pair_labels, "Cosine Similarity": scores})

    # Boxes are ordered by median similarity, highest first.
    pair_medians = df.groupby("Pair")["Cosine Similarity"].median()
    median_sorted_pairs = pair_medians.sort_values(ascending=False).index

    plt.figure(figsize=(12, 6))
    sns.boxplot(x="Pair", y="Cosine Similarity", data=df, order=median_sorted_pairs)
    plt.title("Cosine Similarity between Correct and Incorrect Options (Correct Option First, Sorted by Median)")
    plt.xlabel("Option Pair (Sorted by Median Similarity)")
    plt.ylabel("Cosine Similarity")
    plt.show()

In [None]:
#plot_correct_incorrect_similarity_boxplot(correct_incorrect_similarities)

In [None]:
# Correct-vs-incorrect option similarities for the Phi questions (default device argument).
correct_incorrect_similarities_phi = calculate_correct_incorrect_similarities(phi_questions, model, tokenizer)

In [None]:
#Phi3
# Box plot of the Phi correct-vs-incorrect similarities.
plot_correct_incorrect_similarity_boxplot(correct_incorrect_similarities_phi)