In [1]:
import os
from openai import OpenAI
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json

# Load the training data and generated questions data.
# Both files are read as JSON Lines (one JSON document per line). The encoding is
# pinned to UTF-8 so decoding does not depend on the platform default, and blank
# lines (e.g. a trailing newline at end of file) are skipped instead of crashing
# json.loads.
with open('formatted_train.jsonl', 'r', encoding='utf-8') as f:
    training_data = [json.loads(line) for line in f if line.strip()]

with open('generated_test_results_few_shot.jsonl', 'r', encoding='utf-8') as f:
    generated_data = [json.loads(line) for line in f if line.strip()]

# Define a function to extract combined question and choices text for comparison
def extract_combined_text(data_entry):
    """
    Extract the combined question and choices text from a data entry for comparison.

    The text is read from the 'output' key if present, otherwise from
    'generated_response'. The question is the span between the "Question:" and
    "Choices:" markers; everything from "Choices:" to the end of the text is kept
    verbatim, so any text that follows the choices (such as an answer line) is
    part of the result.

    Args:
        data_entry (dict): A single data entry from the dataset.

    Returns:
        str: "<question> <choices...>" when both markers are found; the whole
        stripped text when either marker is missing; "" when the entry has
        neither key or the text is empty.
    """
    if 'output' in data_entry:
        text = data_entry['output']
    elif 'generated_response' in data_entry:
        text = data_entry['generated_response']
    else:
        return ""

    if not text:
        return ""

    question_marker = "Question:"
    choices_marker = "Choices:"

    question_pos = text.find(question_marker)
    # Only accept a "Choices:" marker that comes after the question marker.
    choices_pos = text.find(choices_marker, max(question_pos, 0))

    # str.find returns -1 when a marker is absent; slicing with that value would
    # silently cut off the wrong characters, so fall back to the full text.
    if question_pos == -1 or choices_pos == -1:
        return text.strip()

    question_text = text[question_pos + len(question_marker):choices_pos].strip()
    choices_text = text[choices_pos:].strip()
    return f"{question_text} {choices_text}"

# Define a function to calculate similarity
def calculate_similarity(text1, text2):
    """
    Calculate cosine similarity between two text inputs.

    A TF-IDF vectorizer is fitted on just the two texts and the cosine
    similarity of the resulting two rows is returned.

    Args:
        text1 (str): First text input.
        text2 (str): Second text input.

    Returns:
        float: Cosine similarity score. 0.0 is returned when either text is
        empty or when the texts contain no tokens to build a vocabulary from.
    """
    # An empty side shares no terms with the other one.
    if not text1 or not text2:
        return 0.0

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no token
        # survives tokenization, e.g. whitespace- or punctuation-only texts.
        return 0.0

    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    # Convert the numpy scalar to a plain float for downstream JSON output.
    return float(similarity[0][0])

# Score every generated question against the training questions built from the same input
results = []
total_similarity = 0
num_comparisons = 0
max_similarity_overall = 0.0

for generated in generated_data:
    prompt = generated.get('input', '')
    gen_combined = extract_combined_text(generated)

    # Best match seen so far for this generated question
    best_score = 0.0
    best_match = ""

    for candidate in training_data:
        # Entries with a different input are not comparable; skip them
        if candidate.get('input', '') != prompt:
            continue

        candidate_text = extract_combined_text(candidate)
        score = calculate_similarity(gen_combined, candidate_text)

        # Strictly greater: on ties the earliest training entry is kept
        if score > best_score:
            best_score, best_match = score, candidate_text

        # Every compared pair contributes to the overall average
        total_similarity += score
        num_comparisons += 1

    # Track the single highest score across all generated questions
    max_similarity_overall = max(max_similarity_overall, best_score)

    # Keep only the closest training question for the report
    results.append({
        "Generated Question": gen_combined,
        "Most Similar Training Question": best_match,
        "Highest Similarity Percentage": round(best_score * 100, 2)
    })

# Mean similarity over all compared pairs, as a percentage (0 when nothing was compared)
if num_comparisons > 0:
    average_similarity = (total_similarity / num_comparisons) * 100
else:
    average_similarity = 0

# The report is written as one indented JSON document containing the per-question
# results and the two summary figures
output = {
    "Results": results,
    "Overall Average Similarity Percentage": round(average_similarity, 2),
    "Highest Similarity Overall Percentage": round(max_similarity_overall * 100, 2)
}

with open('Similarity_Analysis_few_shot_with_training.jsonl', 'w') as f:
    json.dump(output, f, indent=4)

print("Similarity analysis complete. Results saved to Similarity_Analysis_few_shot_with_training.jsonl")


Similarity analysis complete. Results saved to Similarity_Analysis_few_shot_with_training.jsonl


In [7]:
import os
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load JSONL files
def load_jsonl(filename):
    """Load a JSONL file into a list of parsed records.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines are skipped silently; lines that fail to parse are
    reported on stdout and skipped.

    Args:
        filename (str): Path of the JSONL file, read as UTF-8.

    Returns:
        list: Parsed values in file order.
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # A blank line is not a malformed record; json.loads("") would
            # raise and trigger a misleading warning.
            if not line:
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                print(f"Skipping malformed JSON line in {filename}")
    return data

# Load both result sets: `training_data` is the comparison reference (read from the
# FULL_LLaMA results file) and `generated_data` holds the few-shot results.
training_data, generated_data = map(
    load_jsonl,
    (
        'generated_test_results_FULL_LLaMA.jsonl',
        'generated_test_results_few_shot_LLaMA.jsonl',
    ),
)

# Function to extract clean question text
def extract_combined_text(data_entry):
    """Return the lower-cased question and choices of an entry as one line.

    The text comes from the 'output' key, falling back to 'generated_response'
    when 'output' is absent. It is stripped and lower-cased before the
    "question:" and "choices:" markers are looked up. When both are found, the
    question (text between the markers) and everything from "choices:" onward
    are joined with a space and newlines are replaced by spaces. When either
    marker is missing, the whole normalized text is returned unchanged. An
    entry with no text yields "".
    """
    raw_text = data_entry.get('output', data_entry.get('generated_response', ''))
    if not raw_text:
        return ""

    # Case-insensitive marker lookup on trimmed text
    normalized = raw_text.strip().lower()

    question_marker = "question:"
    choices_marker = "choices:"
    question_pos = normalized.find(question_marker)
    choices_pos = normalized.find(choices_marker)

    # Without both markers there is nothing to split on
    if -1 in (question_pos, choices_pos):
        return normalized

    question_part = normalized[question_pos + len(question_marker):choices_pos].strip()
    choices_part = normalized[choices_pos:].strip()

    combined = " ".join((question_part, choices_part))
    return combined.replace("\n", " ").strip()

# Extract and preprocess text for training and generated data
training_texts = [extract_combined_text(entry) for entry in training_data]
generated_texts = [extract_combined_text(entry) for entry in generated_data]

# Ensure we only compare valid entries. After this filter the lists may be
# shorter than `training_data` / `generated_data`, so everything below indexes
# the filtered text lists only (row i of the similarity matrix belongs to
# generated_texts[i], column j to training_texts[j]).
training_texts = [text for text in training_texts if text]
generated_texts = [text for text in generated_texts if text]

# Number of closest training questions reported per generated question
TOP_K = 3

if training_texts and generated_texts:
    # Vectorize all questions together so both sets share one vocabulary and IDF
    vectorizer = TfidfVectorizer()
    tfidf_all = vectorizer.fit_transform(training_texts + generated_texts)

    # Separate training and generated TF-IDF matrices
    tfidf_train = tfidf_all[:len(training_texts)]
    tfidf_gen = tfidf_all[len(training_texts):]

    # Compute full pairwise similarity matrix (generated x training)
    similarity_matrix = cosine_similarity(tfidf_gen, tfidf_train)
else:
    # The vectorizer and cosine_similarity both reject empty inputs; use empty
    # rows so each generated question is still reported, with no matches.
    similarity_matrix = [[] for _ in generated_texts]
    print("Warning: no usable training or generated texts; similarity statistics will be 0.")

# Compare each generated question with all training questions
results = []
total_similarity = 0.0
num_comparisons = 0
max_similarity_overall = 0.0

for gen_combined, similarities in zip(generated_texts, similarity_matrix):
    most_similar_train_questions = []

    if len(similarities) > 0:
        # Sort similarities in descending order
        sorted_indices = similarities.argsort()[::-1]

        for idx in sorted_indices[:TOP_K]:
            most_similar_train_questions.append({
                "Training Question": training_texts[idx],
                "Similarity Percentage": round(float(similarities[idx]) * 100, 2)
            })

        # Update statistics with numpy reductions, converted to plain floats
        max_similarity_overall = max(max_similarity_overall, float(similarities.max()))
        total_similarity += float(similarities.sum())
        num_comparisons += len(similarities)

    # Store results
    results.append({
        "Generated Question": gen_combined,
        "Most Similar Training Questions": most_similar_train_questions
    })

# Calculate overall average similarity (over every generated/training pair)
average_similarity = round((total_similarity / num_comparisons) * 100, 2) if num_comparisons > 0 else 0

# Save the results to a JSONL file (one result object per line)
output_filename = "Generated_Questions_Similarity_Analysis_few_shot_LLaMA_with_generated_fULL_LLaMA.jsonl"

with open(output_filename, 'w', encoding='utf-8') as f:
    for result in results:
        f.write(json.dumps(result) + "\n")

# Save overall statistics separately. This file holds a single indented JSON
# document rather than line-delimited records.
summary_filename = "Generated_Questions_Similarity_Analysis_few_shot_LLaMA_with_generated_fULL_LLaMA_summary.jsonl"
summary_output = {
    "Overall Average Similarity Percentage": average_similarity,
    "Highest Similarity Overall Percentage": round(max_similarity_overall * 100, 2)
}

with open(summary_filename, 'w', encoding='utf-8') as f:
    json.dump(summary_output, f, indent=4)

print(f"✅ Similarity analysis complete. Results saved to {output_filename} and summary to {summary_filename}.")


✅ Similarity analysis complete. Results saved to Generated_Questions_Similarity_Analysis_few_shot_LLaMA_with_generated_fULL_LLaMA.jsonl and summary to Generated_Questions_Similarity_Analysis_few_shot_LLaMA_with_generated_fULL_LLaMA_summary.jsonl.
