In [None]:
# Google Colab Drive mount, disabled by wrapping the two statements in a string
# literal: the cell only evaluates (and displays) the string, nothing is mounted.
'''from google.colab import drive
drive.mount('/content/drive')'''

In [None]:
# Evaluation queries: the fixed list of question strings used to benchmark the
# chatbots in this notebook. Each string is handed to a model function as-is,
# so the wording below is part of the experiment and must not be edited casually.

queries = [
    "Is limits of human knowledge compatible with human intuition?",
    "How does the role of imagination shape our understanding of the universe?",
    "Can line spectra be explained without contradictions?",
    "Can entanglement be explained without contradictions?",
    "Could we have discovered spectral lines without experiments?",
    "Why is geometry and experience considered revolutionary?",
    "Could we have discovered quantum theory without experiments?",
    "How does constitution of radiation relate to our everyday experience?",
    "What does limits of human knowledge tell us about reality?",
    "What makes scientific method a cornerstone of modern science?",
    "What does international cooperation in science tell us about reality?",
    "What does gravity tell us about reality?",
    "Could we have discovered international cooperation in science without experiments?",
    "Does quantum jumps challenge classical physics?",
    "How does philosophy of quantum physics shape our understanding of the universe?",
    "Can international cooperation in science be explained without contradictions?",
    "What are the philosophical implications of philosophy of science?",
    "Is interference compatible with human intuition?",
    "How does philosophy of science relate to our everyday experience?",
    "Could we have discovered relativity without experiments?",
    "How does inertia shape our understanding of the universe?",
    "What does line spectra tell us about reality?",
    "Does scientific method challenge classical physics?",
    "How does role of the observer shape our understanding of the universe?",
    "Can complementarity be explained without contradictions?",
    "Is pacifism compatible with human intuition?",
    "Could we have discovered interference without experiments?",
    "How does entanglement relate to our everyday experience?",
    "Can the role of imagination be explained without contradictions?",
    "What makes relativity a cornerstone of modern science?",
    "Can philosophy of quantum physics be explained without contradictions?",
    "Could we have discovered complementarity without experiments?",
    "Is Brownian motion compatible with human intuition?",
    "Could we have discovered wave-particle duality without experiments?",
    "What are the philosophical implications of atomic structure?",
    "Could we have discovered line spectra without experiments?",
    "What are the philosophical implications of complementarity?",
    "Could we have discovered causality without experiments?",
    "Why is measurement problem considered revolutionary?",
    "Is space and time compatible with human intuition?",
    "Can quantum jumps be explained without contradictions?",
    "How does line spectra shape our understanding of the universe?",
    "Why is line spectra considered revolutionary?",
    "Why is atomic structure considered revolutionary?",
    "Could we have discovered pacifism without experiments?",
    "Can probabilistic reality be explained without contradictions?",
    "Is inertia compatible with human intuition?",
    "What does space and time tell us about reality?",
    "Why is causality considered revolutionary?",
    "Is geometry and experience compatible with human intuition?",
    "What makes causality in physics a cornerstone of modern science?",
    "What makes gravity a cornerstone of modern science?",
    "Why is spectral lines considered revolutionary?",
    "How does observation in physics relate to our everyday experience?",
    "How does nature of light relate to our everyday experience?",
    "How does atomic structure relate to our everyday experience?",
    "How does wave-particle duality shape our understanding of the universe?",
    "How does social responsibility of science relate to our everyday experience?",
    "Why is international cooperation in science considered revolutionary?",
    "Does gravity challenge classical physics?"
]

In [None]:
import numpy as np
import pandas as pd
import time
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors

# Corpus files: plain text, one cleaned passage per line.
CLEANED_PATH_BOHR = 'bohr_cleaned_final.txt'
CLEANED_PATH_EINSTEIN = 'einstein_cleaned_final.txt'

# Size of the per-author random sample used for the style centroid, and the seed
# that makes that sample (and therefore every StyleSim value) reproducible.
STYLE_SAMPLE_SIZE = 200
STYLE_SAMPLE_SEED = 42


def _load_passages(path):
    """Return the non-empty, whitespace-stripped lines of a UTF-8 text file.

    Raises:
        ValueError: if the file contains no non-blank line, because a style
            centroid cannot be computed from an empty corpus.
    """
    with open(path, 'r', encoding='utf-8') as f:
        passages = [l.strip() for l in f if l.strip()]
    if not passages:
        raise ValueError(f"No passages found in {path!r}")
    return passages


lines_bohr = _load_passages(CLEANED_PATH_BOHR)
lines_einstein = _load_passages(CLEANED_PATH_EINSTEIN)

# Sentence embedder shared by the style and relevance metrics.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Draw the samples from a dedicated seeded generator: without a seed the sample
# differs on every kernel restart, so the style metrics would not be comparable
# between runs.
_style_rng = np.random.default_rng(STYLE_SAMPLE_SEED)
sample_bohr = _style_rng.choice(
    lines_bohr, min(len(lines_bohr), STYLE_SAMPLE_SIZE), replace=False
)
sample_einstein = _style_rng.choice(
    lines_einstein, min(len(lines_einstein), STYLE_SAMPLE_SIZE), replace=False
)

# Per-author style centroid: the mean embedding of the sampled passages.
# NOTE: `mean_emb_bhor` is a historical misspelling that other cells read, so the
# name is kept; `mean_emb_bohr` is the correctly spelled alias.
mean_emb_bhor = embedder.encode(sample_bohr.tolist(), convert_to_numpy=True).mean(axis=0)
mean_emb_bohr = mean_emb_bhor
mean_emb_einstein = embedder.encode(sample_einstein.tolist(), convert_to_numpy=True).mean(axis=0)

# Combined centroid: the midpoint of the two author centroids.
mean_emb = (mean_emb_bhor + mean_emb_einstein) / 2

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Auxiliary function to compute internal repetition score
def repetition_score(text):
    """Return the mean pairwise TF-IDF cosine similarity between sentences of `text`.

    The text is split on '.', and only fragments with more than three
    whitespace-separated tokens count as sentences.

    Args:
        text (str): Generated response to analyse.

    Returns:
        float: Average similarity over all ordered pairs of distinct sentences.
            0.0 when fewer than two sentences qualify, or when the sentences
            contain no token the TF-IDF vectorizer can index.
    """
    sentences = [s.strip() for s in text.split('.') if len(s.strip().split()) > 3]
    n = len(sentences)
    if n < 2:
        return 0.0
    try:
        tfidf = TfidfVectorizer().fit_transform(sentences)
    except ValueError as exc:
        # scikit-learn raises "empty vocabulary" when no sentence yields an
        # indexable token (e.g. only single characters). Nothing can repeat in
        # that case, so report zero instead of aborting the whole evaluation.
        if "empty vocabulary" not in str(exc):
            raise
        return 0.0
    sim_matrix = cosine_similarity(tfidf)
    # Remove the actual self-similarities rather than assuming each equals 1:
    # a sentence with an all-zero TF-IDF row has self-similarity 0, and
    # subtracting n would then push the score below its true value.
    off_diagonal_total = sim_matrix.sum() - sim_matrix.trace()
    return float(off_diagonal_total / (n * (n - 1)))

# Main evaluation function
def evaluate_model_metrics(model_fn, model_name="CustomModel", verbose=False,
                           eval_queries=None, max_queries=10):
    """
    Evaluate textual quality and semantic relevance of a generative model.

    Each query is sent to `model_fn`; the response is scored for length,
    lexical diversity, sentence length, similarity to the combined style
    centroid (`mean_emb`), similarity to the query, and internal repetition.

    Args:
        model_fn (function): A function that takes a query string and returns a generated response
            (or None when generation failed; such queries are skipped).
        model_name (str): Optional name for the model (used in output).
        verbose (bool): If True, prints each query and generated response.
        eval_queries (list[str] | None): Queries to evaluate. Defaults to the
            notebook-level `queries` list.
        max_queries (int | None): Evaluate only the first `max_queries` queries
            (default 10, the historical behaviour). Pass None to use them all.

    Returns:
        dict: Dictionary of average evaluation metrics. Every value is NaN when
            no query produced a response.
    """
    metric_cols = ["Tokens", "LexDiv", "Distinct-1", "Distinct-2", "AvgSentLen",
                   "StyleSim", "QuerySim", "Repetition", "GenTime(s)"]

    if eval_queries is None:
        eval_queries = queries
    if max_queries is not None:
        eval_queries = eval_queries[:max_queries]

    def _cosine(u, v):
        """Cosine similarity between two 1-D vectors, as a Python float."""
        return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

    records = []

    for query in eval_queries:
        start = time.time()
        response = model_fn(query)
        # Stop the clock right after generation so the timing excludes any
        # bookkeeping below.
        gen_time = time.time() - start
        if response is None:
            print(f"[ERROR] No valid response for query: {query}")
            continue

        if verbose:
            print(f"\n[Query] {query}\n[Response] {response}\n")

        tokens = response.split()
        length = len(tokens)
        bigrams = list(zip(tokens, tokens[1:]))
        # Distinct-1 and lexical diversity are the same ratio (unique tokens /
        # tokens); both keys are kept because downstream output expects them.
        distinct1 = len(set(tokens)) / length if length else 0
        lex_div = distinct1
        distinct2 = len(set(bigrams)) / len(bigrams) if bigrams else 0
        sentences = [s.strip() for s in response.split('.') if s.strip()]
        sent_lens = [len(s.split()) for s in sentences]
        avg_sent_len = np.mean(sent_lens) if sent_lens else 0

        # Style similarity to the combined (Bohr + Einstein) style centroid
        resp_emb = embedder.encode([response], convert_to_numpy=True)[0]
        style_sim = _cosine(resp_emb, mean_emb)

        # Semantic relevance (query-to-response similarity)
        query_emb = embedder.encode([query], convert_to_numpy=True)[0]
        query_sim = _cosine(query_emb, resp_emb)

        # Internal repetition
        repetition = repetition_score(response)

        records.append({
            "Query": query,
            "Tokens": length,
            "LexDiv": lex_div,
            "Distinct-1": distinct1,
            "Distinct-2": distinct2,
            "AvgSentLen": avg_sent_len,
            "StyleSim": style_sim,
            "QuerySim": query_sim,
            "Repetition": repetition,
            "GenTime(s)": round(gen_time, 3)
        })

    if records:
        summary = pd.DataFrame(records)[metric_cols].mean().to_dict()
    else:
        # An empty frame has no metric columns; selecting them would raise
        # KeyError, so report NaN for every metric instead.
        print(f"[WARN] {model_name}: no valid responses, metrics are undefined")
        summary = {metric: float("nan") for metric in metric_cols}

    print(f"\n=== Metrics for {model_name} ===")
    for metric, value in summary.items():
        print(f"{metric}: {value:.4f}")

    return summary


The baseline is a generator that relies only on its pretrained parameters: unlike the other models, it does not retrieve passages from the corpus files, so it answers purely from what it learned during pretraining.

In [None]:
# Install the CPU build of FAISS; later cells import it as `faiss`.
!pip install faiss-cpu

### BASELINE CHATBOT

In [None]:
import pandas as pd
import numpy as np
import time
import random
from IPython.display import display
from sentence_transformers import SentenceTransformer
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import faiss

# Build (not load) a FAISS index over both corpora: every Einstein passage is
# embedded first, then every Bohr passage, and the two matrices are stacked in
# that order before being added to a flat L2 index.
# NOTE(review): baseline_chatbot below never queries this index.
print("[EVAL] Building FAISS index...")
passage_embeddings_einstein = embedder.encode(lines_einstein, convert_to_numpy=True, show_progress_bar=True)
passage_embeddings_bohr = embedder.encode(lines_bohr, convert_to_numpy=True, show_progress_bar=True)
passage_embeddings = np.vstack([passage_embeddings_einstein, passage_embeddings_bohr])

# The index dimensionality is taken from the embedding matrix width.
index = faiss.IndexFlatL2(passage_embeddings.shape[1])
index.add(passage_embeddings)
print("[EVAL] FAISS index ready.")

# GPT-2 tokenizer and model wrapped in a text-generation pipeline.
# device=-1 presumably selects the CPU (transformers pipeline convention) — TODO confirm.
baseline_tokenizer = AutoTokenizer.from_pretrained("gpt2")
baseline_model     = AutoModelForCausalLM.from_pretrained("gpt2")
baseline_pipe      = pipeline(
    "text-generation",
    model=baseline_model,
    tokenizer=baseline_tokenizer,
    device=-1
)

def baseline_chatbot(prompt: str, max_new_tokens: int = 50):
    """Answer `prompt` with the GPT-2 baseline pipeline, without any retrieval.

    Args:
        prompt: The user question, inserted verbatim into the prompt template.
        max_new_tokens: Maximum number of tokens to sample after the prompt.

    Returns:
        str: The text the model produced after the prompt, stripped of
            surrounding whitespace.
    """
    full = f"You are a helpful assistant. Answer succinctly:\n\nQuestion: {prompt}\nAnswer:"
    out = baseline_pipe(
        full,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
        pad_token_id=baseline_tokenizer.eos_token_id
    )[0]["generated_text"]
    # The pipeline returns prompt + continuation. Remove the prompt by prefix:
    # splitting on the LAST "Answer:" discarded the real answer whenever the
    # model went on to write a follow-up "Question: ... Answer: ..." pair.
    if out.startswith(full):
        continuation = out[len(full):]
    else:
        # Fallback if the prompt is not echoed verbatim: keep everything after
        # the first marker.
        continuation = out.split("Answer:", 1)[-1]
    return continuation.strip()


# Score the GPT-2 baseline; verbose=True also prints every query/response pair.
evaluate_model_metrics(baseline_chatbot, model_name="BASELINE", verbose=True)

# MODEL 5: GPT-Neo 1.3B with Dynamic Context Truncation

In this fifth model, we introduce a refined version of the GPT-Neo-based RAG pipeline. Instead of simply scaling the generator up, this version prioritizes *efficient use of the input token window* by dynamically adapting how much context to include for each query. We also adopt a more detailed, structured prompt to guide answer style and length.

1) **Corpus and Embedding (unchanged)**  
We reuse the same cleaned Einstein corpus and embed each paragraph using the `all-MiniLM-L6-v2` SentenceTransformer model. All embeddings are indexed using FAISS for fast nearest-neighbor search via L2 distance.

2) **Generator Upgrade: GPT-Neo 1.3B**  
We switch from GPT-Neo 2.7B to the lighter `gpt-neo-1.3B` model. While smaller, this version still provides high-quality outputs and is more manageable in environments with limited GPU memory (e.g., Colab).

3) **Smart Context Assembly**  
We still retrieve the top `k=7` passages, but instead of blindly concatenating all of them we build the input **dynamically**, one paragraph at a time. Before each addition, we test whether the resulting prompt (including the final question and instructions) still fits within the model’s token limit while leaving room for the tokens to be generated; we stop at the first passage that does not fit. This ensures we use *as much relevant context as possible* without truncation, which is especially important for long-generation models.

4) **Prompting Strategy**  
The prompt tells the model to act as Albert Einstein and provide a clear, thoughtful, and concise answer based on the retrieved "excerpts." It explicitly instructs the model *not to mention the excerpts*, which helps avoid meta comments. The target answer length is 3–5 sentences.

5) **Generation Details**  
We sample up to 400 new tokens using `top_p` sampling with moderate temperature, and apply a `no_repeat_ngram_size` of 3 to prevent redundant phrasing.

6) **Post-processing**  
We extract the answer from the generated output after the “Answer:” token, clean it up, and trim trailing content after known markers (like “---” or repeated prompt sections). We also cut the answer back to its last full stop so that it does not end mid-sentence.

**Key Differences from Model 4:**
- **Smaller model (1.3B vs 2.7B)**, better for memory-constrained environments.
- **Dynamic context selection** ensures optimal use of token budget with no waste or hard truncation.
- **Stronger prompt structure** guides both tone and content.
- **Improved repetition control** using `no_repeat_ngram_size`.

This model finds a sweet spot between resource efficiency and generation quality. Thanks to smarter context management and focused prompting, it often produces answers that are just as good—if not better—than its larger predecessor.


In [None]:
import gc
import torch
# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA
# memory, before the next cell loads the GPT-Neo model.
gc.collect()
torch.cuda.empty_cache()

In [None]:
import os
import time
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Step 1: Load the cleaned corpus
# The Einstein corpus and its index are stored under einstein_* names. The
# chatbot reads them at call time, so sharing generic names with any other cell
# that builds a different corpus would silently change what it retrieves.
print("[1/8] Loading corpus...")
corpus_path = 'einstein_cleaned_final.txt'
with open(corpus_path, 'r', encoding='utf-8') as f:
    einstein_passages = [line.strip() for line in f if line.strip()]
print(f"[1/8] Corpus loaded: {len(einstein_passages)} passages\n")

# Step 2: Initialize the SentenceTransformer embedder
print("[2/8] Initializing SentenceTransformer...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
print("[2/8] Embedder ready\n")

# Step 3: Compute passage embeddings and build FAISS index
print("[3/8] Computing embeddings and indexing...")
batch_size = 64
emb_chunks = []
start = time.time()
for i in range(0, len(einstein_passages), batch_size):
    batch = einstein_passages[i:i+batch_size]
    embs = embedder.encode(batch, convert_to_numpy=True, show_progress_bar=False)
    emb_chunks.append(embs)
    print(f"    [3/8] Batch {i//batch_size+1}/{(len(einstein_passages)-1)//batch_size+1} completed")
einstein_embeddings = np.vstack(emb_chunks)
dim = einstein_embeddings.shape[1]
einstein_index = faiss.IndexFlatL2(dim)
einstein_index.add(einstein_embeddings)
print(f"[3/8] Indexed {einstein_embeddings.shape[0]} vectors in {time.time()-start:.1f}s\n")

# Step 4: Load the GPT-Neo 1.3B model and tokenizer
print("[4/8] Initializing tokenizer and GPT-Neo (1.3B)...")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
generator_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator_model.to(device)
print(f"[4/8] Model and tokenizer ready (device={device})\n")


def _einstein_prompt(context_chunks, query):
    """Build the full generation prompt from the selected excerpts and the question."""
    return (
        "You are Albert Einstein. Based on the following excerpts, answer the question in your own words. "
        "Provide a clear, thoughtful, and concise answer in 3–5 complete sentences. "
        "Do not refer to the excerpts or sources. Just answer directly.\n\n"
        "Excerpts:\n" + "\n---\n".join(context_chunks) + f"\n\nQuestion: {query}\nAnswer:"
    )


# Step 5–7: Define the RAG pipeline using GPT-Neo 1.3B
def einstein_chatbot(query, k=7, max_new_tokens=400, top_p=0.8, temperature=0.7):
    """Answer `query` in Einstein's voice using retrieval-augmented generation.

    Args:
        query (str): The user question.
        k (int): Number of nearest passages to retrieve from the Einstein index.
        max_new_tokens (int): Maximum number of tokens to generate.
        top_p (float): Nucleus-sampling probability mass.
        temperature (float): Sampling temperature.

    Returns:
        str: The post-processed answer (may be empty if the model produced
            nothing before a stop marker).
    """
    print(f"\n[5/8] Processing query: {query!r}")

    q_emb = embedder.encode([query], convert_to_numpy=True)
    _, neighbor_ids = einstein_index.search(q_emb, k)

    # Usable context window: the tokenizer's limit, capped by the model's
    # position-embedding size when the config exposes it.
    max_input_tokens = min(
        tokenizer.model_max_length,
        getattr(generator_model.config, "max_position_embeddings", tokenizer.model_max_length),
    )
    input_budget = max_input_tokens - max_new_tokens

    total_tokens = 0
    context_chunks = []

    # Dynamically accumulate context passages until input length limit
    for idx in neighbor_ids[0]:
        if idx < 0:
            # FAISS pads the result with -1 when fewer than k vectors exist;
            # indexing the list with -1 would silently pick the last passage.
            continue
        para_text = einstein_passages[idx][:1500]
        token_count = len(tokenizer.encode(_einstein_prompt(context_chunks + [para_text], query)))
        if token_count >= input_budget:
            break
        context_chunks.append(para_text)
        total_tokens = token_count

    print(f"[5/8] Collected {len(context_chunks)} context passages (approx. {total_tokens} tokens)\n")

    prompt = _einstein_prompt(context_chunks, query)
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=max_input_tokens).to(device)
    print(f"[6/8] Prompt ready (tokens={input_ids.shape[1]}/{max_input_tokens})")

    output_ids = generator_model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3
    )

    # Decode only the tokens generated after the prompt. Splitting the full text
    # on the last "Answer:" breaks when an excerpt contains that marker or when
    # the model writes it again, which dropped the real answer.
    new_token_ids = output_ids[0][input_ids.shape[1]:]
    answer = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    # Cut at the first marker that signals the model has left the answer.
    for stop_token in ["---", "\n\n", "\nAnswer:"]:
        if stop_token in answer:
            answer = answer.split(stop_token)[0].strip()
    # Trim back to the last full stop so the answer does not end mid-sentence.
    last_stop = answer.rfind('.')
    if last_stop != -1:
        answer = answer[:last_stop + 1]

    print("[7/8] Answer generated")
    return answer

# Step 8: Run example queries
example_queries = [
    "What is the nature of time?",
    "How would you describe imagination?",
    "What is the nature of light?"
]

for q in example_queries:
    print(f"\n=============================\nQuery: {q}")
    answer = einstein_chatbot(q)
    print(f"\nAnswer:\n{answer}")
    print("=============================")


In [None]:
# Score the Einstein RAG chatbot; verbose=True also prints every query/response pair.
# NOTE(review): the label "GPT-NEO 2" does not identify the persona — consider a more specific model_name.
evaluate_model_metrics(einstein_chatbot, model_name="GPT-NEO 2", verbose=True)

# MODEL 5: GPT-Neo 1.3B with Dynamic Context Truncation

In this fifth model, we introduce a refined version of the GPT-Neo-based RAG pipeline. Instead of simply scaling the generator up, this version prioritizes *efficient use of the input token window* by dynamically adapting how much context to include for each query. We also adopt a more detailed, structured prompt to guide answer style and length.

1) **Corpus and Embedding (unchanged)**  
We reuse the same cleaned Bohr corpus and embed each paragraph using the `all-MiniLM-L6-v2` SentenceTransformer model. All embeddings are indexed using FAISS for fast nearest-neighbor search via L2 distance.

2) **Generator Upgrade: GPT-Neo 1.3B**  
We switch from GPT-Neo 2.7B to the lighter `gpt-neo-1.3B` model. While smaller, this version still provides high-quality outputs and is more manageable in environments with limited GPU memory (e.g., Colab).

3) **Smart Context Assembly**  
We still retrieve the top `k=7` passages, but instead of blindly concatenating all of them we build the input **dynamically**, one paragraph at a time. Before each addition, we test whether the resulting prompt (including the final question and instructions) still fits within the model’s token limit while leaving room for the tokens to be generated; we stop at the first passage that does not fit. This ensures we use *as much relevant context as possible* without truncation, which is especially important for long-generation models.

4) **Prompting Strategy**  
The prompt tells the model to act as Bohr and provide a clear, thoughtful, and concise answer based on the retrieved "excerpts." It explicitly instructs the model *not to mention the excerpts*, which helps avoid meta comments. The target answer length is 3–5 sentences.

5) **Generation Details**  
We sample up to 400 new tokens using `top_p` sampling with moderate temperature, and apply a `no_repeat_ngram_size` of 3 to prevent redundant phrasing.

6) **Post-processing**  
We extract the answer from the generated output after the “Answer:” token, clean it up, and trim trailing content after known markers (like “---” or repeated prompt sections). We also cut the answer back to its last full stop so that it does not end mid-sentence.

**Key Differences from Model 4:**
- **Smaller model (1.3B vs 2.7B)**, better for memory-constrained environments.
- **Dynamic context selection** ensures optimal use of token budget with no waste or hard truncation.
- **Stronger prompt structure** guides both tone and content.
- **Improved repetition control** using `no_repeat_ngram_size`.

This model finds a sweet spot between resource efficiency and generation quality. Thanks to smarter context management and focused prompting, it often produces answers that are just as good—if not better—than its larger predecessor.


In [None]:
import gc
import torch
# Run a garbage-collection pass, then ask PyTorch to release its cached CUDA
# memory, before the next cell loads the GPT-Neo model again.
gc.collect()
torch.cuda.empty_cache()

In [None]:
import os
import time
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Step 1: Load the cleaned corpus
# The Bohr corpus and its index are stored under bohr_* names. Reusing generic
# names such as `passages` / `index` here would overwrite the objects that other
# chatbots defined earlier read at call time, making them retrieve Bohr text.
print("[1/8] Loading corpus...")
corpus_path = 'bohr_cleaned_final.txt'
with open(corpus_path, 'r', encoding='utf-8') as f:
    bohr_passages = [line.strip() for line in f if line.strip()]
print(f"[1/8] Corpus loaded: {len(bohr_passages)} passages\n")

# Step 2: Initialize the SentenceTransformer embedder
print("[2/8] Initializing SentenceTransformer...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
print("[2/8] Embedder ready\n")

# Step 3: Compute passage embeddings and build FAISS index
print("[3/8] Computing embeddings and indexing...")
batch_size = 64
emb_chunks = []
start = time.time()
for i in range(0, len(bohr_passages), batch_size):
    batch = bohr_passages[i:i+batch_size]
    embs = embedder.encode(batch, convert_to_numpy=True, show_progress_bar=False)
    emb_chunks.append(embs)
    print(f"    [3/8] Batch {i//batch_size+1}/{(len(bohr_passages)-1)//batch_size+1} completed")
bohr_embeddings = np.vstack(emb_chunks)
dim = bohr_embeddings.shape[1]
bohr_index = faiss.IndexFlatL2(dim)
bohr_index.add(bohr_embeddings)
print(f"[3/8] Indexed {bohr_embeddings.shape[0]} vectors in {time.time()-start:.1f}s\n")

# Step 4: Load the GPT-Neo 1.3B model and tokenizer
print("[4/8] Initializing tokenizer and GPT-Neo (1.3B)...")
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
generator_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
generator_model.to(device)
print(f"[4/8] Model and tokenizer ready (device={device})\n")


def _bohr_prompt(context_chunks, query):
    """Build the full generation prompt from the selected excerpts and the question."""
    return (
        "You are Niels Bohr. Based on the following excerpts, answer the question in your own words. "
        "Provide a clear, thoughtful, and concise answer in 3–5 complete sentences. "
        "Do not refer to the excerpts or sources. Just answer directly.\n\n"
        "Excerpts:\n" + "\n---\n".join(context_chunks) + f"\n\nQuestion: {query}\nAnswer:"
    )


# Step 5–7: Define the RAG pipeline using GPT-Neo 1.3B
def bohr_chatbot(query, k=7, max_new_tokens=400, top_p=0.8, temperature=0.7):
    """Answer `query` in Bohr's voice using retrieval-augmented generation.

    Args:
        query (str): The user question.
        k (int): Number of nearest passages to retrieve from the Bohr index.
        max_new_tokens (int): Maximum number of tokens to generate.
        top_p (float): Nucleus-sampling probability mass.
        temperature (float): Sampling temperature.

    Returns:
        str: The post-processed answer (may be empty if the model produced
            nothing before a stop marker).
    """
    print(f"\n[5/8] Processing query: {query!r}")

    q_emb = embedder.encode([query], convert_to_numpy=True)
    _, neighbor_ids = bohr_index.search(q_emb, k)

    # Usable context window: the tokenizer's limit, capped by the model's
    # position-embedding size when the config exposes it.
    max_input_tokens = min(
        tokenizer.model_max_length,
        getattr(generator_model.config, "max_position_embeddings", tokenizer.model_max_length),
    )
    input_budget = max_input_tokens - max_new_tokens

    total_tokens = 0
    context_chunks = []

    # Dynamically accumulate context passages until input length limit
    for idx in neighbor_ids[0]:
        if idx < 0:
            # FAISS pads the result with -1 when fewer than k vectors exist;
            # indexing the list with -1 would silently pick the last passage.
            continue
        para_text = bohr_passages[idx][:1500]
        token_count = len(tokenizer.encode(_bohr_prompt(context_chunks + [para_text], query)))
        if token_count >= input_budget:
            break
        context_chunks.append(para_text)
        total_tokens = token_count

    print(f"[5/8] Collected {len(context_chunks)} context passages (approx. {total_tokens} tokens)\n")

    prompt = _bohr_prompt(context_chunks, query)
    input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True, max_length=max_input_tokens).to(device)
    print(f"[6/8] Prompt ready (tokens={input_ids.shape[1]}/{max_input_tokens})")

    output_ids = generator_model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3
    )

    # Decode only the tokens generated after the prompt. Splitting the full text
    # on the last "Answer:" breaks when an excerpt contains that marker or when
    # the model writes it again, which dropped the real answer.
    new_token_ids = output_ids[0][input_ids.shape[1]:]
    answer = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

    # Cut at the first marker that signals the model has left the answer.
    for stop_token in ["---", "\n\n", "\nAnswer:"]:
        if stop_token in answer:
            answer = answer.split(stop_token)[0].strip()
    # Trim back to the last full stop so the answer does not end mid-sentence.
    last_stop = answer.rfind('.')
    if last_stop != -1:
        answer = answer[:last_stop + 1]

    print("[7/8] Answer generated")
    return answer

# Step 8: Run example queries
example_queries = [
    "What is light made of?",
    "Can we really know what an atom looks like?",
    "Do we change things just by looking at them?",
]

for q in example_queries:
    print(f"\n=============================\nQuery: {q}")
    answer = bohr_chatbot(q)
    print(f"\nAnswer:\n{answer}")
    print("=============================")


In [None]:
# Score the Bohr RAG chatbot; verbose=True also prints every query/response pair.
# NOTE(review): the label "GPT-NEO 2" does not identify the persona — consider a more specific model_name.
evaluate_model_metrics(bohr_chatbot, model_name="GPT-NEO 2", verbose=True)

In [None]:
# Generate answers for each question with each of the three chatbots.
# Every entry of `queries` is used here, so each list has one answer per query,
# in query order. This makes one generation call per query per chatbot.
baseline_outputs = [baseline_chatbot(q) for q in queries]
einstein_outputs = [einstein_chatbot(q) for q in queries]
bohr_outputs = [bohr_chatbot(q) for q in queries]


In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

### DATAFRAME 1 ###
# Long-format table: three rows per question (Einstein, Bohr, baseline answer),
# each tagged with the name of the system that produced it.
records = []
for i, q in enumerate(queries):
    records += [
        [q, einstein_outputs[i], "EINSTEIN"],
        [q, bohr_outputs[i], "BOHR"],
        [q, baseline_outputs[i], "BASELINE"],
    ]
df1 = pd.DataFrame(data=records, columns=["question", "response", "label"])

df1


In [None]:
### EVALUATION METRICS ###
def compute_style_sim(ans):
    """Return (similarity to Einstein centroid, similarity to Bohr centroid) for `ans`.

    The answer is embedded once and compared, by cosine similarity, with each
    author's mean style embedding.
    """
    answer_vec = [embedder.encode(ans)]
    return tuple(
        cosine_similarity(answer_vec, [centroid])[0][0]
        for centroid in (mean_emb_einstein, mean_emb_bhor)
    )

def compute_query_sim(q, a):
    """Return the cosine similarity between the embeddings of question `q` and answer `a`."""
    question_vec, answer_vec = [embedder.encode(text) for text in (q, a)]
    similarity = cosine_similarity([question_vec], [answer_vec])
    return similarity[0][0]

def compute_repetition(text):
    """Return the highest TF-IDF cosine similarity between any two sentences of `text`.

    Sentences are obtained by splitting on '.', '?' and '!' and dropping empty
    fragments.

    Returns:
        float: The maximum off-diagonal similarity. 0.0 when there are fewer
            than two sentences, or when no sentence contains a token the TF-IDF
            vectorizer can index.
    """
    sents = [s.strip() for s in re.split(r'[.?!]', text) if s.strip()]
    if len(sents) <= 1:
        return 0.0
    try:
        tfidf = TfidfVectorizer().fit_transform(sents).toarray()
    except ValueError as exc:
        # scikit-learn raises "empty vocabulary" for inputs such as "1. 2. 3."
        # where no fragment yields an indexable token. Such text has no
        # measurable repetition, so return 0.0 instead of aborting the run.
        if "empty vocabulary" not in str(exc):
            raise
        return 0.0
    sims = cosine_similarity(tfidf)
    # Ignore each sentence's similarity with itself.
    np.fill_diagonal(sims, 0)
    return float(sims.max())



In [None]:
### DATAFRAME 2 ###
# One feature row per response in df1, in the same row order:
# style similarity to each author, relevance to the question, and repetition.
metrics = []
for q, a in zip(df1["question"], df1["response"]):
    sim_e, sim_b = compute_style_sim(a)
    metrics.append([sim_e, sim_b, compute_query_sim(q, a), compute_repetition(a)])
df2 = pd.DataFrame(
    metrics,
    columns=["StyleSim_E", "StyleSim_B", "QuerySim", "Repetition"],
)

df2

In [None]:
### CLASSIFIER ###
# Can the metric features tell the three systems apart?
from sklearn.model_selection import cross_val_predict

X = df2
y = df1["label"]

# Model fitted on all rows, kept for inspection (e.g. its coefficients).
clf = LogisticRegression(max_iter=1000).fit(X, y)

# Predictions used for reporting are OUT-OF-FOLD: each row is predicted by a
# model that never saw it. Predicting the training rows with `clf` itself
# measures memorisation and overstates how separable the systems are.
n_folds = min(5, int(y.value_counts().min()))
if n_folds >= 2:
    y_pred = cross_val_predict(LogisticRegression(max_iter=1000), X, y, cv=n_folds)
else:
    # Too few rows per class to cross-validate; fall back to in-sample
    # predictions (optimistic, interpret with care).
    y_pred = clf.predict(X)



In [None]:
### DATAFRAME 3 ###
# Pair each predicted label with its true label. The column names are Italian:
# "classificato_come" = "classified as", "etichetta_reale" = "true label".
df3 = pd.DataFrame({
    "classificato_come": y_pred,
    "etichetta_reale": y
})

### RESULTS ###
# Print scikit-learn's text classification report for true vs. predicted labels.
# NOTE(review): how meaningful this report is depends on whether y_pred was
# produced on data the classifier did not train on — confirm in the classifier cell.
print("=== CLASSIFICATION REPORT ===")
print(classification_report(y, y_pred))