In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util,CrossEncoder

In [74]:
# Setup device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load bi-encoder for fast retrieval
bi_encoder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Load cross-encoder for reranking
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2', device=device)

# -------------------------
# Load Small Evaluator Model (FLAN-T5)
# -------------------------

evaluator_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base").to(device)
evaluator_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

# -------------------------
# Example Job Docs (RAG)
# -------------------------
job_docs = [
    "Python scripting, automation, data analysis using Pandas, NumPy, Matplotlib.",
    "Experience with machine learning frameworks like scikit-learn, TensorFlow, Keras.",
    "Object-oriented programming, version control, API interaction."
]

# -------------------------
# Questions & Answers
# -------------------------
questions = [
    "Can you tell me about your experience with Python?",
    "Describe your experience with machine learning.",
    "How do you approach debugging complex software issues?"
]

candidate_answers = [
    # Q1 Answer
    """My experience with Python is quite extensive, spanning several years across various domains. 
    I've used Python for scripting, automation, and data analysis, leveraging libraries like Pandas, NumPy, and Matplotlib. 
    I also have experience with web frameworks like Flask and Django for building RESTful APIs. 
    My work often involves writing clean, maintainable code and adhering to best practices in software development. 
    Additionally, I am familiar with version control systems like Git and have contributed to open-source projects.""",

    # Q2 Answer
    """I’ve worked on multiple ML projects. One involved customer churn prediction using Random Forest and XGBoost.
    I preprocessed data with Pandas, engineered features, and used scikit-learn for model training.""",
    
    # Q3 Answer
    """I first reproduce the issue, review logs, and isolate the failing component. 
    Then I use tools like `pdb`, print statements, and logging. 
    If it's async or multi-threaded, I use `threading` and `concurrent.futures` to track flow.
      I write unit tests to prevent regression."""
]

In [75]:
# -------------------------
# RAG Function
# -------------------------

# -------- Retrieval Function: Bi-Encoder + Cross-Encoder Reranking --------
def retrieve_reranked_context(query, docs, top_k=2):

    # Step 1: Embed query + docs using bi-encoder
    query_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    doc_embeddings = bi_encoder.encode(docs, convert_to_tensor=True)

    # Step 2: Get top-k based on cosine similarity
    cos_scores  = util.pytorch_cos_sim(query_embedding, doc_embeddings)[0]
    top_results = torch.topk(cos_scores , k=min(top_k * 3, len(docs)))

    # Step 3: Rerank using cross-encoder
    pairs = [(query, docs[idx]) for idx in top_results.indices]
    rerank_scores = cross_encoder.predict(pairs)

    # Step 4: Sort by rerank scores
    reranked = sorted(zip(pairs, rerank_scores), key=lambda x: x[1], reverse=True)
    top_contexts = [doc for (_, doc), _ in reranked[:top_k]]

    return ' '.join(top_contexts)

In [76]:
# -------- Evaluation Function --------

def evaluate_with_flan(question, answer, context):
    prompt = f"""
    
You are a professional technical interviewer.

You are given:
- A job Description
- An interview question
- A candidate's answer

Job Description: {context}
Interview Question: {question}
Candidate Answer: {answer}

Evaluate the candidate's answer based on relevance, depth, and accuracy.
Respond ONLY in the following format:

Comment: <1-2 sentence feedback on the answer quality>
Score: <integer from 1 (poor) to 10 (excellent)>

"""

    inputs = evaluator_tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).to(device)
    
    output = evaluator_model.generate(
        **inputs, 
        max_new_tokens=180, # means we will generate up to 200 new tokens
        # num_beams=4,   # means we will generate 4 different sequences and pick the best one
        # early_stopping=True,    #means we will stop generating when we reach the end of the sequence
        # do_sample=False,    # means we will not sample from the distribution, but take the most likely sequence
        )
    
    decoded=evaluator_tokenizer.decode(output[0], skip_special_tokens=True)

    print (f"Decoded Output: {decoded}")

    # Post-process to extract comment and score

    comment, score = "", ""
    
    for line in decoded.strip().splitlines():
        if line.lower().startswith("comment:"):
            comment = line.split(":", 1)[-1].strip()
        elif line.lower().startswith("score:"):
            score = line.split(":", 1)[-1].strip()

    return comment, score

In [77]:
# -------------------------
# Main Evaluation Loop
# -------------------------
results = []


for q, a in zip(questions, candidate_answers):
    ctx = retrieve_reranked_context(q, job_docs)
    comment, score = evaluate_with_flan(q, a, ctx)

    results.append({
        "question": q,
        "answer": a,
        "context": ctx,
        "comment": comment,
        "score": score     
    })


Decoded Output: 10
Decoded Output: 10
Decoded Output: 10


In [78]:

# -------------------------
# Save to JSON
# -------------------------

with open("interview_results.json", "w") as f:
    json.dump(results, f, indent=2)

print("✅ Interview evaluation complete. Results saved to 'interview_results.json'")


✅ Interview evaluation complete. Results saved to 'interview_results.json'


In [79]:
# -------- Display Results --------

for idx, res in enumerate(results, 1):
    print("=" * 60)
    print(f"🔢 Question {idx}: {res['question']}\n")
    print(f"📝 Candidate Answer:\n{res['answer'].strip()}\n")
    print(f"📄 Retrieved Context:\n{res['context'].strip()}\n")
    print(f"🧠 Evaluation Comment:\n{res['comment'].strip()}\n")
    print(f"⭐ Score: {res['score']}\n")
    print("=" * 60)

🔢 Question 1: Can you tell me about your experience with Python?

📝 Candidate Answer:
My experience with Python is quite extensive, spanning several years across various domains. 
    I've used Python for scripting, automation, and data analysis, leveraging libraries like Pandas, NumPy, and Matplotlib. 
    I also have experience with web frameworks like Flask and Django for building RESTful APIs. 
    My work often involves writing clean, maintainable code and adhering to best practices in software development. 
    Additionally, I am familiar with version control systems like Git and have contributed to open-source projects.

📄 Retrieved Context:
Python scripting, automation, data analysis using Pandas, NumPy, Matplotlib. Experience with machine learning frameworks like scikit-learn, TensorFlow, Keras.

🧠 Evaluation Comment:


⭐ Score: 

🔢 Question 2: Describe your experience with machine learning.

📝 Candidate Answer:
I’ve worked on multiple ML projects. One involved customer churn 