In [38]:
# importing all the essentials 

import os
import json
import re
import numpy as np
import faiss
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer

# Load env and fetching your api key
load_dotenv()
GROQ_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_KEY:
    print("Warning: GROQ_API_KEY not found in .env. Add it to call the LLM later.")


In [39]:
ROOT = ".."  # run notebook from project root 
M1 = os.path.join(ROOT, "Aayush_milestone_1")

# Paths for FAISS , metadata and logs 

INDEX_PATH = os.path.join(M1, "outputs", "visa_index.faiss")
METADATA_PATH = os.path.join(M1, "outputs", "visa_metadata.json")

LOG_PATH = os.path.join(ROOT, "Aayush_milestone_2", "decision_history.jsonl")

EMBED_MODEL = "all-MiniLM-L6-v2"
LLM_MODEL = "llama-3.1-8b-instant"

# shows the paths on screen 

print("Using paths:")
print(" INDEX:", INDEX_PATH)
print(" META:", METADATA_PATH)
print(" LOGS:", LOG_PATH)

Using paths:
 INDEX: ..\Aayush_milestone_1\outputs\visa_index.faiss
 META: ..\Aayush_milestone_1\outputs\visa_metadata.json
 LOGS: ..\Aayush_milestone_2\decision_history.jsonl


In [40]:
embedder = SentenceTransformer(EMBED_MODEL)

def embed_text(text):
    v = embedder.encode([text])[0].astype("float32")
    return v


In [41]:
def load_index(index_path=INDEX_PATH, meta_path=METADATA_PATH):
    if os.path.exists(index_path) and os.path.exists(meta_path):
        print("Loading existing FAISS index and metadata...")
        idx = faiss.read_index(index_path)
        with open(meta_path, "r", encoding="utf-8") as f:
            meta = json.load(f)
        return idx, meta

    raise FileNotFoundError(
        "FAISS index or metadata not found. "
        "Make sure visa_index.faiss and visa_metadata.json exist in Aayush_milestone_1/outputs/"
    )


In [42]:
def retrieve(index, meta, query, k=5):
    qv = embed_text(query)
    D, I = index.search(np.array([qv]), k)
    results = []
    for cid in I[0]:
        if cid == -1:
            continue
        if cid < 0 or cid >= len(meta):
            continue
        results.append(meta[cid])
    return results


In [43]:
def ask_groq(question, chunks, llm_model=LLM_MODEL):
    try:
        from groq import Groq
    except Exception as e:
        raise ImportError("groq package not installed. Install with `pip install groq`") from e

    client = Groq(api_key=os.getenv("GROQ_API_KEY"))
    ctx = "\n\n".join(c["text"] for c in chunks)
    prompt = f"""
You are a visa eligibility officer.
Answer ONLY using the PDF context provided.
Do NOT add any information that is not present in the context.

Question:
{question}

Context:
{ctx}

Return EXACTLY the following format:

Eligibility: Yes / No / Partial
Final Answer: (2–3 lines summary ONLY)
Explanation:
- Provide EXACTLY 3 to 5 bullet points.
- Each bullet MUST be unique.
- No repeating, rephrasing, or expanding the same idea.
Confidence: (0 to 1)
"""
    resp = client.chat.completions.create(
        model=llm_model,
        messages=[{"role":"user","content":prompt}],
        temperature=0.0
    )
    return resp.choices[0].message.content


In [44]:
def normalize_confidence(ans_text):
    m = re.search(r"Confidence:\s*([0-9]*\.?[0-9]+)", ans_text)
    try:
        confidence = float(m.group(1)) if m else None
    except:
        confidence = None
    lower = ans_text.lower()
    if "eligibility: yes" in lower:
        return min(0.9, confidence or 0.9)
    if "eligibility: no" in lower:
        return min(0.6, confidence or 0.6)
    return 0.3 if confidence is None else confidence

def overwrite_confidence(ans_text, conf):
    return re.sub(r"Confidence:\s*[0-9]*\.?[0-9]+", f"Confidence: {conf}", ans_text)


In [45]:
index, metadata = load_index()
print("Index loaded. Total chunks in metadata:", len(metadata))


Loading existing FAISS index and metadata...
Index loaded. Total chunks in metadata: 16


In [46]:
applicant_name = input("Enter applicant name: ").strip()
question = input(f"Hello {applicant_name}, what is your visa question? ").strip()

if not question:
    raise SystemExit("Please set a non-empty question string and re-run this cell.")

# Retrieve chunks
chunks = retrieve(index, metadata, question, k=5)
print(f"Retrieved {len(chunks)} chunks.")

# Safe preview check
if len(chunks) > 0:
    print("Preview (first chunk text snippet):\n")
    print(chunks[0]["text"][:800], "\n\n---\n")
else:
    print("No relevant chunks found. Your PDF doesn't contain this information.")


Retrieved 5 chunks.
Preview (first chunk text snippet):

CANADA — VISA / TEMPORARY RESIDENT (VISITOR) GUIDELINES (FULL TEXT) Purpose Guidance for applying for a Temporary Resident Visa (Visitor Visa) to Canada, covering who must apply, eligibility, documentation and procedural steps for visa officers and applicants. Eligibility Criteria Applicant must have a valid travel document (passport) and must be admissible to Canada. Applicant must satisfy the visa officer that they will leave Canada at the end of their authorized stay. Applicant must have sufficient funds to support themselves and any dependents during the visit (unless sponsored). Applicant must have no criminal inadmissibility issues and meet health requirements as applicable. Applicant must demonstrate ties to their home country (employment, family, property, studies) to establish int 

---



In [47]:
if GROQ_KEY:
    model_answer = ask_groq(question, chunks)
    conf = normalize_confidence(model_answer)
    model_answer_clean = overwrite_confidence(model_answer, conf)
    print("---- MODEL ANSWER ----\n")
    print(model_answer_clean)
else:
    print("GROQ_API_KEY not found. Skipping LLM call. You can still test retrieval output.")


AuthenticationError: Error code: 401 - {'error': {'message': 'Invalid API Key', 'type': 'invalid_request_error', 'code': 'invalid_api_key'}}

In [48]:
if 'model_answer_clean' in globals():
    os.makedirs(os.path.dirname(LOG_PATH) or ".", exist_ok=True)
    log_entry = {
        "question": question,
        "answer": model_answer_clean,
        "confidence": normalize_confidence(model_answer_clean)
    }
    with open(LOG_PATH, "a", encoding="utf-8") as f:
        f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
    print("Logged to", LOG_PATH)
else:
    print("No model answer to log. If you only tested retrieval, that's fine.")


No model answer to log. If you only tested retrieval, that's fine.
