In [1]:
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
from typing import List, Dict
from sklearn.metrics.pairwise import cosine_similarity
import whisper
import sounddevice as sd
import soundfile as sf
from pydub import AudioSegment
import io
import time
import speech_recognition as sr
from googletrans import Translator
from gtts import gTTS
import playsound
import os
import tempfile
import pygame
import chromadb
from chromadb.config import Settings
from transformers import AutoModelForCausalLM, AutoTokenizer
from difflib import SequenceMatcher
import json
import time
import re
from chromadb import Settings, Client



pygame 2.6.1 (SDL 2.28.4, Python 3.12.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# =======================
# Device Configuration
# =======================
import torch
# Detect whether CUDA is available.
# NOTE(review): the models loaded below are pinned to 'cpu' explicitly, so in
# the code visible in this notebook `device` does not decide model placement —
# confirm whether it is used elsewhere.
device = "cuda" if torch.cuda.is_available() else "cpu"

# =======================
# ChromaDB Setup
# =======================
# NOTE(review): whether Client(Settings(persist_directory=...)) actually
# persists to disk depends on the installed chromadb version — confirm.
chroma_client = chromadb.Client(Settings(
    persist_directory="./chroma_data"
))
# One collection per document type; both are created on first use.
jd_collection = chroma_client.get_or_create_collection("job_descriptions")
resume_collection = chroma_client.get_or_create_collection("resumes")

# Sentence-embedding model used for both job descriptions and resumes (CPU).
embedding_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2', device='cpu')

# =======================
# Qwen Setup
# =======================
# Load the chat model and its tokenizer. trust_remote_code=True lets the
# model repository's own Python code run locally, so only use trusted repos.
model_name = "Qwen/Qwen-1_8B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).eval()
# Load the Whisper "base" speech model.
# NOTE(review): the `listen` helper in this notebook uses speech_recognition's
# Google recognizer rather than `whisper_model` — confirm it is still needed.
whisper_model = whisper.load_model("base")

model = model.to('cpu')  # Keep the LLM on CPU regardless of the detected `device`

# Qwen-Chat (ChatML-style) template, assigned to the tokenizer below.
# NOTE(review): question generation in this notebook builds its prompt string
# by hand with the same <|im_start|>/<|im_end|> markers instead of calling
# the tokenizer's chat-template API — confirm the template is used elsewhere.
qwen_chat_template = """{% for message in messages %}
{% if message['role'] == 'user' %}<|im_start|>user
{{ message['content'] }}<|im_end|>
{% elif message['role'] == 'system' %}<|im_start|>system
{{ message['content'] }}<|im_end|>
{% elif message['role'] == 'assistant' %}<|im_start|>assistant
{{ message['content'] }}<|im_end|>
{% endif %}
{% endfor %}"""
tokenizer.chat_template = qwen_chat_template

# =======================
# Sample Job Descriptions (seed data for the ChromaDB JD collection)
# =======================
# Compact (id, title, text) rows; expanded below into the full JD records.
_SAMPLE_JD_ROWS = (
    ("jd1", "AC Technician",
     "Install and repair AC units. Must lift 20kg and work at heights. HVAC certification preferred."),
    ("jd2", "Construction Worker",
     "Perform construction tasks including carrying materials and operating tools. Must work outdoors in all weather."),
    ("jd3", "Electrician",
     "Install and maintain electrical systems. Must understand wiring diagrams and safety protocols. Certification required."),
    ("jd4", "Plumber",
     "Install and repair plumbing systems. Knowledge of PVC, copper piping required. Must have own tools."),
    ("jd5", "Carpenter",
     "Build and repair wooden structures. Must be proficient with power tools and read blueprints."),
    ("jd6", "Auto Mechanic",
     "Diagnose and repair vehicles. Must know engine systems and use diagnostic tools. ASE certification preferred."),
    ("jd7", "Welder",
     "Join metal components using various welding techniques. Must provide safety gear and pass skill test."),
    ("jd8", "Painter",
     "Prepare surfaces and apply coatings. Knowledge of color mixing and surface preparation required."),
    ("jd9", "Mason",
     "Build structures with bricks, concrete blocks. Must understand mortar mixing and structural principles."),
    ("jd10", "HVAC Technician",
     "Install and maintain heating/cooling systems. EPA certification required. Must understand refrigeration cycles."),
)

# Every record starts with its own empty question list and no cached embedding.
SAMPLE_JDs = [
    {
        "id": jd_id,
        "title": jd_title,
        "text": jd_text,
        "questions": [],
        "embedding": None,
    }
    for jd_id, jd_title, jd_text in _SAMPLE_JD_ROWS
]


def initialize_chroma_jds():
    """Seed the JD collection with SAMPLE_JDs when the collection is empty.

    Each sample is embedded from its 'text' field and stored with the whole
    JD record serialized as the JSON document. Does nothing if the collection
    already holds at least one entry.
    """
    if jd_collection.count() != 0:
        return

    for sample in SAMPLE_JDs:
        vector = embedding_model.encode(sample['text']).tolist()
        jd_collection.add(
            ids=[sample['id']],
            embeddings=[vector],
            documents=[json.dumps(sample)],
        )

# Seed the JD collection at cell execution time (the function only adds
# the samples when the collection is empty).
initialize_chroma_jds()

# Shared googletrans client used by the speech and interview helpers below.
translator = Translator()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
def compute_embedding(text: str) -> np.ndarray:
    """Encode `text` with the shared sentence-embedding model.

    Returns whatever `embedding_model.encode` produces for a single string;
    callers in this notebook call `.tolist()` on the result.
    """
    vector = embedding_model.encode(text)
    return vector

def add_resume(resume_text: str, lang: str = 'en') -> str:
    """Store a new resume in the resume collection and return its ID.

    Args:
        resume_text: Raw resume text; also the text that gets embedded.
        lang: Language code saved with the resume (defaults to 'en').

    Returns:
        The generated string ID, "resume_" followed by the current time in
        milliseconds.
    """
    vector = compute_embedding(resume_text).tolist()
    new_id = f"resume_{int(time.time() * 1000)}"
    # The whole record is kept as a JSON document; Q&A starts out empty.
    serialized = json.dumps({
        "id": new_id,
        "text": resume_text,
        "lang": lang,
        "qa": {},
    })
    resume_collection.add(ids=[new_id], embeddings=[vector], documents=[serialized])
    return new_id



def match_jds(resume_text: str, top_k: int = 5) -> List[tuple]:
    """Return the job descriptions closest to `resume_text`, best first.

    Args:
        resume_text: Resume text to embed and use as the query.
        top_k: Maximum number of matches to return.

    Returns:
        A list of `(jd_dict, score)` tuples sorted by descending score, where
        `score` is `1 - distance` for the distance ChromaDB reports. The list
        is empty when the JD collection is empty or `top_k` is not positive.

    NOTE(review): `1 - distance` is only a bounded similarity if the
    collection uses a cosine-style distance; the collection's metric is not
    set in this notebook — confirm before presenting the score as a percent.
    """
    # Never ask for more results than the collection holds (and skip the
    # query entirely when there is nothing to return).
    available = jd_collection.count()
    n_results = min(top_k, available)
    if n_results <= 0:
        return []

    query_embedding = compute_embedding(resume_text).tolist()
    results = jd_collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results,
        include=["documents", "distances"]
    )

    # A single query embedding was sent, so the first result row is ours.
    matches = [
        (json.loads(doc), 1 - distance)
        for doc, distance in zip(results['documents'][0], results['distances'][0])
    ]
    return sorted(matches, key=lambda match: match[1], reverse=True)

def speak(text: str, lang: str = 'en'):
    """Speak `text` aloud via gTTS + pygame, falling back to printing it.

    The speech is synthesized into a temporary MP3, played to completion and
    then removed. If synthesis or playback fails, the error and the text are
    printed instead. The temporary file and the mixer are cleaned up on both
    the success and the failure path.

    Args:
        text: Text to speak.
        lang: gTTS language code (defaults to 'en').
    """
    temp_path = None
    try:
        # delete=False: the file must outlive this `with` so gTTS can write
        # to it and pygame can read it; it is removed in `finally`.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as fp:
            temp_path = fp.name
        gTTS(text=text, lang=lang).save(temp_path)

        pygame.mixer.init()
        pygame.mixer.music.load(temp_path)
        pygame.mixer.music.play()
        clock = pygame.time.Clock()
        while pygame.mixer.music.get_busy():
            clock.tick(10)  # poll roughly ten times per second
    except Exception as e:
        print(f"\n[SYSTEM ERROR: TTS Failed - {str(e)}]")
        print(f"\n[SYSTEM]: {text}")
    finally:
        # Release the audio file before deleting it, then shut the mixer down.
        try:
            if pygame.mixer.get_init():
                pygame.mixer.music.unload()
                pygame.mixer.quit()
        except Exception as cleanup_error:
            print(f"\n[SYSTEM WARNING: audio cleanup failed - {cleanup_error}]")
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                print(f"\n[SYSTEM WARNING: could not remove {temp_path} - {cleanup_error}]")

def _transcribe_to_english(recognizer, audio_data):
    """Recognize `audio_data` as Hindi (translated to English) or English.

    Returns the English text, the untranslated Hindi text if translation
    fails, or None when Hindi recognition produced only whitespace.
    Recognition errors (`sr.UnknownValueError`, `sr.RequestError`) propagate
    to the caller.
    """
    try:
        hindi_text = recognizer.recognize_google(audio_data, language='hi-IN')
    except sr.UnknownValueError:
        # Fallback to English recognition
        english_text = recognizer.recognize_google(audio_data, language='en-US')
        print(f"[Recognized English]: {english_text}")
        return english_text

    print(f"[Recognized Hindi]: {hindi_text}")
    if not hindi_text.strip():
        return None

    try:
        english_text = translator.translate(hindi_text, src='hi', dest='en').text
    except Exception as e:
        # Keep the recognized answer rather than losing it to a translation
        # outage; the caller can still store or re-translate the Hindi text.
        print(f"[Translation failed]: {e}")
        return hindi_text
    print(f"[Translated English]: {english_text}")
    return english_text


def listen(timeout=5) -> str:
    """Capture speech and return English text (translating if needed).

    Falls back to a typed answer when nothing is heard within `timeout`
    seconds, the audio cannot be understood, the speech service or the
    microphone fails, or recognition yields an empty Hindi transcript.

    Args:
        timeout: Seconds to wait for speech to start before giving up.

    Returns:
        The recognized (and, for Hindi, translated) text, or the typed input.
    """
    recognizer = sr.Recognizer()
    try:
        with sr.Microphone() as source:
            print("\n[Listening... Speak now]")
            audio_data = recognizer.listen(source, timeout=timeout)
        recognized = _transcribe_to_english(recognizer, audio_data)
        if recognized is not None:
            return recognized
    except sr.WaitTimeoutError:
        # Raised by recognizer.listen when no speech starts within `timeout`.
        print("[No speech detected]")
    except sr.UnknownValueError:
        print("[Could not understand audio]")
    except sr.RequestError as e:
        print(f"[Google STT error]: {e}")
    except OSError as e:
        print(f"[Microphone error]: {e}")

    return input("Type your answer instead (in English or Hindi): ")
    
import json
import re

def _loads_lenient_array(candidate):
    """Parse one bracketed snippet, trying strict JSON before looser forms.

    Order matters: valid JSON is parsed untouched so apostrophes inside
    strings survive; only when that fails are trailing commas removed, a
    Python-style literal attempted, and finally single quotes swapped for
    double quotes. Returns None when every attempt fails.
    """
    import ast, json, re

    without_trailing_comma = re.sub(r',\s*\]', ']', candidate)
    attempts = (
        lambda: json.loads(candidate),
        lambda: json.loads(without_trailing_comma),
        lambda: ast.literal_eval(candidate),
        lambda: json.loads(without_trailing_comma.replace("'", '"')),
    )
    for attempt in attempts:
        try:
            return attempt()
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            continue
    return None


def robust_parse_json_list(raw_output):
    """Extract question strings from every list-like snippet in model output.

    Each `[...]` span (matched non-greedily, so nested brackets are not
    supported) is parsed leniently. Two shapes are accepted: a list of
    strings, or a list of dicts that each carry a 'question' key. Snippets
    that cannot be parsed or have another shape are skipped.

    Args:
        raw_output: Text produced by the model.

    Returns:
        The stripped question strings in order of appearance; an empty list
        if `raw_output` is not a string or contains no usable array.
    """
    import re

    if not isinstance(raw_output, str):
        return []

    questions = []
    for candidate in re.findall(r'\[.*?\]', raw_output, re.DOTALL):
        parsed = _loads_lenient_array(candidate)
        if not isinstance(parsed, list):
            continue
        if all(isinstance(item, dict) and 'question' in item for item in parsed):
            try:
                extracted = [item['question'].strip() for item in parsed]
            except AttributeError:
                # A non-string 'question' value: skip this array entirely.
                continue
            questions.extend(extracted)
        elif all(isinstance(item, str) for item in parsed):
            questions.extend(item.strip() for item in parsed)
    return questions


def extract_numbered_questions(text):
    """Pull up to three questions out of a "1. ..." style numbered list.

    Lines are stripped first; only lines that start with digits, a dot and
    whitespace count. Placeholder items such as "Question 1" are ignored.
    """
    numbered_prefix = re.compile(r'^\d+\.\s+')
    collected = []
    for raw_line in text.split('\n'):
        candidate = raw_line.strip()
        if numbered_prefix.match(candidate) is None:
            continue
        body = re.sub(r'^\d+\.\s*', '', candidate).strip()
        is_placeholder = re.match(r'Question \d+', body, re.IGNORECASE) is not None
        if body and not is_placeholder:
            collected.append(body)
    # Cap the result at three questions.
    return collected[:3]

def generate_questions_and_update_jd(jd: dict) -> list:
    """Generate up to three new interview questions for `jd` and persist them.

    The chat model is prompted for a JSON array of questions. Only the newly
    generated tokens are decoded, so the prompt's own example array and rule
    list are never mistaken for model output. The output is parsed as JSON
    lists first, then as a numbered list. Questions that are too short,
    mention "answer", are "Question N" placeholders, or duplicate an existing
    or already accepted question are dropped.

    Side effects: when at least one question survives, `jd['questions']` is
    extended in place and the JD is upserted into the JD collection.

    Args:
        jd: Job description dict with 'id', 'title', 'text' and optionally
            'questions'.

    Returns:
        The newly accepted questions (at most three); an empty list when
        nothing usable was generated or any step failed.
    """
    try:
        existing = list(jd.get('questions') or [])
        existing_lower = {q.lower().strip() for q in existing}

        # Prompt engineering to prevent answers
        system_prompt = (
            "You are an HR interview question generator. Follow these rules:\n"
            "1. Generate exactly 3 technical questions\n"
            "2. Output ONLY a JSON array of question strings\n"
            "3. NEVER include answers or explanations\n"
            "4. Example format: [\"Question 1?\", \"Question 2?\", \"Question 3?\"]"
        )
        user_prompt = (
            f"Job Title: {jd['title']}\n"
            f"Job Description: {jd['text']}\n"
            f"Existing Questions to Avoid: {existing}\n"
            "Generate NEW questions about:\n"
            "- Safety protocols\n"
            "- System maintenance\n"
            "- Certification requirements"
        )

        # Qwen-Chat format
        prompt = (
            "<|im_start|>system\n" + system_prompt + "<|im_end|>\n"
            "<|im_start|>user\n" + user_prompt + "<|im_end|>\n"
            "<|im_start|>assistant\n"
        )

        # Tokenize and generate
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            inputs.input_ids,
            max_new_tokens=200,
            temperature=0.7,
            repetition_penalty=1.5,
            # Bans only the first token that "answer" encodes to.
            bad_words_ids=[[tokenizer.encode("answer")[0]]],
            pad_token_id=tokenizer.eos_token_id
        )

        # The returned sequence starts with the prompt tokens; decode only
        # the continuation so prompt text is not parsed as model output.
        prompt_length = inputs.input_ids.shape[1]
        raw_output = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # Extract questions from both formats: JSON arrays, then numbered lists
        questions = robust_parse_json_list(raw_output) or extract_numbered_questions(raw_output)

        # Final validation and duplicate filtering
        seen = set(existing_lower)
        valid_questions = []
        for q in questions:
            key = q.lower().strip()
            if len(q) <= 10 or 'answer' in key or key in seen:
                continue
            if re.match(r'Question \d+\??', q.strip()):
                continue
            seen.add(key)
            valid_questions.append(q)
            if len(valid_questions) == 3:
                break

        # Update ChromaDB if new questions found
        if valid_questions:
            jd['questions'] = existing + valid_questions
            jd_collection.upsert(
                ids=[jd['id']],
                documents=[json.dumps(jd)],
                embeddings=[compute_embedding(jd['text']).tolist()]
            )
            return valid_questions

        return []
    except Exception as e:
        # Best effort: a failed generation must not abort the interview flow.
        print(f"Question generation failed: {str(e)}")
        return []



def conduct_interview(resume_id: str, jd: dict):
    """Run a spoken Q&A interview for `jd` and store the answers on the resume.

    The resume is looked up first so that a missing resume aborts before any
    questions are generated (generation updates the stored JD). Each question
    is spoken in the resume's language when translation succeeds, otherwise
    in English. Answers are stored in English where possible.

    Side effects: updates the resume's 'qa' entry for this JD (merging with
    answers from earlier interviews for the same JD), persists the resume,
    and appends prompt/completion pairs to train_en.jsonl.

    Args:
        resume_id: ID returned by `add_resume`.
        jd: Job description dict with at least 'id' and 'title'.

    Returns:
        Dict mapping each asked question to its stored answer; empty if the
        resume is missing or no questions were generated.
    """
    print(f"\n=== Interview for {jd['title']} ===")

    resume = get_resume(resume_id)
    if not resume:
        print(f"Resume with id {resume_id} not found.")
        return {}

    questions = generate_questions_and_update_jd(jd)
    if not questions:
        print("No questions generated for this interview.")
        return {}

    answers = {}
    lang = resume.get('lang') or 'en'

    for q in questions:
        print(f"\nQUESTION: {q}")
        # Ask the question (TTS or print)
        if lang != 'en':
            try:
                translated_q = translator.translate(q, src='en', dest=lang).text
                speak(translated_q, lang)
            except Exception:
                # Translation is best effort; ask in English instead.
                speak(q, 'en')
        else:
            speak(q, 'en')

        # Record answer (STT or input)
        raw_answer = listen()
        if raw_answer.strip():
            try:
                # Devanagari characters mean the answer is still in Hindi.
                if any('\u0900' <= char <= '\u097F' for char in raw_answer):
                    english_answer = translator.translate(raw_answer, src='hi', dest='en').text
                else:
                    english_answer = raw_answer
            except Exception:
                # Keep the untranslated answer rather than dropping it.
                english_answer = raw_answer
        else:
            english_answer = "No response"
        answers[q] = english_answer
        print(f"STORED ANSWER: {english_answer}")

    # Merge so answers from an earlier interview for this JD are kept.
    resume.setdefault('qa', {}).setdefault(jd['id'], {}).update(answers)
    update_resume(resume_id, resume)

    with open("train_en.jsonl", "a", encoding="utf-8") as f:
        for q, a in answers.items():
            f.write(json.dumps({"prompt": q, "completion": a}) + "\n")

    return answers



def display_resume(resume_id: str):
    """Print a stored resume followed by its interview Q&A, grouped by job.

    Job titles are looked up in the JD collection; when a JD cannot be found
    its ID is printed in place of the title.
    """
    resume = get_resume(resume_id)
    if not resume:
        print(f"Resume with id {resume_id} not found.")
        return

    print("\n=== COMPLETE RESUME ===")
    print(f"\nOriginal Resume Text:\n{resume['text']}")
    print("\n=== INTERVIEW ANSWERS ===")

    for jd_id, qa_pairs in resume['qa'].items():
        stored_docs = jd_collection.get(ids=[jd_id], include=["documents"])["documents"]
        # Fall back to the raw ID if the JD is no longer in the collection.
        job_title = json.loads(stored_docs[0])["title"] if stored_docs else jd_id

        print(f"\nFor Job: {job_title}")
        for question, answer in qa_pairs.items():
            print(f"\nQ: {question}\nA: {answer}")


def get_resume(resume_id: str):
    """Fetch one resume from the resume collection.

    Returns the resume decoded from its stored JSON document, or None when
    the collection returns no document for `resume_id`.
    """
    stored_docs = resume_collection.get(ids=[resume_id], include=["documents"])["documents"]
    return json.loads(stored_docs[0]) if stored_docs else None

def update_resume(resume_id: str, resume_data: dict):
    """Overwrite a stored resume's document, keeping its embedding if present.

    The embedding already stored for `resume_id` is reused when the
    collection returns one; otherwise a fresh embedding is computed from
    `resume_data["text"]`.
    """
    stored = resume_collection.get(ids=[resume_id], include=["embeddings"]).get("embeddings")

    if stored is None or len(stored) == 0:
        # Nothing stored for this ID: embed the resume text again.
        vector = compute_embedding(resume_data["text"]).tolist()
    else:
        first = stored[0]
        # Stored embeddings may come back as numpy arrays; pass plain lists on.
        vector = first.tolist() if isinstance(first, np.ndarray) else first

    resume_collection.update(
        ids=[resume_id],
        embeddings=[vector],
        documents=[json.dumps(resume_data)],
    )


def extract_questions_from_text(raw_output):
    """Return the text of every "N. ..." numbered line in `raw_output`.

    A line qualifies when, after stripping, it starts with digits, a dot and
    one whitespace character; everything after that first dot is kept.
    """
    stripped_lines = (line.strip() for line in raw_output.split('\n'))
    numbered_lines = (line for line in stripped_lines if re.match(r'^\d+\.\s', line))
    # The first '.' is the one closing the item number, so keep what follows.
    bodies = (line.partition('.')[2].strip() for line in numbered_lines)
    return [body for body in bodies if body]



In [11]:
if __name__ == "__main__":
    # End-to-end demo: store a resume, rank the job descriptions against it,
    # run a voice interview for the chosen job, then show what was stored.
    print("\n=== RESUME MATCHER WITH VOICE INTERVIEW ===")

    # 1. Add resume
    resume_text = input("\nEnter your resume text: ")
    lang = input("Preferred language (en/hi): ").strip().lower() or 'en'
    resume_id = add_resume(resume_text, lang)

    # 2. Match jobs using ChromaDB
    print("\nFinding best job matches...")
    matches = match_jds(resume_text)

    print("\nTOP JOB MATCHES:")
    for i, (jd, score) in enumerate(matches, 1):
        # NOTE(review): `score` is `1 - distance` from match_jds, so
        # `100 + score` is not a true percentage. A correct display depends
        # on the collection's distance metric — confirm and fix together
        # with match_jds / the collection setup.
        print(f"{i}. {jd['title']} (Score: {100 + score:.2f} %)")

    if matches:
        # 3. Conduct interview
        # Re-prompt until the choice is a number within the listed range,
        # so bad input cannot crash the cell or select the wrong job.
        selected = None
        while selected is None:
            choice = input(f"\nSelect job to interview for (1-{len(matches)}): ").strip()
            try:
                number = int(choice)
            except ValueError:
                number = 0
            if 1 <= number <= len(matches):
                selected = number - 1
            else:
                print(f"Please enter a number between 1 and {len(matches)}.")

        selected_jd = matches[selected][0]
        print(f"\nStarting interview for: {selected_jd['title']}")

        answers = conduct_interview(resume_id, selected_jd)

        # 4. Show complete resume
        display_resume(resume_id)

        # 5. Check the DB to confirm the resume's Q&A was updated
        updated_resume = resume_collection.get(ids=[resume_id], include=["documents"])
        print(json.loads(updated_resume["documents"][0])["qa"])
    else:
        print("\nNo job matches found.")



=== RESUME MATCHER WITH VOICE INTERVIEW ===



Enter your resume text:  A highly skilled and reliable AC Technician with [Number] years of experience in installing, maintaining, and repairing a wide range of air conditioning systems, including split units, central ACs, and VRF systems. Proven ability to diagnose and troubleshoot complex mechanical and electrical issues efficiently, ensuring optimal system performance and energy efficiency. Adept at performing routine maintenance, conducting safety checks, and providing excellent customer service while adhering to all relevant industry standards and safety regulations. Possesses a strong understanding of HVAC principles, refrigerant handling, and electrical wiring, coupled with a commitment to delivering quality workmanship and exceeding client expectations. EPA certified and proficient in using various diagnostic tools and equipment to ensure accurate and effective solutions.
Preferred language (en/hi):  hi



Finding best job matches...

TOP JOB MATCHES:
1. AC Technician (Score: 90.67 %)
2. Electrician (Score: 88.58 %)
3. Electrician (Score: 88.58 %)
4. HVAC Technician (Score: 87.64 %)
5. Plumber (Score: 85.73 %)



Select job to interview for (1-5):  1



Starting interview for: AC Technician

=== Interview for AC Technician ===

QUESTION: How often should the air filter be replaced in your unit?

[Listening... Speak now]
[Recognized Hindi]: पांच बार एयर फिल्टर बादल दल
[Translated English]: Five times air filter cloud party
STORED ANSWER: Five times air filter cloud party

=== COMPLETE RESUME ===

Original Resume Text:
A highly skilled and reliable AC Technician with [Number] years of experience in installing, maintaining, and repairing a wide range of air conditioning systems, including split units, central ACs, and VRF systems. Proven ability to diagnose and troubleshoot complex mechanical and electrical issues efficiently, ensuring optimal system performance and energy efficiency. Adept at performing routine maintenance, conducting safety checks, and providing excellent customer service while adhering to all relevant industry standards and safety regulations. Possesses a strong understanding of HVAC principles, refrigerant handling,