In [37]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import util



In [15]:
with open('data.json', 'r') as file:
    resume = json.load(file)
def create_documents_dynamic(resume):
    documents = []

    for key, value in resume.items():
        if isinstance(value, dict):  # Handle nested dictionary (e.g., contact information)
            section = f"{key.replace('_', ' ').title()}: " + ", ".join(
                f"{sub_key.title()}: {sub_value}" for sub_key, sub_value in value.items()
            )
            documents.append(section)

        elif isinstance(value, list):  # Handle lists (e.g., education, projects)
            for item in value:
                if isinstance(item, dict):  # Handle list of dictionaries
                    section = f"{key.replace('_', ' ').title()}:\n" + "\n".join(
                        f"{sub_key.replace('_', ' ').title()}: {sub_value}" for sub_key, sub_value in item.items()
                    )
                    documents.append(section)
                else:  # Handle plain list
                    documents.append(f"{key.replace('_', ' ').title()}: {', '.join(value)}")
        
        elif isinstance(value, str):  # Handle plain string fields
            documents.append(f"{key.replace('_', ' ').title()}: {value}")
        
        else:  # Handle unexpected formats
            documents.append(f"{key.replace('_', ' ').title()}: {value}")

    return documents

# Example Usage
documents = create_documents_dynamic(resume)
print(documents)

['Summary: A dedicated Electronics and Telecommunication professional with a strong background in machine learning, web development, and cloud platforms. Proven ability to lead projects, solve complex problems, and continuously learn new technologies to drive innovation and efficiency.', 'Contact: Name: Aneesh Patne, Email: aneeshpatne12@gmail.com, Linkedin: https://www.linkedin.com/in/aneeshpatne, Github: https://github.com/aneeshpatne, Leetcode: https://leetcode.com/aneeshpatne', 'Education:\nDegree: M.Tech in Electronics and Telecommunication\nInstitution: Veermata Jijabai Technological Institute\nLocation: Mumbai, Maharashtra\nDuration: 2023 - 2025\nDetails: Specialized in Machine Learning and Signal Processing. Relevant coursework includes Advanced Algorithms, Neural Networks, and Communication Systems.', 'Education:\nDegree: B.Tech in Electronics and Telecommunication\nInstitution: Thakur College of Engineering and Technology\nLocation: Mumbai, Maharashtra\nDuration: 2019 - 2023\

In [16]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2') 

In [17]:
document_embeddings = embedding_model.encode(documents, convert_to_tensor=False, show_progress_bar=True)

Batches: 100%|██████████| 1/1 [00:00<00:00, 12.92it/s]


In [18]:
document_embeddings = np.array(document_embeddings).astype('float32')

In [None]:
res = faiss.StandardGpuResources()
index_flat = faiss.IndexFlatL2(document_embeddings.shape[1])
gpu_index = faiss.index_cpu_to_gpu(res, 0, index_flat)
gpu_index.add(document_embeddings)


In [19]:
index = faiss.IndexFlatL2(document_embeddings.shape[1])  # Using L2 distance
index.add(document_embeddings)

In [20]:
faiss.write_index(index, 'resume_index.faiss')

In [21]:
with open('documents.txt', 'w') as f:
    for doc in documents:
        f.write(doc.replace('\n', ' ') + '\n')

In [22]:
def retrieve_relevant_documents(query, top_k=3):

    query_embedding = embedding_model.encode([query], convert_to_tensor=False)
    query_embedding = np.array(query_embedding).astype('float32')
    

    index = faiss.read_index('resume_index.faiss')
    
  
    distances, indices = index.search(query_embedding, top_k)
    

    with open('documents.txt', 'r') as f:
        all_documents = f.readlines()
    

    relevant_docs = [all_documents[idx].strip() for idx in indices[0]]
    return relevant_docs


In [23]:
model_path = "D:/Llama-3.2-1B-Instruct"  # Replace with your actual path


In [25]:
model = AutoModelForCausalLM.from_pretrained(
    model_path,
).to("cpu")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [48]:
def generate_response(query):
    # Retrieve relevant documents
    relevant_docs = retrieve_relevant_documents(query)
    
    if not relevant_docs:  # No relevant documents found
        return "I'm sorry, this information is not available in the resume."
    
    # Concatenate the documents
    context = "\n".join(relevant_docs)
    
    # Compute similarity
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    context_embedding = embedding_model.encode(context, convert_to_tensor=True)
    similarity = util.cos_sim(query_embedding, context_embedding).item()
    
    if similarity < 0.3:  # Low similarity threshold
        return f"I'm sorry, this information is not available in the resume. (Similarity: {similarity:.2f})"
    
    # Construct prompt
    prompt = (
        "You are an AI assistant answering questions about a person's resume. "
        "Respond only with information present in the context below. If the answer "
        "cannot be found, respond with 'I'm sorry, this information is not available in the resume.'\n\n"
        f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    )
    
    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True)
    
    # Generate the response
    outputs = model.generate(
        inputs.input_ids,
        max_length=500,
        do_sample=True,
        temperature=0.7
    )
    
    # Decode the response
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract the answer part
    answer = answer.split('Answer:')[-1].strip()
    
    return answer

# Example query
query = "Tell me something about the project Socio-Economic Impact of Pollution on Life Expectancy"
print(generate_response(query))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I'm sorry, this information is not available in the resume.


In [None]:
def generate_response(query):
    # Retrieve relevant documents
    relevant_docs = retrieve_relevant_documents(query)
    
    if not relevant_docs:  # No relevant documents found
        return "I'm sorry, this information is not available in the resume."
    
    # Concatenate the documents
    context = "\n".join(relevant_docs)
    
    # Compute similarity
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    context_embedding = embedding_model.encode(context, convert_to_tensor=True)
    similarity = util.cos_sim(query_embedding, context_embedding).item()
    
    if similarity < 0.3:  # Low similarity threshold
        return f"I'm sorry, this information is not available in the resume. (Similarity: {similarity:.2f})"
    
    # Construct prompt
    prompt = (
        "You are an AI assistant answering questions about a person's resume. "
        "Respond only with information present in the context below. If the answer "
        "cannot be found, respond with 'I'm sorry, this information is not available in the resume.'\n\n"
        f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    )
    
    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True)
    
    # Generate the response
    outputs = model.generate(
        inputs.input_ids,
        max_length=500,
        do_sample=True,
        temperature=0.7
    )
    
    # Decode the response
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract the answer part
    answer = answer.split('Answer:')[-1].strip()
    
    return answer

# Example query
query = "Tell me something about the project Socio-Economic Impact of Pollution on Life Expectancy"
print(generate_response(query))
