In [2]:
# ==============================================================================
# Step 1: Install Required Libraries (Adding a standard text splitter library)
# ==============================================================================
!pip install -q pypdf transformers sentence-transformers faiss-cpu accelerate langchain

# ==============================================================================
# Step 2: Import Libraries (Adding necessary imports for better chunking)
# ==============================================================================
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from pypdf import PdfReader
import io
# NEW IMPORT for better chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter

# ==============================================================================
# Step 3: Load and Extract Text from PDF (No Change - Original Function)
# ==============================================================================
def load_and_extract_text_from_pdf(file_path):
    """
    Extract the text of every page of a PDF and return it as one string.

    Page texts are concatenated in page order with no separator; a page
    whose extraction yields nothing contributes an empty string. If opening
    or reading the file raises, the error is printed and None is returned.
    """
    try:
        page_texts = [
            page.extract_text() or ""
            for page in PdfReader(file_path).pages
        ]
    except Exception as e:
        print(f"Error reading PDF file: {e}")
        return None
    return "".join(page_texts)

# Path of the resume PDF to index; a bare file name, so it is resolved against
# the current working directory.
pdf_file_path = 'Aayushman_Dash_Barclays_Updated_Resume.pdf'
# Reminder only: nothing here waits for an upload, so the file must already be
# in place by the time the PDF is loaded further down.
print("Please upload your PDF file to the Colab session storage.")

# ==============================================================================
# Step 4: Chunking and Embedding
# Split the text with RecursiveCharacterTextSplitter, then embed each chunk.
# ==============================================================================
# Load the resume text. The loader returns None on a read error and may return
# an empty string; both are falsy and take the first branch below.
knowledge_base_text = load_and_extract_text_from_pdf(pdf_file_path)

if not knowledge_base_text:
    # "Exiting" here only means the rest of the pipeline is skipped; no exit
    # call is made.
    print("Could not load knowledge base from PDF. Exiting.")
else:
    # Configure the text splitter used to cut the resume into chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        # Target chunk size, chosen to capture an entire resume section (e.g., a job entry).
        # NOTE(review): presumably measured in characters (the splitter's
        # default length measure) - confirm against the LangChain docs.
        chunk_size=512,
        # Overlap helps preserve context across chunk boundaries
        chunk_overlap=50,
        # Separator candidates: paragraph break, line break, space, empty string.
        separators=["\n\n", "\n", " ", ""]
    )

    # Create the chunks.
    # create_documents yields objects carrying the text in `page_content`;
    # only that text is kept, as a plain list of strings.
    chunks = [doc.page_content for doc in text_splitter.create_documents([knowledge_base_text])]

    if not chunks:
        print("PDF is empty or could not be parsed. Exiting.")
    else:
        print(f"Knowledge base split into {len(chunks)} **semantic** chunks.")

        # Embed every chunk; the same model is reused later to embed queries.
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        chunk_embeddings = embedding_model.encode(chunks, show_progress_bar=True)
        # Second axis of the embedding matrix is the vector width
        # (the recorded run printed 384).
        dimension = chunk_embeddings.shape[1]
        print(f"Embeddings created. Dimension: {dimension}")

        # ==============================================================================
        # Step 5: Build a FAISS Index for Fast Retrieval
        # ==============================================================================
        # One vector per chunk, added in chunk order after a cast to float32,
        # so a row position in the index is also a position in `chunks`.
        index = faiss.IndexFlatL2(dimension)
        index.add(np.array(chunk_embeddings).astype('float32'))
        print(f"FAISS index created with {index.ntotal} vectors.")

        # ==============================================================================
        # Step 6: Define the Generation Pipeline
        # ==============================================================================
        print("Loading a more capable language model. This may take a few minutes...")
        model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # NOTE(review): the recorded run logged "`torch_dtype` is deprecated!
        # Use `dtype` instead!" and "Device set to use cpu". Confirm the keyword
        # name against the installed transformers version, and whether bfloat16
        # is the right dtype when the model ends up on CPU.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.bfloat16
        )
        # Text-generation pipeline wrapping the already-loaded model/tokenizer.
        generator = pipeline(
            'text-generation',
            model=model,
            tokenizer=tokenizer
        )

        # ==============================================================================
        # Step 7: Define the Retrieval and Generation Pipeline (IMPROVED PROMPT/RETRIEVAL)
        # ==============================================================================
        def get_answer(query):
            """
            Answer a question about the resume via retrieval-augmented generation.

            Embeds `query`, looks up the nearest chunks in the FAISS `index`,
            prints them, wraps them in an instruction prompt and runs `generator`.

            Args:
                query: The user's question as a string.

            Returns:
                A tuple `(answer, retrieved_chunks)`: the generated answer with
                the prompt text removed, and the list of chunk strings that
                were supplied to the model as context.
            """
            # Never request more neighbours than the index holds. FAISS fills
            # missing results with label -1, and chunks[-1] would silently
            # return the last chunk as if it had been retrieved.
            top_k = min(5, index.ntotal)
            query_embedding = embedding_model.encode([query])
            _distances, neighbour_ids = index.search(
                np.array(query_embedding).astype('float32'), k=top_k
            )

            # Second line of defence against -1 "no result" labels.
            retrieved_chunks = [chunks[i] for i in neighbour_ids[0] if i >= 0]

            print("\n--- Retrieved Chunks (Context) ---")
            for i, chunk in enumerate(retrieved_chunks):
                print(f"Chunk {i+1}: {chunk}")

            # Augment the prompt with retrieved context
            context = " ".join(retrieved_chunks)

            # Formatting instruction appended after "Answer:"; kept in one
            # place because it is also scrubbed from the model output below.
            format_hint = "**(Provide a structured, clean, bulleted, or list format for the answer)**"

            prompt_template = (
                "You are an expert resume analyst. Your task is to extract and summarize information "
                "from the provided context based *only* on the user's question. "
                "Do not include any information not present in the context. "
                "If the answer is not in the context, your ONLY response must be: 'Information not found in the resume.'\n\n"
                f"Context: {context}\n\n"
                f"Question: {query}\n\n"
                f"Answer: {format_hint}"
            )

            # Generation parameters are unchanged from the original.
            response = generator(
                prompt_template,
                max_new_tokens=512,  # room for longer, list-style answers
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
            )

            full_output = response[0]['generated_text']

            # By default the pipeline returns the prompt followed by the
            # completion. Strip the exact prompt instead of splitting on the
            # first "Answer:": that marker can also occur in the resume text,
            # in the question, or in the completion itself, which made the
            # old split pick the wrong segment or cut the answer short.
            if full_output.startswith(prompt_template):
                answer = full_output[len(prompt_template):]
            elif "Answer:" in full_output:
                # Fallback when the prompt is not echoed verbatim: keep
                # everything after the first marker.
                answer = full_output.split("Answer:", 1)[1]
            else:
                answer = full_output

            # Remove the formatting instruction in case the model repeats it.
            answer = answer.replace(format_hint, "").strip()

            return answer, retrieved_chunks

        # ==============================================================================
        # Step 8: Interact with Your Learning Assistant (No Change)
        # ==============================================================================
        if __name__ == "__main__":
            # Interactive question/answer loop. Runs until the user types
            # 'exit' or the input stream ends.
            print("\nProject setup complete. Your Resume RAG Assistant is ready.")
            while True:
                try:
                    user_question = input("\nAsk your learning assistant a question (or type 'exit' to quit): \n")
                except (EOFError, KeyboardInterrupt):
                    # Closed stdin or an interrupted prompt: leave the loop
                    # quietly instead of surfacing a traceback.
                    break

                # Ignore surrounding whitespace so that e.g. "exit " still quits.
                user_question = user_question.strip()
                if user_question.lower() == 'exit':
                    break
                if not user_question:
                    # A blank line would otherwise trigger a full retrieval
                    # and generation pass on an empty query.
                    continue

                print("\nSearching your knowledge base...")
                try:
                    answer, sources = get_answer(user_question)
                except Exception as e:
                    # Top-level boundary: report the failure and keep the
                    # session alive for the next question.
                    print(f"An error occurred: {e}")
                    print("Please try again with a different question or check your setup.")
                else:
                    print("\n--- Final Answer ---")
                    print(answer)

                    print("\n--- Sources Used ---")
                    for source in sources:
                        print(f"- {source}")

Please upload your PDF file to the Colab session storage.
Knowledge base split into 9 **semantic** chunks.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings created. Dimension: 384
FAISS index created with 9 vectors.
Loading a more capable language model. This may take a few minutes...


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Device set to use cpu



Project setup complete. Your Resume RAG Assistant is ready.

Searching your knowledge base...

--- Retrieved Chunks (Context) ---
Chunk 1: Aayushman Dash
♂phone8328932885 /envel⌢peaayushmandash995@gmail.com /linkedinLinkedIn /githubGitHub /codeCodolio
Education
Bhartiya Vidya Bhavan’s Sardar Patel Institute of Technology, Mumbai Nov 2022 – Present
Bachelor of Technology in Computer Science and Engineering (AI and ML) CGPA: 8.62/10
Airforce School ChandanNagar, Pune 2022
XII CBSE, Pune, Maharashtra Percentage: 94.60%
Airforce Golden Jubilee Institute, New Delhi 2020
X CBSE, New Delhi Percentage: 95.80%
Work Experience
Chunk 2: Work Experience
NDNYA Business Solutions Pvt. Ltd. (Faroce) ὑ7 July 2024 – Oct 2024
Research Analyst Mumbai (Onsite)
• Built a machine learning-based Consultant Recommendation System , increasing match accuracy by 15% and
enhancing relevance across 30+ expert-client interactions
• Utilized React.js, Node.js, Express.js, and Python to develop APIs, manage consulta