In [11]:
import os
import pickle
import time
from pdfminer.high_level import extract_text
from langchain_groq import ChatGroq
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import nltk
from google.colab import files

# Ensure NLTK resources are available for text processing
nltk.download('punkt')

# Read the Groq API key from the environment instead of hard-coding it.
# A key embedded in source is exposed to everyone who can read the file
# (or its history) and must be treated as compromised: revoke and rotate it.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise RuntimeError(
        "GROQ_API_KEY is not set. Define it in the environment "
        "(e.g. os.environ['GROQ_API_KEY'] = ...) before running this script."
    )

# Initialize LLM (Groq-based model)
llm = ChatGroq(
    temperature=0,
    groq_api_key=GROQ_API_KEY,
    model_name="llama-3.1-70b-versatile",
)

# Path for saving the FAISS index.
# NOTE: despite "openai" in the file name, the index is built with the
# HuggingFace embeddings configured in process_pdfs.
FAISS_INDEX_PATH = "faiss_store_openai.pkl"

def preprocess_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is dropped as well. Non-whitespace
    characters are left untouched.
    """
    # str.split() with no arguments splits on any whitespace run and
    # discards empty pieces, so re-joining normalizes the spacing.
    words = text.split()
    normalized = ' '.join(words)
    return normalized

def process_pdfs(uploaded_files):
    """Extract text from the given PDFs and persist a FAISS index of it.

    Args:
        uploaded_files: Mapping whose keys are paths of PDF files on disk.
            Only the keys are used; the values are ignored.

    Raises:
        ValueError: If no text could be extracted from any of the files
            (all extractions failed, or the PDFs contain no text layer),
            since an index cannot be built from nothing.

    Side effects:
        Writes the pickled vector store to ``FAISS_INDEX_PATH`` and prints
        progress messages.
    """
    texts = []

    for uploaded_file in uploaded_files:
        print(f"Processing file: {uploaded_file}")
        try:
            extracted_text = extract_text(uploaded_file)
            texts.append(preprocess_text(extracted_text))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Failed to process {uploaded_file}: {e}")

    # Each document is terminated by a newline, exactly as before, but the
    # string is assembled in one pass instead of repeated concatenation.
    all_text = "".join(f"{text}\n" for text in texts)

    # Fail early with a clear message; otherwise the vector store would be
    # asked to index zero chunks and fail with an obscure internal error.
    if not all_text.strip():
        raise ValueError(
            "No text could be extracted from the uploaded files; "
            "cannot build a FAISS index."
        )

    # Split text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    text_chunks = text_splitter.split_text(all_text)

    # Create embeddings and vector store
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_texts(text_chunks, embeddings)

    # Save the FAISS index.
    # NOTE(review): pickling the store depends on the installed
    # langchain/faiss versions supporting it; the library's own
    # save_local/load_local API may be more portable. If switching, change
    # the reader in ask_questions at the same time.
    with open(FAISS_INDEX_PATH, "wb") as f:
        pickle.dump(vectorstore, f)

    print("Text extraction and FAISS index creation completed successfully.")

def ask_questions():
    """Run an interactive question/answer loop over the saved FAISS index.

    Loads the vector store pickled at ``FAISS_INDEX_PATH``, builds a
    ``RetrievalQA`` chain on top of it using the module-level ``llm``, and
    answers questions read from standard input until the user types
    ``exit``, input ends, or the user interrupts the prompt.

    Returns ``None``. If the index file does not exist, prints a message
    and returns without prompting.
    """
    if not os.path.exists(FAISS_INDEX_PATH):
        print("FAISS index not found. Please process the PDFs first.")
        return

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load an index that this script itself wrote; never point
    # FAISS_INDEX_PATH at a file obtained from an untrusted source.
    with open(FAISS_INDEX_PATH, "rb") as f:
        vectorstore = pickle.load(f)

    retriever = vectorstore.as_retriever()
    qa_chain = RetrievalQA.from_llm(llm=llm, retriever=retriever)

    while True:
        try:
            query = input("\nAsk a Question (or type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt means the user is done;
            # leave the loop cleanly instead of surfacing a traceback.
            print("\nExiting the query system.")
            break

        if query.lower() == 'exit':
            print("Exiting the query system.")
            break

        if not query:
            print("Please enter a valid query.")
            continue

        try:
            response = qa_chain.run(query)
            print("\nAnswer:")
            print(response)
        except Exception as e:
            # Keep the session alive: a failed request (network, rate limit,
            # model error) should not end the whole interactive loop.
            print(f"Error during query processing: {e}")

if __name__ == "__main__":
    # Script entry point: ask for PDFs through the Colab upload helper,
    # index them, then start the interactive question loop.
    print("Upload your PDF files.")
    uploaded_files = files.upload()

    # An empty (falsy) upload result means there is nothing to index.
    if not uploaded_files:
        print("No files were uploaded. Exiting.")
    else:
        process_pdfs(uploaded_files)
        ask_questions()


Upload your PDF files.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Saving resume (1).pdf to resume (1) (2).pdf
Processing file: resume (1) (2).pdf
Text extraction and FAISS index creation completed successfully.

Ask a Question (or type 'exit' to quit): what is my name?

Answer:
Your name is Annapureddy Titoo Reddy.

Ask a Question (or type 'exit' to quit): what are my technical skills?

Answer:
According to the provided context, your technical skills include:

1. Languages: 
   - Java
   - Python
   - SQL
   - C
   - HTML
   - CSS
   - JavaScript

2. Development Tools: 
   - VS Code
   - Github

Ask a Question (or type 'exit' to quit): give my eamil?

Answer:
Your email address is annapureddytitoreddy@gmail.com.

Ask a Question (or type 'exit' to quit): give my linkedin profile?

Answer:
Your LinkedIn profile is: linkedin.com/Titoreddy07

Ask a Question (or type 'exit' to quit): exit
Exiting the query system.
