<a href="https://colab.research.google.com/github/aishwarya-kumar/skillrec_for_gigworkers/blob/main/RAG3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
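# Uncomment on a fresh runtime (e.g. Colab) to install the packages this notebook imports:
# %pip install -q langchain langchain-community chromadb sentence-transformers transformers pypdf pdfplumber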

In [13]:
import chromadb
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from transformers import pipeline

In [15]:
# Load the SentenceTransformer model for embeddings.
# `model` is a notebook-level global: get_embeddings() and
# retrieve_relevant_chunks() both call model.encode(), so documents and
# queries are embedded by the same model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize ChromaDB client.
# `client` is a notebook-level global used by build_chromadb_index() to
# delete and (re)create the collection.
client = chromadb.Client()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
def load_documents(path):
    """Load the PDF documents found in a directory.

    Args:
        path: Directory handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(path).load()`` returns (the loaded
        documents).
    """
    return PyPDFDirectoryLoader(path).load()

In [17]:
def preprocess_text(docs, chunk_size=1000, chunk_overlap=200):
    """Split loaded documents into overlapping text chunks.

    Args:
        docs: Documents to split (as returned by ``load_documents``).
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
        A one-line summary of the split is printed as a side effect.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,   # lengths are measured with len()
        add_start_index=True,  # ask the splitter to add a start index to each chunk
    )
    pieces = RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)
    print(f"Split {len(docs)} documents into {len(pieces)} chunks.")
    return pieces

In [18]:
def get_embeddings(documents):
    """Step 3: embed each document's text with the notebook-level ``model``.

    Args:
        documents: Iterable of objects exposing their text as ``.page_content``.

    Returns:
        The result of ``model.encode`` on the list of texts, in input order.
    """
    texts = [doc.page_content for doc in documents]
    return model.encode(texts)


In [46]:
def build_chromadb_index(documents, embeddings):
    """Step 4: (re)build the ``tech_jobs`` ChromaDB collection from scratch.

    Any existing collection with that name is dropped first, so repeated runs
    in the same kernel do not accumulate duplicate entries.

    Args:
        documents: Sequence of chunk objects exposing text as ``.page_content``.
        embeddings: One embedding per document, in the same order. A NumPy
            array or a list of vectors.

    Returns:
        The freshly created ChromaDB collection.

    Raises:
        ValueError: If ``documents`` is empty, or if the number of embeddings
            does not match the number of documents.
    """
    collection_name = "tech_jobs"

    # Validate BEFORE touching the store, so bad input cannot wipe a good index.
    documents_text = [doc.page_content for doc in documents]
    if not documents_text:
        raise ValueError("No valid text content found in documents.")
    if len(embeddings) != len(documents_text):
        raise ValueError(
            f"Got {len(embeddings)} embeddings for {len(documents_text)} documents."
        )

    # Look the collection up instead of catching a version-specific exception
    # class. list_collections() yields Collection objects or plain names
    # depending on the chromadb version, hence the getattr fallback.
    existing_names = {
        getattr(entry, "name", entry) for entry in client.list_collections()
    }
    if collection_name in existing_names:
        client.delete_collection(name=collection_name)
        print(f"Deleted existing collection '{collection_name}'.")
    else:
        print(f"No existing collection named '{collection_name}' found.")

    # Create a new collection
    collection = client.create_collection(name=collection_name)

    # Plain lists, consistent with the query side (retrieve_relevant_chunks
    # also converts its embedding with .tolist()).
    embedding_rows = embeddings.tolist() if hasattr(embeddings, "tolist") else embeddings

    # Add the documents and their embeddings to the collection
    collection.add(
        documents=documents_text,
        embeddings=embedding_rows,
        metadatas=[{"source": "pdf"} for _ in documents_text],
        ids=[str(i) for i in range(len(documents_text))],
    )
    print(f"Added {len(documents_text)} documents to ChromaDB collection.")

    return collection

In [20]:
def retrieve_relevant_chunks(query, collection, n_results=3):
    """Step 5: fetch the stored chunks most similar to ``query``.

    Args:
        query: Query text.
        collection: ChromaDB collection to search (e.g. the one returned by
            ``build_chromadb_index``).
        n_results: How many matches to request. Defaults to 3, the value that
            was previously hard-coded.

    Returns:
        The ``'documents'`` entry of the ChromaDB query result.
    """
    # Embed the query with the same notebook-level model used for the documents
    query_embedding = model.encode([query]).tolist()

    # Perform similarity search using ChromaDB
    query_result = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results,
    )

    return query_result['documents']


In [21]:
def query_refiner(conversation, query):
    """Rewrite ``query`` in light of the conversation so far, using Flan-T5.

    Args:
        conversation: Conversation context as a single string.
        query: The user's latest query.

    Returns:
        str: The ``generated_text`` of the model's first output for the
        refinement prompt.
    """
    # Any Hugging Face text2text model name could be substituted here
    model_name = "google/flan-t5-base"

    # Build the pipeline once and keep it on the function object: constructing
    # it on every call reloads the model each time a query is refined.
    refiner = getattr(query_refiner, "_refiner", None)
    if refiner is None:
        refiner = pipeline("text2text-generation", model=model_name)
        query_refiner._refiner = refiner

    prompt = f"Refine the following query based on the conversation context: {conversation}\nQuery: {query}\nRefined Query:"
    refined_query = refiner(prompt)[0]['generated_text']
    return refined_query

In [22]:
def generate_response(retrieved_chunks, query, conversation_history):
    """Step 6: answer ``query`` from the retrieved chunks and record the turn.

    Args:
        retrieved_chunks: Chunk texts, either a flat list of strings or a list
            of lists of strings (which is flattened). May be empty, in which
            case the context is the empty string.
        query: The user's query.
        conversation_history: List of transcript strings. It is mutated in
            place: a ``User:`` entry, a ``Context:`` entry and a ``Bot:`` entry
            are appended.

    Returns:
        tuple: ``(response, conversation_history)`` where ``response`` is the
        generated answer text and ``conversation_history`` is the same list
        object that was passed in.
    """
    # Check for emptiness before peeking at element 0: an empty retrieval
    # result would otherwise raise IndexError.
    if retrieved_chunks and isinstance(retrieved_chunks[0], list):
        retrieved_chunks = [item for sublist in retrieved_chunks for item in sublist]

    # Combine retrieved chunks to create context
    context = " ".join(retrieved_chunks)

    # Update the conversation history with user query and the context
    conversation_history.append(f"User: {query}")
    conversation_history.append(f"Context: {context}")

    # Refine the query based on the conversation history and user query
    refined_query = query_refiner("\n".join(conversation_history), query)

    # Build the generation pipeline once and keep it on the function object so
    # the model is not reloaded for every response.
    model_name = "google/flan-t5-base"
    generator = getattr(generate_response, "_generator", None)
    if generator is None:
        generator = pipeline("text2text-generation", model=model_name)
        generate_response._generator = generator

    # Generate the final response using the refined query and context
    prompt = f"Context: {context}\nRefined Query: {refined_query}\nAnswer:"
    response = generator(prompt)[0]['generated_text']

    # Update conversation history with the bot's response
    conversation_history.append(f"Bot: {response}")

    return response, conversation_history

In [None]:
# # Get the conversation string to maintain context (for query refinement)
# def get_conversation_string():
#     conversation_string = ""
#     for i in range(len(st.session_state['responses'])-1):
#         conversation_string += "Human: "+st.session_state['requests'][i] + "\n"
#         conversation_string += "Bot: "+ st.session_state['responses'][i+1] + "\n"
#     return conversation_string

In [23]:
def rag_pipeline(pdf_path, query):
    """Run the whole RAG flow once: load, chunk, embed, index, retrieve, answer.

    Args:
        pdf_path: Directory of PDFs passed to ``load_documents``.
        query: The user's question.

    Returns:
        tuple: ``(response, conversation_history)`` exactly as returned by
        ``generate_response``; the history starts empty on every call.
    """
    # Steps 1-2: read the PDFs and cut them into chunks
    chunks = preprocess_text(load_documents(pdf_path))

    # Steps 3-4: embed the chunks and store them in a fresh ChromaDB collection
    collection = build_chromadb_index(chunks, get_embeddings(chunks))

    # Step 5: look up the chunks closest to the query
    relevant_chunks = retrieve_relevant_chunks(query, collection)

    # Step 6: generate the answer, starting from an empty conversation history
    return generate_response(relevant_chunks, query, [])

In [41]:
# Inputs for the step-by-step run below (the same steps rag_pipeline() chains together).
pdf_path = "/content/"  # Replace with the path to your PDF directory
query = "What are the in-demand skills for tech jobs in 2024?"
# Starts empty; generate_response() appends the User/Context/Bot entries to it.
conversation_history = []

In [42]:
# Step 1: load the PDF documents found under pdf_path.
documents = load_documents(pdf_path)



In [43]:
# Step 2: split the loaded documents into chunks (default chunk_size=1000, chunk_overlap=200).
chunks = preprocess_text(documents)

Split 563 documents into 2009 chunks.


In [44]:
# Step 3: embed every chunk's text with the shared SentenceTransformer model.
document_embeddings = get_embeddings(chunks)

In [47]:
# Step 4: rebuild the "tech_jobs" collection from the chunks and their embeddings.
collection = build_chromadb_index(chunks, document_embeddings)

Deleted existing collection 'tech_jobs'.
Added 2009 documents to ChromaDB collection.


In [48]:
# Step 5: retrieve the stored chunks most similar to the query.
relevant_chunks = retrieve_relevant_chunks(query, collection)

In [49]:
# Step 6: generate the answer; conversation_history is appended to in place and returned.
response, conversation_history = generate_response(relevant_chunks, query, conversation_history)

In [50]:
# Show the generated answer, then the transcript one entry per line
# (the User, Context and Bot entries appended by generate_response).
print("Response:", response)
print("Updated Conversation History:")
for entry in conversation_history:
    print(entry)

Response: The Fastest-Growing Job Skills for 2024 The Job Skills of 2024 29
Updated Conversation History:
User: What are the in-demand skills for tech jobs in 2024?
Context: The Fastest-
Growing Job 
Skills for 2024 The Job Skills of 2024
1
The Job Skills 
of 2024
The Fastest-Growing Job Skills for 
Businesses, Governments, and  
Higher Education Institutions The Job Skills of 2024
29
Introduction   |   Business Skills   |   Data Science Skills   |   Tech Skills   |   Conclusion   |   Appendix
Regional Data: The Fastest-Growing Job Skills for 2024
BUSINESS SKILLS RANK
E-Commerce 1
Media Strategy & Planning 2
Customer Success 3
Search Engine Optimization 4
Marketing Management 5
DATA SCIENCE SKILLS RANK
Tableau Software 1
Power BI 2
R Programming 3
Data Model 4
MATLAB 5
TECH SKILLS RANK
System Security 1
React (Web Framework) 2
Systems Design 3
Interactive Design 4
Internet Of Things 5
APAC
Bot: The Fastest-Growing Job Skills for 2024 The Job Skills of 2024 29
