<a href="https://colab.research.google.com/github/aishwarya-kumar/skillrec_for_gigworkers/blob/main/RAG3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# !pip install langchain-community chromadb pypdf pdfplumber
# !pip install pypdf

In [4]:
import os
import re
import pdfplumber
import chromadb
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline, AutoTokenizer
import ollama

In [5]:
# model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('all-MiniLM-L12-v2')
client = chromadb.Client()
generator = pipeline("text2text-generation", model="google/flan-t5-large")
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-large')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
def load_documents(path):
    documents = []
    print(f"Starting to load PDFs from directory: {path}")

    # Loop through all files in the directory and process PDFs
    for file in os.listdir(path):
        if file.endswith(".pdf"):
            pdf_path = os.path.join(path, file)
            print(f"Processing file: {pdf_path}")

            with pdfplumber.open(pdf_path) as pdf:
                text = ""

                # Extract text from each page of the PDF
                for i, page in enumerate(pdf.pages):
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text
                    else:
                        print(f"Warning: No text found on page {i+1} of {file}")

                # If there's any text extracted, add it to the documents list
                if text.strip():
                    documents.append({"page_content": text, "metadata": {"source": file}})
                    print(f"Extracted text from {file}")
                else:
                    print(f"Warning: No text extracted from {file}. Skipping.")

    # Check if any documents were loaded
    if not documents:
        raise ValueError("No content extracted from any PDF in the directory.")

    print(f"Loaded {len(documents)} documents successfully from {path}.")
    return documents

In [7]:
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    return text.strip()

In [21]:
def preprocess_text(docs, chunk_size, chunk_overlap, max_tokens=512):
    all_chunks = []

    # Helper function to chunk text with overlap
    def chunk_text(text, chunk_size, chunk_overlap):
        tokens = tokenizer.encode(text, truncation=False)
        chunks = []

        # Loop through tokens, creating chunks with overlap
        for i in range(0, len(tokens), chunk_size - chunk_overlap):
            chunk = tokens[i:i + chunk_size]  # Get the chunk based on chunk_size
            chunks.append(tokenizer.decode(chunk))  # Decode the chunk back to text

        return chunks

    for doc in docs:
        # Clean the text
        cleaned_content = clean_text(doc['page_content'])

        # Split the cleaned content into chunks with overlap
        chunks = chunk_text(cleaned_content, chunk_size, chunk_overlap)

        # Add the chunks to the all_chunks list with metadata
        for chunk in chunks:
            all_chunks.append({"page_content": chunk, "metadata": doc['metadata']})

    print(f"Split {len(docs)} documents into {len(all_chunks)} chunks.")
    return all_chunks

In [9]:
def get_embeddings(chunks):
    print("Starting to generate embeddings for the chunks...")

    # Check if there are any chunks
    if not chunks:
        print("No chunks found! Exiting.")
        return []

    # Extracting page content from chunks and generating embeddings
    page_contents = [chunk['page_content'] for chunk in chunks]
    print(f"Extracted {len(page_contents)} page contents from the chunks.")

    # Generate embeddings using the model
    try:
        print("Generating embeddings using the SentenceTransformer model...")
        embeddings = model.encode(page_contents)
        print(f"Generated embeddings for {len(page_contents)} chunks.")
    except Exception as e:
        print(f"Error generating embeddings: {e}")
        return []

    return embeddings

In [10]:
def build_chromadb_index(documents, embeddings):
    collection_name = "tech_jobs"

    # Get a list of all collections
    collections = client.list_collections()  # Get a list of all collections
    collection_names = [collection.name for collection in collections]

    # Check if the collection exists and delete it if it does
    if collection_name in collection_names:
      client.delete_collection(name=collection_name)
      print(f"Deleted existing collection '{collection_name}'.")
    else:
      print(f"No existing collection named '{collection_name}', proceeding to create a new one.")

    # Create a new collection
    collection = client.create_collection(name=collection_name)
    print(f"Created a new collection '{collection_name}'.")

    # Ensure there is content before adding to ChromaDB
    documents_text = [doc['page_content'] for doc in documents]  # Access 'page_content' correctly
    if not documents_text:
        raise ValueError("No valid text content found in documents.")

    # Add the documents and their embeddings to the collection
    collection.add(
        documents=documents_text,
        embeddings=embeddings,
        # metadatas=[{"source": "pdf"}] * len(documents),
        ids=[str(i) for i in range(len(documents))]
    )
    print(f"Added {len(documents)} documents to ChromaDB collection.")

    return collection


In [11]:
def retrieve_relevant_chunks(query, collection):
    query_embedding = model.encode([query]).tolist()
    query_result = collection.query(query_embeddings=query_embedding, n_results=3)
    return query_result['documents']

In [12]:
# def query_refiner(conversation, query):
#     prompt = f"Refine the following query based on the conversation context (focusing on career advice and skills for gig workers):\n{conversation}\nQuery: {query}\nRefined Query:"
#     return pipeline("text2text-generation", model="google/flan-t5-base")(prompt)[0]['generated_text']

def query_refiner(query):
    prompt = f"Refine the following query based on career advice and skills for gig workers:\nQuery: {query}\nRefined Query:"
    refined_query = pipeline("text2text-generation", model="google/flan-t5-base")(prompt)[0]['generated_text']
    # refined_query = ollama.chat(model="llama2", messages=[{"role": "user", "content": prompt}])['text']
    return refined_query

In [13]:
# def generate_response(retrieved_chunks, query):
#     # Flatten the retrieved chunks (if it's a list of lists) into a single list of strings
#     if isinstance(retrieved_chunks[0], list):
#         retrieved_chunks = [item for sublist in retrieved_chunks for item in sublist]

#     # Combine retrieved chunks to create context
#     context = " ".join(retrieved_chunks)

#     # Refine the query based on the user query (no conversation history)
#     refined_query = query_refiner(query)

#     # Structured prompt with required format
#     prompt = f"""
#     Based on the provided context, structure your response as follows:

#     Here are the top 3 job roles/careers for gig workers or freelancers:
#     1. [Role]
#     2. [Role]
#     3. [Role]

#     Here are the top 5 skills needed for each:
#     1. [Role]: top 5 skills:
#        a. [Skill]
#        b. [Skill]
#        c. [Skill]
#        d. [Skill]
#        e. [Skill]

#     Context: {context}
#     Refined Query: {refined_query}
#     Answer:
#     """
#     response = generator(prompt, max_length=500, temperature=0.3)[0]['generated_text']

#     return response

In [45]:
def generate_response(retrieved_chunks, query):
    # Flatten the retrieved chunks (if it's a list of lists) into a single list of strings
    if isinstance(retrieved_chunks[0], list):
        retrieved_chunks = [item for sublist in retrieved_chunks for item in sublist]

    # Combine retrieved chunks to create context
    context = " ".join(retrieved_chunks)

    # Refine the query based on the user query (no conversation history)
    # refined_query = query_refiner(query)
    # print("Refined Query: \n")
    # print(refined_query)

    # Structured prompt with clear instructions for the model
    prompt = f"""
    You need identify the jobs and required job skills that are most trending. Use the context below to find the most important job roles and skills.
    Context: {context}
    Query: {query}

    Please ensure your answer is based on the information provided in the context and refined query. Provide detailed and accurate job roles and skills.
    """
    response = generator(prompt, max_length=500, temperature=0.3)[0]['generated_text']
    # response = ollama.chat(model="llama2", messages=[{"role": "user", "content": prompt}])['text']

    return response

In [14]:
def rag_pipeline(pdf_path, query):
    conversation_history = []

    documents = load_documents(pdf_path)
    chunks = preprocess_text(documents)
    document_embeddings = get_embeddings(chunks)
    collection = build_chromadb_index(chunks, document_embeddings)
    relevant_chunks = retrieve_relevant_chunks(query, collection)
    response = generate_response(relevant_chunks, query, conversation_history)

    return response

In [15]:
from datetime import datetime
# Get the current year and month
current_year = datetime.now().year
current_month = datetime.now().strftime("%B")

pdf_path = "/content/"  # Replace with the path to your PDF directory


In [16]:
# Construct the query dynamically
query_prompt = f"""
Identify the top 3 job roles or careers for gig workers or freelancers in {current_month} {current_year}.
For each role, provide a list of the top 5 most in-demand skills required to succeed.
"""

In [17]:
documents = load_documents(pdf_path)

Starting to load PDFs from directory: /content/
Processing file: /content/SSiTechJobsJune2024One-Pager.pdf
Extracted text from SSiTechJobsJune2024One-Pager.pdf
Processing file: /content/The-Job-Skills-of-2024-Report.pdf
Extracted text from The-Job-Skills-of-2024-Report.pdf
Processing file: /content/WEF_Future_of_Jobs_2023.pdf
Extracted text from WEF_Future_of_Jobs_2023.pdf
Processing file: /content/2024_Work_Trend_Index_Annual_Report_6_7_24_666b2e2fafceb.pdf
Extracted text from 2024_Work_Trend_Index_Annual_Report_6_7_24_666b2e2fafceb.pdf
Processing file: /content/GSR_2024_-_NAMER_English.pdf
Extracted text from GSR_2024_-_NAMER_English.pdf
Processing file: /content/comptia-state-of-the-tech-workforce-2024.pdf
Extracted text from comptia-state-of-the-tech-workforce-2024.pdf
Loaded 6 documents successfully from /content/.


In [18]:
documents[0]

{'page_content': 'Tech Employment\nTrends\nJune 2024 6-month trends\nSnapshot: Tech Employment Top Labor Categories (SSi)\nWhere did SSi place consultants YTD 2024?\n• Current tech unemployment rate dropped to\n2.5% vs. 4% national rate.\nAML Project 19.05%\n• Who’s in demand? AI skills were highest in\ndemand across 12% of postings. BLS notes\nBusiness Professional 13.23%\nthe fastest growing tech occupations over the\nnext decade are: Data Scientists, Infosec IT 10.05%\nAnalysts, Software Developers, Computer\nand Info Research Scientists. Aerospace 9.52%\nTechnical 7.41%\nState of Job Postings\n• Biggest jump in job postings: Data Scientists,\nEngineering 6.88%\nDatabase Admins, Software and Web\nDevelopers. Professional 5.82%\n• As of May, the total number of active job\npostings is 427,000.\nTop Jobs Filled (SSi)\nWhere Are the Tech Jobs?\n• AML Investigation Analyst\n• Integration Engineer\nAccording to CompTIA Monthly Tech Jobs Report\n• Scrum Master\n(May), the top States for T

In [24]:
chunks = preprocess_text(documents, chunk_size= 300, chunk_overlap= 100, max_tokens=512)

Split 6 documents into 2084 chunks.


In [25]:
chunks[0]

{'page_content': 'Tech Employment Trends June 2024 6-month trends Snapshot: Tech Employment Top Labor Categories (SSi) Where did SSi place consultants YTD 2024? • Current tech unemployment rate dropped to 2.5% vs. 4% national rate. AML Project 19.05% • Who’s in demand? AI skills were highest in demand across 12% of postings. BLS notes Business Professional 13.23% the fastest growing tech occupations over the next decade are: Data Scientists, Infosec IT 10.05% Analysts, Software Developers, Computer and Info Research Scientists. Aerospace 9.52% Technical 7.41% State of Job Postings • Biggest jump in job postings: Data Scientists, Engineering 6.88% Database Admins, Software and Web Developers. Professional 5.82% • As of May, the total number of active job postings is 427,000. Top Jobs Filled (SSi) Where Are the Tech Jobs? • AML Investigation Analyst • Integration Engineer According to CompTIA Monthly Tech Jobs Report • Scrum Master (May), the top States for Tech Job Postings are: Is Remo

In [26]:
document_embeddings = get_embeddings(chunks)

Starting to generate embeddings for the chunks...
Extracted 2084 page contents from the chunks.
Generating embeddings using the SentenceTransformer model...
Generated embeddings for 2084 chunks.


In [27]:
collection = build_chromadb_index(chunks, document_embeddings)

No existing collection named 'tech_jobs', proceeding to create a new one.
Created a new collection 'tech_jobs'.
Added 2084 documents to ChromaDB collection.


In [28]:
relevant_chunks = retrieve_relevant_chunks(query_prompt, collection)

In [29]:
relevant_chunks

[['Forum, Future of Jobs Survey 12. Talent outlook in 2027 8. Reskilling skill focus This bar chart shows the share of respondents that operate in the respective economy or region This bar chart shows the share of surveyed who expect their talent availability when hiring, companies that operate in the respective economy talent development of existing workforce, and or region that selects a particular level-3 skill in the talent retention of existing workforce to improve or Global Skills Taxonomy, based on responses to the worsen in five years, and their net effect of surveyed question, “Keeping in mind your current strategic companies that operate in the respective economy direction, please select the skill clusters on which or region, compared with the global average. It is you are focusing your organization’s reskilling and based on the responses to the question, “How upskilling efforts in the next five years”. would you rate talent availability, development and retention in your org

In [46]:
response = generate_response(relevant_chunks, query_prompt)

In [47]:
response

'Data Visualization 1 Linux 1 Media Strategy & Planning 2 Tableau Software 2 Software Security 2 Search Engine Optimization 3 Financial Analysis 3 Systems Design 3 Marketing Management 4 Knitr 4 Web Development Tools 4 Advertising 5 Microsoft Excel 5 System Security 5 The Job Skills of 2024 The Fastest-Growing Job Skills for Businesses, Governments, and Higher Education Institutions The Job Skills of 2024 The Fastest-Growing Job Skills for Businesses, Governments, and Higher Education Institutions The Job Skills of 2024 The Fastest-Growing Job Skills for Businesses, Governments, and Higher Education Institutions The Job Skills of 2024 The Fastest-Growing Job Skills for Businesses, Governments, and Higher Education Institutions The Job Skills of 2024 The Fastest-Growing Job Skills for Businesses, Governments, and Higher Education Institutions The Job Skills of 2024 The Fastest-Growing Job Skills for Businesses, Governments, and Higher Education Institutions The Job Skills of 2024 The Fa