In [None]:
!pip install PyPDF2 langchain faiss-cpu tiktoken transformers openai sentence-transformers


In [None]:
from google.colab import files

uploaded = files.upload()  # This will allow you to upload your PDFs
# The upload dialog can be dismissed without picking a file, which leaves the
# returned mapping empty; fail with a clear message instead of an IndexError.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select a PDF.")
pdf_path = next(iter(uploaded))  # Get the (first) uploaded file's name
print(f"Uploaded file: {pdf_path}")


In [None]:
from PyPDF2 import PdfReader

def extract_text_from_pdf(file_path):
    """Return the extracted text of every page in a PDF, in page order.

    Args:
        file_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        A list with one entry per page, each being whatever
        ``page.extract_text()`` produced for that page.
    """
    document = PdfReader(file_path)
    return [pdf_page.extract_text() for pdf_page in document.pages]

# Extract text from the uploaded PDF (pdf_path comes from the upload cell).
# pdf_text holds one entry per page and is the input to the chunking cell.
pdf_text = extract_text_from_pdf(pdf_path)
print(pdf_text[:2])  # Print text from the first two pages


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_text(text_list, chunk_size=500, chunk_overlap=50):
    """Split every text in ``text_list`` into chunks and return them as one flat list.

    Args:
        text_list: Iterable of strings (in this notebook, one per PDF page).
            Entries that are ``None`` or empty are skipped, since a page may
            yield no extractable text.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 50, the value previously hard-coded.

    Returns:
        A list of chunk strings, in the order the source texts were given.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = []
    for text in text_list:
        if not text:
            # Nothing to split; also avoids handing None to split_text().
            continue
        chunks.extend(text_splitter.split_text(text))
    return chunks

# Chunk the per-page text and report how many chunks came out; `chunks` is
# reused by the embedding and indexing cells further down.
chunks = chunk_text(pdf_text)
print(f"Total chunks created: {len(chunks)}")
print(chunks[:3])  # Preview the first three chunks


In [None]:
from sentence_transformers import SentenceTransformer

# Load the embedding model once at notebook level: embed_chunks() reads this
# global, and the retrieval cells pass it in explicitly to encode queries.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_chunks(chunks):
    """Encode text chunks with the notebook-level ``embedding_model``.

    Args:
        chunks: The texts to embed; handed to ``embedding_model.encode`` as-is.

    Returns:
        Whatever ``encode`` returns when called with ``convert_to_tensor=True``.
    """
    return embedding_model.encode(chunks, convert_to_tensor=True)

# Embed every chunk up front; the result is what the indexing cell stores.
chunk_embeddings = embed_chunks(chunks)


In [None]:
import faiss
import numpy as np

def store_embeddings(embeddings, chunks):
    """Build a flat L2 FAISS index over ``embeddings``.

    Args:
        embeddings: Two-dimensional collection of vectors, one row per chunk.
            May be a NumPy array, a nested sequence of numbers, or a tensor
            exposing ``detach().cpu().numpy()`` (as returned by
            ``encode(..., convert_to_tensor=True)``).
        chunks: The texts the rows correspond to; returned unchanged so the
            caller can keep them next to the index.

    Returns:
        A ``(index, chunks)`` tuple.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (for example when
            there were no chunks to embed).
    """
    # A tensor that lives on an accelerator cannot be read by NumPy directly;
    # move it to host memory first, mirroring what the retrieval code does.
    if hasattr(embeddings, "detach"):
        embeddings = embeddings.detach().cpu().numpy()
    # FAISS consumes C-contiguous float32 matrices.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dim); got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, chunks

# Index the chunk embeddings. stored_chunks is the chunk list handed back by
# store_embeddings, kept so that search hits can be mapped back to their text.
faiss_index, stored_chunks = store_embeddings(chunk_embeddings, chunks)


In [None]:
def retrieve_relevant_chunks(query, index, model, stored_chunks, top_k=5):
    """Return the stored chunks whose embeddings are nearest to ``query``.

    Args:
        query: The question text to search for.
        index: Search index exposing ``search(vectors, k)`` and returning
            ``(distances, indices)``.
        model: Embedding model exposing ``encode(texts, convert_to_tensor=True)``.
        stored_chunks: Chunk texts, positioned to match the ids in ``index``.
        top_k: Maximum number of chunks to return (default 5).

    Returns:
        Up to ``top_k`` chunk strings, in the order the index returned them.
        Fewer are returned when the index holds fewer vectors; an empty list
        is returned when there are no chunks or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks to return.
    k = min(top_k, len(stored_chunks))
    if k <= 0:
        return []
    query_embedding = model.encode([query], convert_to_tensor=True)
    _distances, indices = index.search(query_embedding.cpu().numpy(), k)
    # FAISS pads missing neighbours with -1; a negative Python index would
    # silently return the *last* chunk, so drop anything out of range.
    return [
        stored_chunks[idx]
        for idx in indices[0]
        if 0 <= idx < len(stored_chunks)
    ]

# Example query: fetch the stored chunks nearest to the question, using the
# function's default top_k. `query` and `relevant_chunks` feed the generation cell.
query = "What is the unemployment rate for people with a bachelor's degree?"
relevant_chunks = retrieve_relevant_chunks(query, faiss_index, embedding_model, stored_chunks)
print(f"Relevant Chunks: {relevant_chunks}")


In [None]:
import os
from getpass import getpass

import openai

# Never store a real key in the notebook itself: read it from the
# OPENAI_API_KEY environment variable, and fall back to a hidden interactive
# prompt when that variable is unset or empty.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")


In [None]:
def generate_response(query, context, model="gpt-3.5-turbo-instruct", max_tokens=300):
    """Ask an OpenAI completion model to answer ``query`` using ``context``.

    Works with both generations of the ``openai`` package: the 1.x module-level
    client (``openai.completions.create``) and the legacy 0.x interface
    (``openai.Completion.create``), which 1.x no longer supports.

    Args:
        query: The user's question.
        context: Background text the answer should be based on.
        model: Completion model name. The default replaces the retired
            ``text-davinci-003`` with its instruct-style successor.
        max_tokens: Upper bound on generated tokens (default 300).

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    prompt = f"Answer the question based on the context below:\n\nContext: {context}\n\nQuestion: {query}\n\nAnswer:"
    if hasattr(openai, "completions"):
        # openai>=1.0: module-level client, attribute-style response objects.
        response = openai.completions.create(
            model=model,
            prompt=prompt,
            max_tokens=max_tokens,
        )
        text = response.choices[0].text
    else:
        # Legacy openai<1.0: dict-style response.
        response = openai.Completion.create(
            model=model,
            prompt=prompt,
            max_tokens=max_tokens,
        )
        text = response["choices"][0]["text"]
    return text.strip()

# Join the retrieved chunks into a single newline-separated context string
# and ask the model to answer the earlier query from it.
context = "\n".join(relevant_chunks)
response = generate_response(query, context)
print(response)


In [None]:
def handle_comparison(query, relevant_chunks, keywords=("Bachelor's", "Master's")):
    """Keep only the chunks that mention at least one comparison keyword.

    This is a simple substring heuristic (case-sensitive, exact match), not a
    parser: a chunk is kept when any keyword occurs anywhere in it.

    Args:
        query: The comparison question. Currently unused; kept so the
            signature stays stable for callers.
        relevant_chunks: Iterable of chunk strings to filter.
        keywords: Substrings that mark a chunk as relevant. A single string is
            treated as one keyword. Defaults to the degree names that were
            previously hard-coded.

    Returns:
        The matching chunks joined with newlines, in their original order;
        an empty string when nothing matches.
    """
    if isinstance(keywords, str):
        # Avoid iterating a lone string character by character.
        keywords = (keywords,)
    comparison_data = [
        chunk
        for chunk in relevant_chunks
        if any(keyword in chunk for keyword in keywords)
    ]
    return "\n".join(comparison_data)

# Comparison demo: retrieve chunks for a comparison-style question, then keep
# only those that pass handle_comparison's keyword heuristic and print them.
comparison_query = "Compare unemployment rates for different degrees."
comparison_chunks = retrieve_relevant_chunks(comparison_query, faiss_index, embedding_model, stored_chunks)
comparison_response = handle_comparison(comparison_query, comparison_chunks)
print(comparison_response)
