In [None]:
%%writefile .env
OPENAI_API_KEY="my_openai_api_key"
WEAVIATE_API_KEY="my_weaviate_api_key"

Overwriting .env


In [None]:
!pip install langchain openai weaviate-client tiktoken unstructured chromadb faiss-cpu pdfplumber pypdf python-dotenv PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [None]:
import os

import fitz  # PyMuPDF
import tiktoken
import weaviate
from dotenv import load_dotenv
from openai import OpenAI
from weaviate.classes.config import Configure, DataType, Property
from weaviate.classes.init import Auth
from weaviate.classes.query import Filter

# ✅ Load environment variables
load_dotenv()


def _require_env(var_name):
    """Return the value of environment variable `var_name`; raise if it is unset or empty."""
    value = os.getenv(var_name)
    if not value:
        raise RuntimeError(
            f"{var_name} is not set. Add it to your .env file or export it before running this cell."
        )
    return value


# Secrets are read from the environment only. No fallback literals: a credential
# embedded in a notebook is exposed to everyone the notebook is shared with.
# NOTE(review): the Weaviate key that used to be hardcoded here should be rotated.
OPENAI_API_KEY = _require_env("OPENAI_API_KEY")
WEAVIATE_API_KEY = _require_env("WEAVIATE_API_KEY")

# The cluster URL is not a secret, so the previous value remains the default;
# set WEAVIATE_URL in the environment to point at a different cluster.
WEAVIATE_URL = os.getenv(
    "WEAVIATE_URL",
    "https://urkn4zerdyxyoi5dok8sq.c0.asia-southeast1.gcp.weaviate.cloud",
)

# ✅ Connect to Weaviate
# The OpenAI key is forwarded as a header alongside the Weaviate credentials.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
    headers={"X-OpenAI-Api-Key": OPENAI_API_KEY}
)

# ✅ OpenAI client
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
# ✅ Create collection (if not exists)
def create_collection():
    """Ensure the ``internal_documents`` collection exists and return its name.

    The collection is created with three text properties (``topic``,
    ``chunk_id``, ``content``) and the ``text2vec_openai`` vectorizer config.
    If the collection is already present nothing is changed.

    Returns:
        str: the collection name, ``"internal_documents"``.
    """
    name = "internal_documents"

    # The server reports this collection as "Internal_documents" (first letter
    # capitalised), so testing the lowercase name for membership in list_all()
    # never matches and creation would be attempted on every call.
    # collections.exists() is the client's dedicated existence check.
    if client.collections.exists(name):
        print("✅ Collection already exists")
        return name

    client.collections.create(
        name=name,
        # Typed Property objects are the form the v4 client documents for schema definitions.
        properties=[
            Property(name="topic", data_type=DataType.TEXT, description="Topic name"),
            Property(name="chunk_id", data_type=DataType.TEXT, description="Chunk identifier"),
            Property(name="content", data_type=DataType.TEXT, description="Chunk content"),
        ],
        vectorizer_config=Configure.Vectorizer.text2vec_openai(),
    )
    print(f"✅ Created collection: {name}")
    return name


# ✅ Utilities for chunking & embedding
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of the PDF at `pdf_path`, joined by newlines.

    Args:
        pdf_path: path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: the per-page ``get_text("text")`` results joined with ``"\\n"``.
    """
    # The context manager closes the document (and its file handle) once the
    # text has been collected; the join consumes the generator before exit.
    with fitz.open(pdf_path) as doc:
        return "\n".join(page.get_text("text") for page in doc)

def chunk_text(text, max_tokens=800):
    """Split `text` into consecutive pieces of at most `max_tokens` tokens.

    The text is tokenised with the ``cl100k_base`` encoding, the token list is
    cut into back-to-back windows of `max_tokens`, and each window is decoded
    back to a string.

    Returns:
        list[str]: the decoded pieces, in document order.
    """
    tokenizer = tiktoken.get_encoding("cl100k_base")
    token_ids = tokenizer.encode(text)

    pieces = []
    for start in range(0, len(token_ids), max_tokens):
        window = token_ids[start:start + max_tokens]
        pieces.append(tokenizer.decode(window))
    return pieces

def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for `text` from the OpenAI embeddings endpoint.

    Args:
        text: the input to embed.
        model: embedding model name sent with the request.

    Returns:
        The ``embedding`` of the first item in the response's ``data`` list.
    """
    result = openai_client.embeddings.create(model=model, input=text)
    first_item = result.data[0]
    return first_item.embedding

# ✅ Store documents in Weaviate
def add_pdf_to_collection(pdf_path, topic, collection_name):
    """Extract, chunk, embed and insert one PDF into a Weaviate collection.

    Every chunk becomes one object with the properties ``topic``, ``chunk_id``
    (``"<topic>_<index>"``) and ``content``, stored together with the embedding
    returned by ``get_embedding``. Objects are always inserted; nothing here
    checks whether the same PDF was uploaded before.

    Args:
        pdf_path: path of the PDF to ingest.
        topic: topic label written to every chunk.
        collection_name: name of the target collection.
    """
    raw_text = extract_text_from_pdf(pdf_path)
    chunks = chunk_text(raw_text)
    target = client.collections.get(collection_name)

    for idx, piece in enumerate(chunks):
        vector = get_embedding(piece)
        record = {
            "topic": topic,
            "chunk_id": f"{topic}_{idx}",
            "content": piece,
        }
        target.data.insert(properties=record, vector=vector)

    print(f"✅ Uploaded {len(chunks)} chunks for topic '{topic}'")

# ✅ Search

def search_docs(query, topic_filter=None, collection_name="internal_documents", top_k=5):
    """Return up to `top_k` stored chunks most similar to `query`.

    Args:
        query: natural-language query; embedded with ``get_embedding``.
        topic_filter: when truthy, only chunks whose ``topic`` property equals
            this value are returned.
        collection_name: collection to search.
        top_k: maximum number of objects requested from Weaviate.

    Returns:
        list: the matching Weaviate result objects.
    """
    embedding = get_embedding(query)
    collection = client.collections.get(collection_name)

    if not topic_filter:
        return collection.query.near_vector(embedding, limit=top_k).objects

    # Apply the topic restriction inside the query. Filtering only after the
    # top_k cut-off discards on-topic chunks that fall outside the global
    # top_k and can leave the caller with few or no results.
    results = collection.query.near_vector(
        embedding,
        limit=top_k,
        filters=Filter.by_property("topic").equal(topic_filter),
    )

    # Keep the exact client-side comparison as well: equality on a text property
    # may be evaluated on tokens server-side (depends on the property's
    # tokenization setting), which could admit near-matches.
    return [r for r in results.objects if r.properties.get("topic") == topic_filter]

# ✅ Conversational Memory
# Module-level list of {"role": ..., "content": ...} chat messages that
# generate_answer() appends to and replays on each call. Re-running this
# assignment resets the conversation history.
chat_memory = []

In [None]:

def generate_answer(query, topic=None, collection_name="internal_documents"):
    """Answer `query` from retrieved document chunks, keeping conversational context.

    Retrieves chunks with ``search_docs``, sends them as context together with
    the accumulated ``chat_memory`` and the new question to the ``gpt-4o`` chat
    model, and records the exchange in ``chat_memory``.

    Args:
        query: the user's question.
        topic: optional topic passed to ``search_docs`` as ``topic_filter``.
        collection_name: collection to search.

    Returns:
        str: the model's answer, or ``"No relevant content found."`` when the
        search returns nothing (in which case ``chat_memory`` is not modified).
    """
    results = search_docs(query, topic_filter=topic, collection_name=collection_name)
    if not results:
        return "No relevant content found."

    context = "\n\n".join(r.properties["content"] for r in results)
    user_turn = {"role": "user", "content": query}

    # Build the request from a copy of the history plus the new turn, without
    # mutating chat_memory yet.
    messages = [
        {"role": "system", "content": "You are an internal knowledge assistant. Answer using the context only."},
        {"role": "system", "content": f"Context:\n{context}"},
        *chat_memory,
        user_turn,
    ]

    completion = openai_client.chat.completions.create(
        model="gpt-4o",
        messages=messages
    )
    answer = completion.choices[0].message.content

    # Record the exchange only after the completion succeeded. Appending the
    # user turn beforehand would leave an unanswered message in the shared
    # history whenever the API call raises, skewing every later request.
    chat_memory.extend([user_turn, {"role": "assistant", "content": answer}])
    return answer


In [None]:

# Ingestion inputs: the PDF to upload, the topic label stored on every chunk,
# and the collection the chunks are written to.
pdf_path = "/content/HR Policy Manual 2023 (8).pdf"
topic_name = "HR Manual"
collection_name = "internal_documents"
add_pdf_to_collection(pdf_path=pdf_path, topic=topic_name, collection_name=collection_name)

# Three questions asked in sequence; the later ones are follow-ups that rely
# on the conversation history kept by generate_answer().
q1 = "What is the leave policy?"
q2 = "Can you explain it in simpler terms?"
q3 = "What about maternity leave?"

print("Q1:", q1)
print("A1:", generate_answer(query=q1, topic=topic_name))

print("Q2:", q2)
print("A2:", generate_answer(query=q2, topic=topic_name))

print("Q3:", q3)
print("A3:", generate_answer(query=q3, topic=topic_name))

✅ Uploaded 129 chunks for topic 'HR Manual'
Q1: What is the leave policy?
A1: The leave policy of the Institute, as outlined in the HR Policy Manual, includes the following key types of leaves:

1. **Casual Leave (CL):**
   - Entitlement: 8 days per year.
   - Restrictions: No more than 5 days at a time, and cannot be combined with other leaves except special casual leave.
   - Carry Forward: Unused leave lapses at the year's end.

2. **Earned Leave (EL):**
   - Entitlement: 30 days per year, credited as 15 days on January 1 and July 1.
   - Restrictions: Up to 180 days can be availed at a time.
   - Accumulation: Up to 300 days.

3. **Half Pay Leave (HPL):**
   - Credited at 10 days biannually.
   - Usage: Can be availed for medical reasons.

4. **Maternity Leave:**
   - Entitlement: 180 days for pregnancy (if less than two children), 45 days for miscarriage.
   - Conditions: Not debited to the leave account, full pay is provided.

5. **Paternity Leave:**
   - Entitlement: 15 days for

In [None]:
# Follow-up question in the same conversation (history from the earlier cell is reused).
q4 = "So I get full pay during th ematernitiy leave duration right?"

print("Q4:", q4)
print("A4:", generate_answer(query=q4, topic=topic_name))

INFO:weaviate-client:Searching in collection Internal_documents received exception: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.UNAVAILABLE
	details = "recvmsg:Connection reset by peer"
	debug_error_string = "UNKNOWN:Error received from peer  {grpc_status:14, grpc_message:"recvmsg:Connection reset by peer"}"
>. Retrying with exponential backoff in 1 seconds


Q4: So I get full pay during th ematernitiy leave duration right?
A4: Yes, during your maternity leave, you receive full pay.
