In [None]:
# Install dependencies (only needed once)
!pip install openai pinecone-client tiktoken --quiet

In [None]:
# Import libraries
import getpass
import os

import openai
import pinecone
import tiktoken
from google.colab import files

In [None]:
# Load API credentials from environment variables so real keys are never
# saved in the notebook source. When a variable is unset, fall back to an
# interactive prompt (getpass does not echo the key).
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: "),
    # Pinecone environment name, e.g. "gcp-starter"
    environment=os.environ.get("PINECONE_ENVIRONMENT") or input("Pinecone environment: "),
)

In [None]:
# Prompt for the business document(s) to index (plain-text files work best)
uploaded = files.upload()

# Open every uploaded file by name as UTF-8 and newline-terminate its text,
# then stitch all documents together into one string.
file_texts = []
for filename in uploaded:
    with open(filename, "r", encoding="utf-8") as f:
        file_texts.append(f.read() + "\n")
all_text = "".join(file_texts)
print("✅ Document uploaded and read.")

In [None]:
# Split text into chunks
def split_text(text, max_tokens=500, overlap=50):
    """Split ``text`` into overlapping chunks measured in tokens.

    The text is tokenized with the ``cl100k_base`` tiktoken encoding, cut into
    windows of at most ``max_tokens`` tokens, and each window is decoded back
    to a string. Consecutive windows share ``overlap`` tokens.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_tokens``.

    Returns:
        A list of chunk strings; empty when ``text`` produces no tokens.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range. Without
            this check a non-positive step would make the loop never finish.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    step = max_tokens - overlap

    chunks = []
    for start in range(0, len(tokens), step):
        end = start + max_tokens
        chunks.append(encoding.decode(tokens[start:end]))
        # Once a window reaches the end of the text, stop: the next window
        # would contain only overlap tokens already present in this chunk.
        if end >= len(tokens):
            break
    return chunks

# Fail fast when nothing usable was uploaded, instead of reporting success
# here and failing later while embedding or upserting.
if not all_text.strip():
    raise ValueError("No text to index: upload at least one non-empty document first.")
chunks = split_text(all_text)
print(f"✅ Split into {len(chunks)} chunks")

In [None]:
# Embed text chunks using OpenAI
def get_embeddings(texts, model="text-embedding-3-small", batch_size=100):
    """Embed a list of strings with the OpenAI embeddings endpoint.

    The inputs are sent in slices of at most ``batch_size`` strings, one API
    request per slice, and the returned vectors are collected into one list.

    Args:
        texts: List of strings to embed.
        model: Name of the OpenAI embedding model to request.
        batch_size: Maximum number of strings sent per request; must be
            positive.

    Returns:
        A flat list of embeddings, appended batch by batch in the order the
        API returns them within each response.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    embeddings = []
    for start in range(0, len(texts), batch_size):
        response = openai.Embedding.create(
            input=texts[start:start + batch_size],
            model=model,
        )
        embeddings.extend(item["embedding"] for item in response["data"])
    return embeddings

embeddings = get_embeddings(chunks)
# The upsert step zips chunks with embeddings, and zip would silently drop
# items on a length mismatch, so check the one-to-one correspondence here.
assert len(embeddings) == len(chunks), "expected exactly one embedding per chunk"
print("✅ Embeddings created.")

In [None]:
# Create the Pinecone index (if it does not exist yet) and store the embeddings
index_name = "rag-qa-business"
EMBEDDING_DIMENSION = 1536  # vector size of text-embedding-3-small; must match the embedding model
UPSERT_BATCH_SIZE = 100  # keep each upsert request small enough for Pinecone's request limits

if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=EMBEDDING_DIMENSION, metric="cosine")
index = pinecone.Index(index_name)

# Each vector is an (id, values, metadata) tuple; the chunk text is kept in
# metadata so that a query can return it directly.
# NOTE: ids are positional ("chunk-0", "chunk-1", ...), so re-running with a
# shorter document overwrites the low ids but leaves higher-numbered vectors
# from an earlier run in the index.
ids = [f"chunk-{i}" for i in range(len(chunks))]
to_upsert = list(zip(ids, embeddings, [{"text": chunk} for chunk in chunks]))
for batch_start in range(0, len(to_upsert), UPSERT_BATCH_SIZE):
    index.upsert(vectors=to_upsert[batch_start:batch_start + UPSERT_BATCH_SIZE])
print("✅ Chunks uploaded to Pinecone.")

In [None]:
# Retrieval + Generation (QA Function)
def retrieve_chunks(query, top_k=3):
    """Return the stored text of the ``top_k`` index matches for ``query``.

    The query is embedded with ``text-embedding-3-small``, the resulting
    vector is used to query the module-level ``index`` with metadata
    included, and the ``text`` metadata field of each match is collected.

    Args:
        query: The question or search string.
        top_k: Number of matches to request from the index.

    Returns:
        A list of chunk texts, in the order the index returns its matches.
    """
    embedding_response = openai.Embedding.create(
        input=[query],
        model="text-embedding-3-small",
    )
    query_vector = embedding_response["data"][0]["embedding"]

    search_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
    )

    texts = []
    for hit in search_results["matches"]:
        texts.append(hit["metadata"]["text"])
    return texts

def generate_answer(query):
    """Answer ``query`` with GPT-4 using the chunks retrieved for it as context.

    The retrieved chunks are joined with ``---`` separator lines, embedded in
    a single user message ahead of the question, and sent to the chat
    completion endpoint with temperature 0.2.

    Args:
        query: The user's question.

    Returns:
        The message content of the first choice in the completion response.
    """
    retrieved = retrieve_chunks(query)
    context = "\n---\n".join(retrieved)
    prompt = f"""You are a helpful business assistant. Use the context below to answer the question.
Context:
{context}

Question: {query}
Answer:"""

    user_message = {"role": "user", "content": prompt}
    completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[user_message],
        temperature=0.2,
    )
    first_choice = completion.choices[0]
    return first_choice.message["content"]

In [None]:
# ✅ Example question: run the retrieval + generation pipeline end to end
query = "What is the refund policy?"
answer = generate_answer(query=query)
print("Answer:", answer)