<a href="https://colab.research.google.com/github/bottasai/collabnotebooks/blob/main/RAG_OpenAI_CustomerTickets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 📦 Step 1: Install required packages
!pip install qdrant-client openai pandas transformers sentence-transformers spacy
!python -m spacy download en_core_web_sm


Collecting qdrant-client
  Downloading qdrant_client-1.15.0-py3-none-any.whl.metadata (11 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from

In [7]:
import os
import pandas as pd
import openai
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct
from google.colab import userdata

# 🔑 Set your OpenAI API key
openai.api_key = userdata.get('OPENAI_API_KEY')  # Replace with your key

# 🔗 Qdrant details
QDRANT_URL = "https://422d35a8-881d-485b-928d-37338f624d1f.us-east4-0.gcp.cloud.qdrant.io"
QDRANT_API_KEY = userdata.get('QDRANT_API_KEY')
COLLECTION_NAME = "support_tickets_rag"

# 1️⃣ Read CSV
def read_csv(path, text_columns):
    df = pd.read_csv(path)
    df.dropna(subset=text_columns, how='all', inplace=True)
    combined = df[text_columns].astype(str).agg(" ".join, axis=1)
    return combined.tolist(), df.to_dict(orient='records')

# 2️⃣ OpenAI Embedder with token batching
class OpenAIEmbedder:
    def __init__(self, model="text-embedding-3-small", dim=1536, max_tokens=300000):
        self.model = model
        self.dim = dim
        self.max_tokens = max_tokens

    def _estimate_tokens(self, text):
        # You can refine this using tiktoken if needed
        return len(text.split())

    def embed(self, texts):
        batches = []
        current_batch, current_tokens = [], 0
        max_records_per_batch = 1000  # safety cap

        for text in texts:
            tokens = self._estimate_tokens(text)
            if tokens > self.max_tokens:
                print(f"[WARNING] Skipping text exceeding token limit: {tokens} tokens")
                continue

            if (current_tokens + tokens > self.max_tokens) or (len(current_batch) >= max_records_per_batch):
                batches.append(current_batch)
                current_batch = [text]
                current_tokens = tokens
            else:
                current_batch.append(text)
                current_tokens += tokens

        if current_batch:
            batches.append(current_batch)

        all_embeddings = []
        for i, batch in enumerate(batches):
            try:
                print(f"🔁 Embedding batch {i+1}/{len(batches)} with {len(batch)} records…")
                response = openai.embeddings.create(input=batch, model=self.model)
                all_embeddings.extend([r.embedding for r in response.data])
            except Exception as e:
                print(f"[ERROR] embedding batch {i+1}: {e}")
        return all_embeddings

# 3️⃣ Upload to Qdrant
def upload_to_qdrant(client, collection_name, embeddings, records):
    client.recreate_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
    )
    batch_size = 1000
    for start in range(0, len(embeddings), batch_size):
        batch_points = [
            PointStruct(
                id=start + idx,
                vector=embeddings[start + idx],
                payload=records[start + idx]
            )
            for idx in range(min(batch_size, len(embeddings) - start))
        ]
        client.upsert(collection_name=collection_name, points=batch_points, wait=True)
        print(f"✅ Uploaded {start} to {start + len(batch_points) - 1}")

# 4️⃣ Run pipeline
csv_path = "/content/drive/MyDrive/Datasets/customer_support_tickets.csv"  # or change this
columns = ["Ticket ID", "Product Purchased", "Ticket Subject", "Ticket Description", "Resolution"]

print("📥 Reading CSV…")
texts, records = read_csv(csv_path, columns)

print("🔍 Embedding records…")
embedder = OpenAIEmbedder()
embeddings = embedder.embed(texts)
print(f"→ {len(embeddings)} embeddings ready.")

print("🚀 Uploading to Qdrant…")
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
upload_to_qdrant(client, COLLECTION_NAME, embeddings, records)
print("✅ All records uploaded to Qdrant.")


📥 Reading CSV…
🔍 Embedding records…
🔁 Embedding batch 1/9 with 1000 records…
🔁 Embedding batch 2/9 with 1000 records…
🔁 Embedding batch 3/9 with 1000 records…
🔁 Embedding batch 4/9 with 1000 records…
🔁 Embedding batch 5/9 with 1000 records…
🔁 Embedding batch 6/9 with 1000 records…
🔁 Embedding batch 7/9 with 1000 records…
🔁 Embedding batch 8/9 with 1000 records…
🔁 Embedding batch 9/9 with 469 records…
→ 8469 embeddings ready.
🚀 Uploading to Qdrant…


  client.recreate_collection(


✅ Uploaded 0 to 999
✅ Uploaded 1000 to 1999
✅ Uploaded 2000 to 2999
✅ Uploaded 3000 to 3999
✅ Uploaded 4000 to 4999
✅ Uploaded 5000 to 5999
✅ Uploaded 6000 to 6999
✅ Uploaded 7000 to 7999
✅ Uploaded 8000 to 8468
✅ All records uploaded to Qdrant.


In [12]:
query = "Nintendo switch data loss"
query_embed_input = f"Ticket Description: {query}"
print(f"\nSearching for: “{query}”")

q_embed = embedder.embed([query_embed_input])[0]

results = client.search(
    collection_name=COLLECTION_NAME,
    query_vector=q_embed,
    limit=15,
    with_payload=True,
    search_params={"hnsw_ef": 300},
    score_threshold=0.5
)

print("\nTop Results:")
for hit in results:
    desc = hit.payload.get("Ticket Description", "N/A").replace("\n", " ")[:200]
    subject = hit.payload.get("Ticket Subject", "N/A")
    print(f"  • Score: {hit.score:.4f} — Subject: {subject} | Description: {desc}…")


Searching for: “Nintendo switch data loss”
🔁 Embedding batch 1/1 with 1 records…


  results = client.search(



Top Results:
  • Score: 0.6476 — Subject: Data loss | Description: I'm having an issue with the {product_purchased}. Please assist.  A browser error has occurred.  Please hold the Shift key and click the Refresh button to try again. This problem started occurring aft…
  • Score: 0.6455 — Subject: Data loss | Description: I'm having an issue with the {product_purchased}. Please assist.  The package, which was sent by the USPS in the previous week, contains 3 1/3rds of 2 of 2 $1 items. I'm I've noticed that the issue oc…
  • Score: 0.6429 — Subject: Data loss | Description: I'm having an issue with the {product_purchased}. Please assist.  We will NOT make any money without payment of the payment. If such payment is not possible at the time listed on the product, we will …
  • Score: 0.6345 — Subject: Data loss | Description: I'm having an issue with the {product_purchased}. Please assist.  {product_purchased} is a reference to the product that is currently not marked off by the seller. 