In [12]:
import json
import pandas as pd

# Flatten the nested transcript JSON into one CSV row per conversation turn.
#
# NOTE(review): the input path below is absolute and machine-specific, so this
# cell only runs on the original author's machine as written. Consider moving
# the file next to the notebook (or reading the location from a config cell).
TRANSCRIPT_JSON = "C:/Users/adityaacer7/Desktop/Pravahh/Conversational_Transcript_Dataset.json"
OUTPUT_CSV = "structured_transcripts.csv"

# Column order of the flattened table. It is passed to DataFrame explicitly so
# the CSV still gets a header row when no turns are found; otherwise the file
# would be written without a header and could not be read back with read_csv.
COLUMNS = [
    "transcript_id",
    "time_of_interaction",
    "domain",
    "intent",
    "reason_for_call",
    "outcome_label",
    "turn_id",
    "speaker",
    "text",
]

# Load JSON
with open(TRANSCRIPT_JSON, "r", encoding="utf-8") as f:
    data = json.load(f)

# --- auto-detect structure ---
# Supported shapes: a top-level list of transcripts, a dict that wraps such a
# list under any key (the first list-valued entry wins), or a single
# transcript dict.
if isinstance(data, dict):
    for v in data.values():
        if isinstance(v, list):
            data = v
            break
    else:
        data = [data]  # single transcript fallback

rows = []

for convo in data:
    # Transcript-level fields are repeated on every turn row of the transcript.
    # NOTE(review): the saved output of this cell shows "unknown" in
    # outcome_label, so the "outcome" key may not exist in this dataset --
    # confirm the field name against the JSON.
    base = {
        "transcript_id": str(convo.get("transcript_id", "unknown")),
        "time_of_interaction": convo.get("time_of_interaction", ""),
        "domain": convo.get("domain", ""),
        "intent": convo.get("intent", ""),
        "reason_for_call": convo.get("reason_for_call", ""),
        "outcome_label": convo.get("outcome", "unknown"),
    }

    # `or []` also covers an explicit `"conversation": null`, which would
    # otherwise make enumerate() raise a TypeError.
    turns = convo.get("conversation") or []

    # turn_id is the 0-based position of the turn within its transcript.
    for i, turn in enumerate(turns):
        rows.append({
            **base,
            "turn_id": i,
            "speaker": turn.get("speaker", ""),
            "text": turn.get("text", ""),
        })

df = pd.DataFrame(rows, columns=COLUMNS)

df.to_csv(OUTPUT_CSV, index=False)

print("✅ Conversion complete.")
print("Rows:", len(df))
print(df.head())

✅ Conversion complete.
Rows: 84465
         transcript_id  time_of_interaction               domain  \
0  6794-8660-4606-3216  2025-10-03 20:22:00  E-commerce & Retail   
1  6794-8660-4606-3216  2025-10-03 20:22:00  E-commerce & Retail   
2  6794-8660-4606-3216  2025-10-03 20:22:00  E-commerce & Retail   
3  6794-8660-4606-3216  2025-10-03 20:22:00  E-commerce & Retail   
4  6794-8660-4606-3216  2025-10-03 20:22:00  E-commerce & Retail   

                   intent                                    reason_for_call  \
0  Delivery Investigation  Customer James Bailey reported a smart watch s...   
1  Delivery Investigation  Customer James Bailey reported a smart watch s...   
2  Delivery Investigation  Customer James Bailey reported a smart watch s...   
3  Delivery Investigation  Customer James Bailey reported a smart watch s...   
4  Delivery Investigation  Customer James Bailey reported a smart watch s...   

  outcome_label  turn_id   speaker  \
0       unknown        0     Agent   

In [18]:
import chromadb
import pandas as pd
from sentence_transformers import SentenceTransformer

# Embed every conversation turn and index it in an in-memory Chroma collection.

# keep_default_na=False: without it read_csv turns empty cells and literal
# turns such as "NA", "None" or "null" into float NaN, which is not valid text
# for the encoder or for a Chroma document.
# dtype=str on transcript_id keeps digit-only ids as strings, matching how the
# CSV was written.
df = pd.read_csv(
    "structured_transcripts.csv",
    dtype={"transcript_id": str},
    keep_default_na=False,
)

model = SentenceTransformer("all-MiniLM-L6-v2")

client = chromadb.Client()
# get_or_create_collection (rather than create_collection) lets this cell be
# re-run in the same kernel without failing because the collection exists.
collection = client.get_or_create_collection("transcripts")

# Rows encoded and written per round trip. Batching avoids one encoder call
# and one Chroma write per row; the size is kept modest so a single write
# stays under Chroma's per-call batch limit.
INDEX_BATCH_SIZE = 1000

for start in range(0, len(df), INDEX_BATCH_SIZE):
    batch = df.iloc[start:start + INDEX_BATCH_SIZE]
    texts = batch["text"].astype(str).tolist()

    # encode() on a list returns one embedding per input text, in order.
    embeddings = model.encode(texts).tolist()

    # ids are the DataFrame row labels, the same ids the per-row loop used.
    # upsert overwrites ids that already exist, so a re-run refreshes the
    # index instead of colliding with previously added rows.
    collection.upsert(
        ids=[str(i) for i in batch.index],
        embeddings=embeddings,
        documents=texts,
        metadatas=[
            {"transcript_id": transcript_id, "speaker": speaker}
            for transcript_id, speaker in zip(batch["transcript_id"], batch["speaker"])
        ],
    )

print("✅ Indexed with Chroma")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|█████████████████████| 103/103 [00:00<00:00, 698.01it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


✅ Indexed with Chroma
