In [6]:
import os
import pandas as pd
import tiktoken
import fitz  # PyMuPDF
import docx
import csv
from tqdm import tqdm  # 👈 import tqdm

tokenizer = tiktoken.get_encoding("cl100k_base")
max_tokens_per_chunk = 500

def chunk_text(text, max_tokens=500):
    tokens = tokenizer.encode(text)
    return [tokenizer.decode(tokens[i:i + max_tokens]) for i in range(0, len(tokens), max_tokens)]

def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    return [page.get_text() for page in doc]

def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    return ["\n".join(para.text for para in doc.paragraphs)]

def extract_text_from_csv(file_path):
    with open(file_path, encoding="utf-8") as f:
        return [f.read()]

def extract_text_from_excel(file_path):
    df = pd.read_excel(file_path, sheet_name=None)
    pages = []
    for sheet, data in df.items():
        pages.append(f"Sheet: {sheet}\n" + data.to_string(index=False))
    return pages

def extract_text(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        return extract_text_from_pdf(file_path)
    elif ext == ".docx":
        return extract_text_from_docx(file_path)
    elif ext in [".csv"]:
        return extract_text_from_csv(file_path)
    elif ext in [".xls", ".xlsx", ".xlsm"]:
        return extract_text_from_excel(file_path)
    else:
        print(f"Unsupported file type: {ext}")
        return []

def process_folder(folder_path):
    all_chunks = []
    file_list = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]

    for filename in tqdm(file_list, desc="📄 Processing files"):
        file_path = os.path.join(folder_path, filename)

        try:
            pages = extract_text(file_path)
            for page_num, text in enumerate(pages, 1):
                chunks = chunk_text(text)
                for chunk_num, chunk in enumerate(chunks, 1):
                    all_chunks.append({
                        "file_name": filename,
                        "page_number": page_num,
                        "chunk_number": chunk_num,
                        "text": chunk
                    })
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")

    return pd.DataFrame(all_chunks)

# 🔍 Usage
folder_path = "/Users/yasir/Desktop/Project/Dr.X Files"
df = process_folder(folder_path)
print(df.head())
df.to_csv("all_chunked_text.csv", index=False)


  warn(f"Print area cannot be set to Defined name: {defn.value}.")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn(msg)
📄 Processing files: 100%|██████████████████████| 10/10 [00:00<00:00, 14.26it/s]

                              file_name  page_number  chunk_number  \
0  Dataset summaries and citations.docx            1             1   
1  Dataset summaries and citations.docx            1             2   
2  Dataset summaries and citations.docx            1             3   
3  Dataset summaries and citations.docx            1             4   
4    Ocean_ecogeochemistry_A_review.pdf            1             1   

                                                text  
0  Table 1. Description of studies included in th...  
1  bock, Texas. Agronomy Journal, 112(1), 148–157...  
2  2010). Soil Organic Carbon Input from Urban Tu...  
3   (2018). Soil carbon and nitrogen accumulation...  
4  327\nOceanography and Marine Biology: An Annua...  





In [38]:
import os
import time
import pandas as pd
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import tiktoken
import ollama
import re
from tqdm import tqdm  # ✅ Added tqdm for progress bars

# ========== Embedding Setup ==========
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
tokenizer = tiktoken.get_encoding("cl100k_base")
performance_log = []

# ========== Performance Logger ==========
def log_performance(stage, text_or_token_list, start_time, end_time):
    if isinstance(text_or_token_list, list) and isinstance(text_or_token_list[0], int):
        total_tokens = len(text_or_token_list)
    else:
        total_tokens = len(tokenizer.encode(text_or_token_list))

    elapsed = end_time - start_time
    tokens_per_sec = total_tokens / elapsed if elapsed > 0 else 0

    log_entry = {
        "stage": stage,
        "tokens": total_tokens,
        "duration_sec": round(elapsed, 4),
        "tokens_per_sec": round(tokens_per_sec, 2),
        "timestamp": pd.Timestamp.now()
    }

    performance_log.append(log_entry)
    print(f"📊 [{stage}] {total_tokens} tokens in {elapsed:.2f}s → {tokens_per_sec:.2f} tokens/sec")

    return tokens_per_sec

def export_log(filename="performance_log.csv"):
    df = pd.DataFrame(performance_log)
    df.to_csv(filename, index=False)
    print(f"✅ Performance log saved to {filename}")

def summarize_performance():
    df = pd.DataFrame(performance_log)
    if df.empty:
        print("⚠️ No performance data to summarize.")
        return
    summary = df.groupby("stage")["tokens_per_sec"].agg(["count", "min", "max", "mean"]).reset_index()
    print("\n📈 Performance Summary:")
    print(summary.to_string(index=False))
    return summary

# ========== Preprocessing DataFrame ==========
def remove_tables_and_numeric_lines(text):
    lines = text.splitlines()
    filtered_lines = []
    tables_and_numbers = []

    for line in lines:
        stripped = line.strip().lower()

        # Skip short chemical/symbol lines like "CO", "⇌", "+", etc.
        if len(line.strip()) < 10 and re.match(r"^[A-Za-z0-9()+−⇌\\s]*$", line):
            tables_and_numbers.append(line)
            continue
        # Remove lines like '.348136          NaN                                                              NaN'
        if re.fullmatch(r"[.\d\s]+nan\s*nan", line.strip().lower()):
            tables_and_numbers.append(line)
            continue
        # Remove lines like '.348136          NaN    NaN' or 'NaN                  0'
        if re.fullmatch(r"[.\d\s]*nan\s*\d*", line.strip().lower()):
            tables_and_numbers.append(line)
            continue


        # Skip lines that are too short and contain mostly numbers
        if len(line.strip()) < 40 and sum(c.isdigit() for c in line) / max(len(line), 1) > 0.3:
            tables_and_numbers.append(line)
            continue
        stripped = line.strip().lower()

        numbers = re.findall(r"\d", line)
        if len(numbers) / max(len(line), 1) > 0.4:
            tables_and_numbers.append(line)
            continue

        if re.search(r"\t", line) or re.search(r"\|", line):
            tables_and_numbers.append(line)
            continue

        if len(re.findall(r"\s{2,}", line)) > 2:
            tables_and_numbers.append(line)
            continue

        if any(keyword in stripped for keyword in ["author", "editor", "publisher", "published by", "phone", "email", "contact", "address"]):
            continue

        if re.search(r"\b\w+@\w+\.\w+\b", line):
            continue
        if re.search(r"\+?\d[\d\s().-]{7,}\d", line):
            continue

        filtered_lines.append(line)

    return "\n".join(filtered_lines), "\n".join(tables_and_numbers)
    
# ========== DataFrame Cleaning ==========
def clean_dataframe_text(df):
    df = df.dropna(subset=['text'])  # 🧹 Drop rows where 'text' is NaN
    df = df[df['text'].str.strip().astype(bool)]  # 🧹 Drop rows where 'text' is empty or only whitespace
    df = df[df['text'].str.len() >= 10]  # 🧹 Drop rows where 'text' is too short
    df = df.dropna(subset=['text'])  # 🧹 Drop rows where 'text' is NaN
    df = df.dropna(subset=['text'])  # 🧹 Drop rows where 'text' is NaN
    df = df[df['text'].str.strip().astype(bool)]  # 🧹 Drop rows where 'text' is empty or only whitespace
    df = df[df['text'].str.strip().astype(bool)]  # 🧹 Drop rows where 'text' is empty or only whitespace
    cleaned_rows = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="🧹 Cleaning text in DataFrame"):
        cleaned_text, _ = remove_tables_and_numeric_lines(row['text'])
        cleaned_rows.append({
            "file_name": row['file_name'],
            "page_number": row['page_number'],
            "chunk_number": row['chunk_number'],
            "text": cleaned_text
        })
        cleaned_df = pd.DataFrame(cleaned_rows)
    cleaned_df = cleaned_df[cleaned_df['text'].str.strip().astype(bool)]  # 🧹 Drop rows with empty cleaned text
    return cleaned_df



# ========== Embedding Function ==========
def embed_texts_local(texts):
    start = time.time()
    embeddings = embed_model.encode(texts).tolist()
    end = time.time()
    log_performance("embedding", " ".join(texts), start, end)
    return embeddings

# ========== Store to ChromaDB ==========
def store_df_in_chromadb_local(df, persist_dir="./chroma_storage", collection_name="cleaned_docs"):
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="🔁 Storing in ChromaDB"):
        doc_id = f"{row['file_name']}_{row['page_number']}_{row['chunk_number']}"
        try:
            embedding = embed_texts_local([row['text']])[0]
            collection.add(
                documents=[row['text']],
                ids=[doc_id],
                metadatas=[{
                    "file_name": row['file_name'],
                    "page": row['page_number'],
                    "chunk": row['chunk_number']
                }],
                embeddings=[embedding]
            )
        except Exception as e:
            print(f"❌ Error at row {idx}: {e}")
    print("✅ Chunks stored in ChromaDB using local embeddings.")

# ========== Ask Mistral via Ollama ==========
def ask_mistral(question, context):
    prompt = f""""Write a clear, formal, human-like summary in academic tone.
    
Context:
{context}

Question:
{question}

Answer:"""
    start = time.time()
    response = ollama.chat(
        model="mistral",
        messages=[{"role": "user", "content": prompt}]
    )
    end = time.time()
    log_performance("RAG", prompt, start, end)
    return response['message']['content']

# ========== RAG Pipeline ==========
def rag_query(question, collection_name="cleaned_docs", persist_dir="./chroma_storage", top_k=1):
    query_vector = embed_texts_local([question])[0]

    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)
    results = collection.query(query_embeddings=[query_vector], n_results=top_k)

    retrieved_chunks = results["documents"][0]
    context = "\n\n".join(retrieved_chunks)

    return ask_mistral(question, context)

# ========== Summarize All Documents ==========
def summarize_all_documents(collection_name="cleaned_docs", persist_dir="./chroma_storage"):
    """
    Summarize all chunks grouped by document ID.
    """
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    results = collection.get(include=["documents", "metadatas"])
    summaries_by_file = {}

    for doc, meta in zip(results["documents"], results["metadatas"]):
        file_name = meta.get("file_name", "unknown")
        text = doc if isinstance(doc, str) else doc[0]
        if file_name not in summaries_by_file:
            summaries_by_file[file_name] = []
        summaries_by_file[file_name].append(text)

    final_summaries = {}
    for file, chunks in tqdm(list(summaries_by_file.items())[:1], desc="🧠 Summarizing files"):
        combined_text = "\n\n".join(chunks)
        summary = ask_mistral("Summarize all the chunks of this document.", combined_text)
        final_summaries[file] = summary

    return final_summaries


In [40]:
clean_df= clean_dataframe_text(df)
store_df_in_chromadb_local(clean_df)
response = rag_query("what is Emotional intelligence?")
print("🤖 Mistral says:\n", response)


🧹 Cleaning text in DataFrame: 100%|████████| 423/423 [00:00<00:00, 2223.02it/s]
🔁 Storing in ChromaDB:   0%|                   | 1/407 [00:00<02:20,  2.89it/s]

📊 [embedding] 154 tokens in 0.25s → 618.58 tokens/sec
📊 [embedding] 185 tokens in 0.04s → 4325.06 tokens/sec
📊 [embedding] 32 tokens in 0.04s → 811.68 tokens/sec


🔁 Storing in ChromaDB:   2%|▎                  | 8/407 [00:00<00:23, 17.16it/s]

📊 [embedding] 276 tokens in 0.04s → 6534.55 tokens/sec
📊 [embedding] 442 tokens in 0.01s → 31564.58 tokens/sec
📊 [embedding] 249 tokens in 0.04s → 5795.81 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 37721.69 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 36640.44 tokens/sec
📊 [embedding] 143 tokens in 0.04s → 3839.78 tokens/sec


🔁 Storing in ChromaDB:   3%|▍                 | 11/407 [00:00<00:19, 20.06it/s]

📊 [embedding] 498 tokens in 0.01s → 33532.88 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 33160.75 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 32654.73 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 39987.05 tokens/sec
📊 [embedding] 369 tokens in 0.01s → 30523.58 tokens/sec


🔁 Storing in ChromaDB:   5%|▊                 | 19/407 [00:01<00:19, 19.90it/s]

📊 [embedding] 130 tokens in 0.28s → 458.69 tokens/sec
📊 [embedding] 498 tokens in 0.02s → 30188.80 tokens/sec
📊 [embedding] 171 tokens in 0.04s → 4659.16 tokens/sec
📊 [embedding] 439 tokens in 0.01s → 29746.84 tokens/sec
📊 [embedding] 494 tokens in 0.01s → 33250.73 tokens/sec
📊 [embedding] 163 tokens in 0.04s → 4229.36 tokens/sec
📊 [embedding] 493 tokens in 0.01s → 33358.48 tokens/sec
📊 [embedding] 317 tokens in 0.01s → 25158.84 tokens/sec


🔁 Storing in ChromaDB:   7%|█▏                | 28/407 [00:01<00:13, 27.43it/s]

📊 [embedding] 186 tokens in 0.04s → 4811.79 tokens/sec
📊 [embedding] 498 tokens in 0.02s → 33155.50 tokens/sec
📊 [embedding] 308 tokens in 0.02s → 17480.29 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 34238.09 tokens/sec
📊 [embedding] 315 tokens in 0.01s → 26505.75 tokens/sec
📊 [embedding] 157 tokens in 0.04s → 4162.59 tokens/sec
📊 [embedding] 496 tokens in 0.01s → 33075.37 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 33505.65 tokens/sec


🔁 Storing in ChromaDB:   9%|█▌                | 36/407 [00:01<00:13, 28.51it/s]

📊 [embedding] 90 tokens in 0.04s → 2178.18 tokens/sec
📊 [embedding] 464 tokens in 0.02s → 26587.21 tokens/sec
📊 [embedding] 214 tokens in 0.04s → 5119.85 tokens/sec
📊 [embedding] 444 tokens in 0.01s → 33782.08 tokens/sec
📊 [embedding] 249 tokens in 0.04s → 6127.45 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 35177.40 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 35546.76 tokens/sec


🔁 Storing in ChromaDB:  11%|█▉                | 44/407 [00:01<00:11, 31.94it/s]

📊 [embedding] 498 tokens in 0.01s → 35762.21 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 35653.12 tokens/sec
📊 [embedding] 63 tokens in 0.04s → 1447.98 tokens/sec
📊 [embedding] 491 tokens in 0.02s → 26539.39 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 37645.44 tokens/sec
📊 [embedding] 193 tokens in 0.04s → 4810.21 tokens/sec
📊 [embedding] 498 tokens in 0.02s → 32226.54 tokens/sec
📊 [embedding] 474 tokens in 0.02s → 23906.35 tokens/sec


🔁 Storing in ChromaDB:  13%|██▎               | 52/407 [00:02<00:10, 32.81it/s]

📊 [embedding] 37 tokens in 0.05s → 777.39 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 34742.66 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 32059.68 tokens/sec
📊 [embedding] 169 tokens in 0.05s → 3491.59 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 33442.69 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 35981.61 tokens/sec
📊 [embedding] 223 tokens in 0.01s → 15744.17 tokens/sec
📊 [embedding] 498 tokens in 0.02s → 27199.92 tokens/sec
📊 [embedding] 471 tokens in 0.01s → 36912.45 tokens/sec


🔁 Storing in ChromaDB:  15%|██▋               | 60/407 [00:02<00:10, 33.33it/s]

📊 [embedding] 58 tokens in 0.05s → 1162.40 tokens/sec
📊 [embedding] 496 tokens in 0.02s → 31723.67 tokens/sec
📊 [embedding] 281 tokens in 0.01s → 20652.54 tokens/sec
📊 [embedding] 498 tokens in 0.02s → 27295.53 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 31478.37 tokens/sec
📊 [embedding] 174 tokens in 0.05s → 3858.44 tokens/sec
📊 [embedding] 287 tokens in 0.05s → 5293.44 tokens/sec


🔁 Storing in ChromaDB:  17%|███               | 68/407 [00:02<00:11, 30.64it/s]

📊 [embedding] 432 tokens in 0.02s → 23575.46 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 34664.24 tokens/sec
📊 [embedding] 112 tokens in 0.05s → 2472.20 tokens/sec
📊 [embedding] 455 tokens in 0.02s → 29311.43 tokens/sec
📊 [embedding] 472 tokens in 0.02s → 21557.50 tokens/sec
📊 [embedding] 160 tokens in 0.05s → 3441.53 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 37086.76 tokens/sec


🔁 Storing in ChromaDB:  18%|███▏              | 72/407 [00:02<00:10, 31.02it/s]

📊 [embedding] 500 tokens in 0.01s → 35426.06 tokens/sec
📊 [embedding] 73 tokens in 0.05s → 1474.31 tokens/sec
📊 [embedding] 480 tokens in 0.02s → 29426.40 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 30012.05 tokens/sec
📊 [embedding] 64 tokens in 0.05s → 1242.96 tokens/sec
📊 [embedding] 453 tokens in 0.02s → 27514.19 tokens/sec
📊 [embedding] 484 tokens in 0.02s → 28218.17 tokens/sec


🔁 Storing in ChromaDB:  19%|███▍              | 79/407 [00:03<00:12, 25.65it/s]

📊 [embedding] 63 tokens in 0.13s → 490.42 tokens/sec
📊 [embedding] 498 tokens in 0.02s → 27768.35 tokens/sec
📊 [embedding] 489 tokens in 0.01s → 37017.92 tokens/sec
📊 [embedding] 89 tokens in 0.05s → 1943.11 tokens/sec
📊 [embedding] 498 tokens in 0.02s → 23119.86 tokens/sec
📊 [embedding] 490 tokens in 0.01s → 42918.79 tokens/sec
📊 [embedding] 462 tokens in 0.01s → 40160.17 tokens/sec


🔁 Storing in ChromaDB:  21%|███▊              | 87/407 [00:03<00:10, 30.77it/s]

📊 [embedding] 23 tokens in 0.04s → 540.17 tokens/sec
📊 [embedding] 498 tokens in 0.02s → 20156.36 tokens/sec
📊 [embedding] 474 tokens in 0.01s → 36073.16 tokens/sec
📊 [embedding] 478 tokens in 0.02s → 31126.32 tokens/sec
📊 [embedding] 488 tokens in 0.01s → 40108.57 tokens/sec
📊 [embedding] 493 tokens in 0.01s → 38033.95 tokens/sec
📊 [embedding] 478 tokens in 0.01s → 31981.84 tokens/sec
📊 [embedding] 477 tokens in 0.01s → 36445.63 tokens/sec
📊 [embedding] 478 tokens in 0.02s → 23807.22 tokens/sec


🔁 Storing in ChromaDB:  24%|████▏             | 96/407 [00:03<00:08, 34.95it/s]

📊 [embedding] 431 tokens in 0.02s → 21659.50 tokens/sec
📊 [embedding] 480 tokens in 0.01s → 37899.62 tokens/sec
📊 [embedding] 477 tokens in 0.02s → 28120.02 tokens/sec
📊 [embedding] 338 tokens in 0.02s → 16329.27 tokens/sec
📊 [embedding] 486 tokens in 0.02s → 21501.54 tokens/sec
📊 [embedding] 496 tokens in 0.01s → 36159.05 tokens/sec
📊 [embedding] 480 tokens in 0.02s → 22893.11 tokens/sec


🔁 Storing in ChromaDB:  26%|████▍            | 105/407 [00:03<00:08, 37.75it/s]

📊 [embedding] 10 tokens in 0.05s → 193.61 tokens/sec
📊 [embedding] 484 tokens in 0.01s → 34653.02 tokens/sec
📊 [embedding] 481 tokens in 0.01s → 35727.50 tokens/sec
📊 [embedding] 412 tokens in 0.01s → 30311.94 tokens/sec
📊 [embedding] 491 tokens in 0.01s → 39132.07 tokens/sec
📊 [embedding] 445 tokens in 0.01s → 36556.50 tokens/sec
📊 [embedding] 408 tokens in 0.01s → 30050.86 tokens/sec
📊 [embedding] 488 tokens in 0.02s → 29887.13 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 42394.11 tokens/sec
📊 [embedding] 450 tokens in 0.01s → 36901.48 tokens/sec


🔁 Storing in ChromaDB:  28%|████▊            | 114/407 [00:04<00:07, 38.49it/s]

📊 [embedding] 70 tokens in 0.05s → 1387.79 tokens/sec
📊 [embedding] 476 tokens in 0.01s → 34635.41 tokens/sec
📊 [embedding] 466 tokens in 0.02s → 30407.69 tokens/sec
📊 [embedding] 438 tokens in 0.01s → 29303.19 tokens/sec
📊 [embedding] 481 tokens in 0.02s → 31843.74 tokens/sec
📊 [embedding] 491 tokens in 0.02s → 30002.96 tokens/sec
📊 [embedding] 482 tokens in 0.02s → 30228.99 tokens/sec
📊 [embedding] 26 tokens in 0.01s → 3192.11 tokens/sec
📊 [embedding] 465 tokens in 0.01s → 33378.71 tokens/sec
📊 [embedding] 482 tokens in 0.01s → 35572.47 tokens/sec


🔁 Storing in ChromaDB:  31%|█████▏           | 125/407 [00:04<00:06, 41.28it/s]

📊 [embedding] 421 tokens in 0.01s → 33378.11 tokens/sec
📊 [embedding] 82 tokens in 0.05s → 1790.47 tokens/sec
📊 [embedding] 484 tokens in 0.02s → 25327.42 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 32886.70 tokens/sec
📊 [embedding] 446 tokens in 0.02s → 26313.96 tokens/sec
📊 [embedding] 93 tokens in 0.01s → 11297.22 tokens/sec
📊 [embedding] 487 tokens in 0.02s → 30570.91 tokens/sec
📊 [embedding] 475 tokens in 0.01s → 32917.43 tokens/sec
📊 [embedding] 386 tokens in 0.01s → 28444.95 tokens/sec


🔁 Storing in ChromaDB:  33%|█████▋           | 135/407 [00:04<00:06, 39.41it/s]

📊 [embedding] 491 tokens in 0.02s → 29997.28 tokens/sec
📊 [embedding] 487 tokens in 0.02s → 29410.19 tokens/sec
📊 [embedding] 489 tokens in 0.01s → 34024.80 tokens/sec
📊 [embedding] 17 tokens in 0.05s → 343.75 tokens/sec
📊 [embedding] 471 tokens in 0.01s → 36522.78 tokens/sec
📊 [embedding] 490 tokens in 0.02s → 32461.60 tokens/sec
📊 [embedding] 425 tokens in 0.01s → 33244.05 tokens/sec
📊 [embedding] 489 tokens in 0.02s → 32057.12 tokens/sec


🔁 Storing in ChromaDB:  34%|█████▊           | 140/407 [00:04<00:08, 32.67it/s]

📊 [embedding] 483 tokens in 0.02s → 31988.77 tokens/sec
📊 [embedding] 273 tokens in 0.02s → 17946.29 tokens/sec
📊 [embedding] 161 tokens in 0.05s → 3164.55 tokens/sec
📊 [embedding] 81 tokens in 0.05s → 1709.37 tokens/sec
📊 [embedding] 89 tokens in 0.05s → 1808.61 tokens/sec


🔁 Storing in ChromaDB:  35%|██████           | 144/407 [00:04<00:09, 27.66it/s]

📊 [embedding] 85 tokens in 0.06s → 1489.97 tokens/sec
📊 [embedding] 4 tokens in 0.05s → 73.63 tokens/sec
📊 [embedding] 458 tokens in 0.02s → 27769.62 tokens/sec
📊 [embedding] 130 tokens in 0.05s → 2588.87 tokens/sec
📊 [embedding] 398 tokens in 0.02s → 24530.26 tokens/sec
📊 [embedding] 422 tokens in 0.01s → 28469.35 tokens/sec


🔁 Storing in ChromaDB:  37%|██████▎          | 152/407 [00:05<00:09, 27.18it/s]

📊 [embedding] 237 tokens in 0.05s → 4330.35 tokens/sec
📊 [embedding] 393 tokens in 0.01s → 28398.48 tokens/sec
📊 [embedding] 329 tokens in 0.02s → 19520.53 tokens/sec
📊 [embedding] 383 tokens in 0.02s → 22767.67 tokens/sec
📊 [embedding] 107 tokens in 0.05s → 2220.83 tokens/sec
📊 [embedding] 119 tokens in 0.05s → 2615.20 tokens/sec


🔁 Storing in ChromaDB:  38%|██████▌          | 156/407 [00:05<00:10, 24.74it/s]

📊 [embedding] 498 tokens in 0.02s → 27303.74 tokens/sec
📊 [embedding] 282 tokens in 0.01s → 20638.16 tokens/sec
📊 [embedding] 455 tokens in 0.01s → 31938.28 tokens/sec
📊 [embedding] 241 tokens in 0.11s → 2269.30 tokens/sec
📊 [embedding] 467 tokens in 0.02s → 27036.49 tokens/sec


🔁 Storing in ChromaDB:  40%|██████▊          | 163/407 [00:05<00:09, 26.38it/s]

📊 [embedding] 332 tokens in 0.02s → 20749.96 tokens/sec
📊 [embedding] 485 tokens in 0.02s → 29006.67 tokens/sec
📊 [embedding] 113 tokens in 0.05s → 2149.02 tokens/sec
📊 [embedding] 342 tokens in 0.02s → 19548.54 tokens/sec
📊 [embedding] 264 tokens in 0.06s → 4704.55 tokens/sec
📊 [embedding] 384 tokens in 0.01s → 26158.20 tokens/sec


🔁 Storing in ChromaDB:  41%|██████▉          | 166/407 [00:05<00:09, 25.99it/s]

📊 [embedding] 498 tokens in 0.01s → 37343.36 tokens/sec
📊 [embedding] 220 tokens in 0.05s → 4127.81 tokens/sec
📊 [embedding] 498 tokens in 0.02s → 32649.17 tokens/sec
📊 [embedding] 381 tokens in 0.02s → 24996.95 tokens/sec
📊 [embedding] 499 tokens in 0.01s → 34356.97 tokens/sec
📊 [embedding] 498 tokens in 0.02s → 31387.79 tokens/sec
📊 [embedding] 495 tokens in 0.02s → 31442.51 tokens/sec


🔁 Storing in ChromaDB:  43%|███████▎         | 176/407 [00:06<00:06, 34.93it/s]

📊 [embedding] 498 tokens in 0.01s → 33295.56 tokens/sec
📊 [embedding] 471 tokens in 0.02s → 30326.79 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 34368.23 tokens/sec
📊 [embedding] 465 tokens in 0.01s → 36747.77 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 36466.50 tokens/sec
📊 [embedding] 495 tokens in 0.01s → 39179.13 tokens/sec
📊 [embedding] 430 tokens in 0.02s → 28522.30 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 36085.00 tokens/sec
📊 [embedding] 473 tokens in 0.01s → 35483.28 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 34489.80 tokens/sec


🔁 Storing in ChromaDB:  46%|███████▊         | 186/407 [00:06<00:05, 39.77it/s]

📊 [embedding] 469 tokens in 0.01s → 32571.59 tokens/sec
📊 [embedding] 467 tokens in 0.02s → 29227.95 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 37056.74 tokens/sec
📊 [embedding] 329 tokens in 0.01s → 26296.83 tokens/sec
📊 [embedding] 370 tokens in 0.01s → 25524.13 tokens/sec
📊 [embedding] 499 tokens in 0.02s → 33046.35 tokens/sec
📊 [embedding] 467 tokens in 0.01s → 32919.45 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 35612.55 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 34556.29 tokens/sec
📊 [embedding] 499 tokens in 0.01s → 33810.28 tokens/sec


🔁 Storing in ChromaDB:  48%|████████▏        | 196/407 [00:06<00:04, 42.41it/s]

📊 [embedding] 500 tokens in 0.02s → 32450.59 tokens/sec
📊 [embedding] 464 tokens in 0.01s → 31970.78 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 38711.41 tokens/sec
📊 [embedding] 465 tokens in 0.01s → 32453.93 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 33579.68 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 36472.85 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 35696.81 tokens/sec
📊 [embedding] 496 tokens in 0.02s → 32791.24 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 34587.33 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 35068.26 tokens/sec


🔁 Storing in ChromaDB:  51%|████████▌        | 206/407 [00:06<00:04, 43.01it/s]

📊 [embedding] 500 tokens in 0.01s → 36194.61 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 37018.80 tokens/sec
📊 [embedding] 479 tokens in 0.01s → 32633.87 tokens/sec
📊 [embedding] 476 tokens in 0.02s → 30254.87 tokens/sec
📊 [embedding] 315 tokens in 0.01s → 21005.53 tokens/sec
📊 [embedding] 413 tokens in 0.02s → 27235.31 tokens/sec
📊 [embedding] 477 tokens in 0.01s → 32785.72 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 35910.38 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 38793.04 tokens/sec
📊 [embedding] 330 tokens in 0.01s → 24186.49 tokens/sec


🔁 Storing in ChromaDB:  53%|█████████        | 216/407 [00:06<00:04, 44.37it/s]

📊 [embedding] 401 tokens in 0.01s → 32493.26 tokens/sec
📊 [embedding] 419 tokens in 0.01s → 32696.06 tokens/sec
📊 [embedding] 434 tokens in 0.01s → 32317.72 tokens/sec
📊 [embedding] 478 tokens in 0.01s → 33127.52 tokens/sec
📊 [embedding] 424 tokens in 0.01s → 30338.03 tokens/sec
📊 [embedding] 474 tokens in 0.01s → 34216.23 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 32404.96 tokens/sec
📊 [embedding] 475 tokens in 0.01s → 31885.38 tokens/sec
📊 [embedding] 17 tokens in 0.01s → 2121.55 tokens/sec
📊 [embedding] 459 tokens in 0.01s → 32391.99 tokens/sec


🔁 Storing in ChromaDB:  56%|█████████▍       | 226/407 [00:07<00:04, 40.45it/s]

📊 [embedding] 489 tokens in 0.01s → 36596.51 tokens/sec
📊 [embedding] 58 tokens in 0.05s → 1208.00 tokens/sec
📊 [embedding] 498 tokens in 0.02s → 29462.36 tokens/sec
📊 [embedding] 496 tokens in 0.01s → 34831.40 tokens/sec
📊 [embedding] 187 tokens in 0.01s → 16074.74 tokens/sec
📊 [embedding] 490 tokens in 0.02s → 27672.50 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 30694.22 tokens/sec


🔁 Storing in ChromaDB:  57%|█████████▋       | 231/407 [00:07<00:05, 34.78it/s]

📊 [embedding] 232 tokens in 0.05s → 4396.94 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 36257.59 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 32495.34 tokens/sec
📊 [embedding] 273 tokens in 0.05s → 5594.73 tokens/sec
📊 [embedding] 491 tokens in 0.01s → 33412.89 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 34435.43 tokens/sec
📊 [embedding] 411 tokens in 0.01s → 32683.51 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 35031.08 tokens/sec


🔁 Storing in ChromaDB:  59%|██████████       | 240/407 [00:07<00:05, 32.55it/s]

📊 [embedding] 469 tokens in 0.02s → 29615.62 tokens/sec
📊 [embedding] 110 tokens in 0.05s → 2317.20 tokens/sec
📊 [embedding] 421 tokens in 0.02s → 26926.74 tokens/sec
📊 [embedding] 271 tokens in 0.01s → 18232.01 tokens/sec
📊 [embedding] 235 tokens in 0.05s → 4666.41 tokens/sec


🔁 Storing in ChromaDB:  60%|██████████▏      | 244/407 [00:07<00:05, 27.99it/s]

📊 [embedding] 225 tokens in 0.05s → 4465.07 tokens/sec
📊 [embedding] 73 tokens in 0.01s → 8423.92 tokens/sec
📊 [embedding] 199 tokens in 0.05s → 3935.84 tokens/sec
📊 [embedding] 192 tokens in 0.05s → 3614.03 tokens/sec
📊 [embedding] 204 tokens in 0.01s → 14330.85 tokens/sec
📊 [embedding] 241 tokens in 0.02s → 10965.08 tokens/sec


🔁 Storing in ChromaDB:  61%|██████████▎      | 248/407 [00:08<00:05, 27.88it/s]

📊 [embedding] 41 tokens in 0.01s → 2953.43 tokens/sec
📊 [embedding] 237 tokens in 0.05s → 4358.86 tokens/sec
📊 [embedding] 230 tokens in 0.06s → 4169.07 tokens/sec
📊 [embedding] 45 tokens in 0.05s → 831.44 tokens/sec


🔁 Storing in ChromaDB:  62%|██████████▌      | 254/407 [00:08<00:05, 26.60it/s]

📊 [embedding] 164 tokens in 0.01s → 14919.87 tokens/sec
📊 [embedding] 262 tokens in 0.01s → 18001.89 tokens/sec
📊 [embedding] 249 tokens in 0.05s → 5208.44 tokens/sec
📊 [embedding] 203 tokens in 0.01s → 16125.22 tokens/sec
📊 [embedding] 397 tokens in 0.01s → 29433.45 tokens/sec
📊 [embedding] 472 tokens in 0.01s → 33702.38 tokens/sec
📊 [embedding] 39 tokens in 0.01s → 4738.09 tokens/sec


🔁 Storing in ChromaDB:  65%|██████████▉      | 263/407 [00:08<00:04, 31.89it/s]

📊 [embedding] 146 tokens in 0.05s → 3114.67 tokens/sec
📊 [embedding] 345 tokens in 0.01s → 23681.51 tokens/sec
📊 [embedding] 390 tokens in 0.02s → 23413.42 tokens/sec
📊 [embedding] 322 tokens in 0.01s → 24580.32 tokens/sec
📊 [embedding] 451 tokens in 0.01s → 30833.43 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 37151.93 tokens/sec
📊 [embedding] 54 tokens in 0.05s → 1050.13 tokens/sec


🔁 Storing in ChromaDB:  67%|███████████▎     | 271/407 [00:08<00:04, 30.66it/s]

📊 [embedding] 376 tokens in 0.02s → 21948.70 tokens/sec
📊 [embedding] 413 tokens in 0.02s → 26631.53 tokens/sec
📊 [embedding] 476 tokens in 0.01s → 35194.06 tokens/sec
📊 [embedding] 441 tokens in 0.01s → 34220.53 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 34561.98 tokens/sec
📊 [embedding] 10 tokens in 0.06s → 180.31 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 36051.50 tokens/sec


🔁 Storing in ChromaDB:  68%|███████████▍     | 275/407 [00:08<00:04, 30.23it/s]

📊 [embedding] 10 tokens in 0.05s → 192.88 tokens/sec
📊 [embedding] 417 tokens in 0.01s → 27980.81 tokens/sec
📊 [embedding] 487 tokens in 0.02s → 30093.05 tokens/sec
📊 [embedding] 465 tokens in 0.02s → 30942.24 tokens/sec
📊 [embedding] 482 tokens in 0.02s → 31128.72 tokens/sec
📊 [embedding] 497 tokens in 0.01s → 40376.72 tokens/sec
📊 [embedding] 37 tokens in 0.06s → 671.17 tokens/sec


🔁 Storing in ChromaDB:  69%|███████████▊     | 282/407 [00:09<00:04, 28.90it/s]

📊 [embedding] 475 tokens in 0.02s → 29124.98 tokens/sec
📊 [embedding] 414 tokens in 0.02s → 27528.49 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 28866.91 tokens/sec
📊 [embedding] 28 tokens in 0.05s → 541.72 tokens/sec
📊 [embedding] 392 tokens in 0.02s → 25941.42 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 31623.14 tokens/sec
📊 [embedding] 398 tokens in 0.02s → 26468.36 tokens/sec


🔁 Storing in ChromaDB:  72%|████████████▏    | 292/407 [00:09<00:03, 36.31it/s]

📊 [embedding] 486 tokens in 0.02s → 29968.12 tokens/sec
📊 [embedding] 491 tokens in 0.01s → 34106.81 tokens/sec
📊 [embedding] 495 tokens in 0.01s → 37115.08 tokens/sec
📊 [embedding] 435 tokens in 0.01s → 35371.98 tokens/sec
📊 [embedding] 403 tokens in 0.01s → 33911.90 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 41525.30 tokens/sec
📊 [embedding] 17 tokens in 0.01s → 2118.34 tokens/sec
📊 [embedding] 499 tokens in 0.01s → 38757.04 tokens/sec
📊 [embedding] 7 tokens in 0.05s → 142.16 tokens/sec


🔁 Storing in ChromaDB:  74%|████████████▌    | 300/407 [00:09<00:03, 34.15it/s]

📊 [embedding] 460 tokens in 0.01s → 31364.89 tokens/sec
📊 [embedding] 351 tokens in 0.01s → 27202.53 tokens/sec
📊 [embedding] 484 tokens in 0.01s → 35937.60 tokens/sec
📊 [embedding] 387 tokens in 0.01s → 31295.34 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 40776.25 tokens/sec
📊 [embedding] 19 tokens in 0.05s → 412.10 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 31753.86 tokens/sec


🔁 Storing in ChromaDB:  75%|████████████▋    | 304/407 [00:09<00:03, 32.36it/s]

📊 [embedding] 12 tokens in 0.05s → 233.56 tokens/sec
📊 [embedding] 448 tokens in 0.02s → 28076.92 tokens/sec
📊 [embedding] 394 tokens in 0.01s → 27761.45 tokens/sec
📊 [embedding] 441 tokens in 0.01s → 29823.10 tokens/sec
📊 [embedding] 442 tokens in 0.02s → 25322.46 tokens/sec
📊 [embedding] 390 tokens in 0.01s → 30518.83 tokens/sec
📊 [embedding] 498 tokens in 0.01s → 40100.66 tokens/sec
📊 [embedding] 31 tokens in 0.05s → 666.69 tokens/sec


🔁 Storing in ChromaDB:  77%|█████████████    | 313/407 [00:10<00:02, 33.54it/s]

📊 [embedding] 482 tokens in 0.02s → 31181.05 tokens/sec
📊 [embedding] 398 tokens in 0.02s → 25762.11 tokens/sec
📊 [embedding] 438 tokens in 0.01s → 30674.66 tokens/sec
📊 [embedding] 492 tokens in 0.01s → 34577.13 tokens/sec
📊 [embedding] 455 tokens in 0.01s → 31612.39 tokens/sec
📊 [embedding] 396 tokens in 0.01s → 27308.73 tokens/sec
📊 [embedding] 495 tokens in 0.02s → 31574.97 tokens/sec
📊 [embedding] 453 tokens in 0.01s → 30678.14 tokens/sec
📊 [embedding] 490 tokens in 0.02s → 32629.62 tokens/sec


🔁 Storing in ChromaDB:  79%|█████████████▍   | 322/407 [00:10<00:02, 33.55it/s]

📊 [embedding] 500 tokens in 0.02s → 32807.98 tokens/sec
📊 [embedding] 2 tokens in 0.05s → 39.19 tokens/sec
📊 [embedding] 430 tokens in 0.02s → 28258.19 tokens/sec
📊 [embedding] 491 tokens in 0.02s → 30798.96 tokens/sec
📊 [embedding] 395 tokens in 0.01s → 26456.36 tokens/sec
📊 [embedding] 446 tokens in 0.01s → 32056.54 tokens/sec
📊 [embedding] 354 tokens in 0.01s → 28334.48 tokens/sec
📊 [embedding] 446 tokens in 0.01s → 33856.87 tokens/sec


🔁 Storing in ChromaDB:  82%|█████████████▊   | 332/407 [00:10<00:02, 37.37it/s]

📊 [embedding] 447 tokens in 0.01s → 32937.82 tokens/sec
📊 [embedding] 455 tokens in 0.01s → 31081.57 tokens/sec
📊 [embedding] 426 tokens in 0.02s → 28379.50 tokens/sec
📊 [embedding] 369 tokens in 0.01s → 27899.52 tokens/sec
📊 [embedding] 471 tokens in 0.01s → 37040.48 tokens/sec
📊 [embedding] 411 tokens in 0.01s → 29460.12 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 36052.12 tokens/sec
📊 [embedding] 9 tokens in 0.01s → 1218.37 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 36940.55 tokens/sec


🔁 Storing in ChromaDB:  84%|██████████████▏  | 340/407 [00:10<00:01, 34.01it/s]

📊 [embedding] 49 tokens in 0.05s → 1052.97 tokens/sec
📊 [embedding] 437 tokens in 0.01s → 29945.61 tokens/sec
📊 [embedding] 493 tokens in 0.02s → 32379.57 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 33029.66 tokens/sec
📊 [embedding] 27 tokens in 0.05s → 560.89 tokens/sec
📊 [embedding] 423 tokens in 0.02s → 26765.29 tokens/sec
📊 [embedding] 360 tokens in 0.01s → 24479.98 tokens/sec


🔁 Storing in ChromaDB:  86%|██████████████▌  | 349/407 [00:11<00:01, 36.42it/s]

📊 [embedding] 494 tokens in 0.01s → 38026.47 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 38461.50 tokens/sec
📊 [embedding] 16 tokens in 0.01s → 2005.52 tokens/sec
📊 [embedding] 330 tokens in 0.01s → 24619.28 tokens/sec
📊 [embedding] 487 tokens in 0.02s → 31646.54 tokens/sec
📊 [embedding] 431 tokens in 0.01s → 33887.17 tokens/sec
📊 [embedding] 437 tokens in 0.02s → 28946.33 tokens/sec
📊 [embedding] 475 tokens in 0.02s → 26820.69 tokens/sec
📊 [embedding] 498 tokens in 0.02s → 31513.28 tokens/sec


🔁 Storing in ChromaDB:  88%|██████████████▉  | 358/407 [00:11<00:01, 34.92it/s]

📊 [embedding] 12 tokens in 0.07s → 175.72 tokens/sec
📊 [embedding] 438 tokens in 0.02s → 28268.81 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 32794.13 tokens/sec
📊 [embedding] 39 tokens in 0.01s → 4756.69 tokens/sec
📊 [embedding] 422 tokens in 0.02s → 27333.32 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 31673.77 tokens/sec
📊 [embedding] 8 tokens in 0.01s → 947.44 tokens/sec
📊 [embedding] 354 tokens in 0.02s → 23366.23 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 32031.28 tokens/sec


🔁 Storing in ChromaDB:  90%|███████████████▎ | 366/407 [00:11<00:01, 30.08it/s]

📊 [embedding] 8 tokens in 0.06s → 126.43 tokens/sec
📊 [embedding] 453 tokens in 0.02s → 28023.48 tokens/sec
📊 [embedding] 379 tokens in 0.01s → 27026.44 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 34103.91 tokens/sec
📊 [embedding] 52 tokens in 0.06s → 920.03 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 33634.08 tokens/sec
📊 [embedding] 27 tokens in 0.01s → 3422.47 tokens/sec


🔁 Storing in ChromaDB:  92%|███████████████▋ | 376/407 [00:11<00:00, 35.55it/s]

📊 [embedding] 439 tokens in 0.01s → 30735.97 tokens/sec
📊 [embedding] 418 tokens in 0.01s → 28913.35 tokens/sec
📊 [embedding] 481 tokens in 0.01s → 32257.69 tokens/sec
📊 [embedding] 454 tokens in 0.01s → 32819.96 tokens/sec
📊 [embedding] 309 tokens in 0.01s → 24431.00 tokens/sec
📊 [embedding] 463 tokens in 0.01s → 37374.91 tokens/sec
📊 [embedding] 464 tokens in 0.01s → 31521.82 tokens/sec
📊 [embedding] 399 tokens in 0.01s → 34266.91 tokens/sec
📊 [embedding] 387 tokens in 0.01s → 31409.19 tokens/sec


🔁 Storing in ChromaDB:  93%|███████████████▊ | 380/407 [00:12<00:00, 32.16it/s]

📊 [embedding] 487 tokens in 0.01s → 41841.66 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 39147.88 tokens/sec
📊 [embedding] 20 tokens in 0.07s → 306.03 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 29300.88 tokens/sec
📊 [embedding] 10 tokens in 0.01s → 1231.41 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 29581.93 tokens/sec
📊 [embedding] 28 tokens in 0.01s → 3130.58 tokens/sec


🔁 Storing in ChromaDB:  96%|████████████████▏| 389/407 [00:12<00:00, 34.93it/s]

📊 [embedding] 500 tokens in 0.02s → 31983.41 tokens/sec
📊 [embedding] 37 tokens in 0.01s → 4467.03 tokens/sec
📊 [embedding] 295 tokens in 0.02s → 17571.07 tokens/sec
📊 [embedding] 379 tokens in 0.01s → 26616.90 tokens/sec
📊 [embedding] 454 tokens in 0.01s → 31913.49 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 31420.36 tokens/sec
📊 [embedding] 19 tokens in 0.01s → 2377.94 tokens/sec
📊 [embedding] 470 tokens in 0.02s → 29354.82 tokens/sec
📊 [embedding] 460 tokens in 0.01s → 33951.22 tokens/sec


🔁 Storing in ChromaDB:  97%|████████████████▍| 393/407 [00:12<00:00, 35.76it/s]

📊 [embedding] 491 tokens in 0.02s → 31751.03 tokens/sec
📊 [embedding] 240 tokens in 0.07s → 3644.42 tokens/sec
📊 [embedding] 454 tokens in 0.01s → 30335.40 tokens/sec
📊 [embedding] 294 tokens in 0.01s → 20457.64 tokens/sec
📊 [embedding] 75 tokens in 0.06s → 1314.34 tokens/sec


🔁 Storing in ChromaDB:  99%|████████████████▋| 401/407 [00:12<00:00, 30.65it/s]

📊 [embedding] 386 tokens in 0.02s → 24373.37 tokens/sec
📊 [embedding] 499 tokens in 0.02s → 30182.25 tokens/sec
📊 [embedding] 457 tokens in 0.01s → 34237.08 tokens/sec
📊 [embedding] 286 tokens in 0.02s → 18859.10 tokens/sec
📊 [embedding] 166 tokens in 0.06s → 2756.93 tokens/sec


🔁 Storing in ChromaDB: 100%|█████████████████| 407/407 [00:13<00:00, 31.30it/s]

📊 [embedding] 86 tokens in 0.06s → 1463.61 tokens/sec
📊 [embedding] 273 tokens in 0.06s → 4584.67 tokens/sec
📊 [embedding] 138 tokens in 0.06s → 2397.84 tokens/sec
📊 [embedding] 4 tokens in 0.01s → 508.99 tokens/sec
📊 [embedding] 6 tokens in 0.01s → 801.61 tokens/sec
✅ Chunks stored in ChromaDB using local embeddings.





📊 [embedding] 5 tokens in 0.06s → 79.78 tokens/sec
📊 [RAG] 446 tokens in 51.23s → 8.71 tokens/sec
🤖 Mistral says:
  Emotional Intelligence (EI) refers to the ability to identify, use, understand, and manage emotions in positive ways to relieve stress, communicate effectively, empathize with others, overcome challenges, and defuse conflict. It involves self-awareness, self-regulation, motivation, empathy, and social skills (Goleman, 2011).

EI can be distinguished from Intelligence Quotient (IQ) as the latter measures cognitive abilities such as problem-solving, reasoning, and logic. EI is more concerned with emotional and interpersonal skills that affect how we interact with others and manage our own emotions effectively (Verywell Mind, 2023).

In this course, Computer Assisted Research Skills (CARS), students will not directly study EI as the primary focus. However, it is essential to note that understanding the basics of EI can provide valuable insights while conducting research and 

In [41]:
from rouge_score import rouge_scorer

# ========== Evaluate Summaries with ROUGE ==========
def evaluate_summaries_with_rouge(generated_summaries: dict, reference_summaries: dict):
    """
    Compare generated summaries to reference summaries using ROUGE scores.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    evaluation = {}

    for doc_id, generated in tqdm(generated_summaries.items(), desc="📏 Evaluating ROUGE"):
        reference = reference_summaries.get(doc_id, "")
        if not reference:
            print(f"⚠️ No reference summary found for document: {doc_id}")
            continue

        scores = scorer.score(generated, reference)
        evaluation[doc_id] = {
            metric: {
                "precision": round(scores[metric].precision, 4),
                "recall": round(scores[metric].recall, 4),
                "f1": round(scores[metric].fmeasure, 4)
            } for metric in scores
        }

    return evaluation


In [None]:
response = summarize_all_documents()
# response = rag_query("Tell me about Emotional Intelligence")

print("🤖 Mistral says:\n", response)

In [48]:
# ========== Preloaded Reference Summaries ==========
reference_summaries = {
    "Dataset summaries and citations.docx": """
    1. Numerous studies have investigated carbon accumulation and nitrogen cycling in various urban land uses including residential soils, golf courses, and home lawns.
    2. Some research emphasizes specific biophysical factors influencing soil carbon, such as:
       - Microbial processes (e.g., Shi et al., 2012),
       - Management intensity and duration (e.g., Wang et al., 2014),
       - Historical land use changes (e.g., Raccanello et al., 2011).
    3. A subset of studies analyzes regional influences on carbon sequestration, examining how soil characteristics and climatic conditions affect carbon dynamics in urban ecosystems (e.g., Selhorst & Lal, 2011; Smith et al., 2018).
    4. Comparative assessments have been conducted to evaluate differences in carbon sequestration across urban land typologies, including turfgrass systems and residential zones with varied development histories.
    5. Findings indicate that urban landscapes can serve as significant carbon sinks, occasionally surpassing the sequestration potential of rural or forested lands (Selhorst & Lal, 2011; Trammell et al., 2020).
    6. Despite their potential benefits, urban environments also contribute to greenhouse gas emissions, especially from impervious surfaces and vehicular traffic (Townsend-Small & Czimczik, 2010).
    7. Research suggests that residential lawns have a variable carbon sequestration potential, heavily influenced by climate, management practices, and underlying soil properties.
    8. In conclusion, while urban landscapes offer valuable opportunities for climate change mitigation through carbon storage, maximizing their potential requires integrated land management strategies that simultaneously limit greenhouse gas emissions.
    """
}

# ========== Example Evaluation Run ==========
generated_summaries = summarize_all_documents()
results = evaluate_summaries_with_rouge(generated_summaries, reference_summaries)
data = results
for filename, scores in data.items():
    print(f"\nROUGE Scores for: {filename}\n")
    print(f"{'Metric':<10} {'Precision':<10} {'Recall':<10} {'F1 Score':<10}")
    for metric, values in scores.items():
        print(f"{metric.upper():<10} {values['precision']:<10.4f} {values['recall']:<10.4f} {values['f1']:<10.4f}")

🧠 Summarizing files: 100%|███████████████████████| 1/1 [00:24<00:00, 24.70s/it]


📊 [RAG] 679 tokens in 24.68s → 27.51 tokens/sec


📏 Evaluating ROUGE: 100%|████████████████████████| 1/1 [00:00<00:00, 62.27it/s]


ROUGE Scores for: Dataset summaries and citations.docx

Metric     Precision  Recall     F1 Score  
ROUGE1     0.3833     0.5337     0.4462    
ROUGE2     0.1150     0.1605     0.1340    
ROUGEL     0.1674     0.2331     0.1949    





In [None]:
def summarize_all_documents(collection_name="docs", persist_dir="./chroma_storage", max_chunks=5):
    """
    Summarize a limited number of stored chunks to reduce load and return a list of summaries.
    """
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    results = collection.get(include=["documents"])
    all_chunks = results.get("documents", [])

    print(f"🧠 Summarizing up to {max_chunks} chunks...")
    summaries = []
    for idx, chunk in enumerate(all_chunks[:max_chunks]):
        if isinstance(chunk, str) and chunk.strip():
            summary = ask_mistral("Summarize the following document chunk.", chunk)
            summaries.append(summary)

    return summaries

In [None]:
from sentence_transformers import SentenceTransformer

# Load once
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_texts_local(texts):
    """Embed text using a local SentenceTransformer model."""
    return embed_model.encode(texts).tolist()


In [None]:
import chromadb
import pandas as pd

def store_df_in_chromadb_local(df, persist_dir="./chroma_storage", collection_name="drx_docs"):
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    for idx, row in df.iterrows():
        doc_id = f"{row['file_name']}_{row['page_number']}_{row['chunk_number']}"
        try:
            embedding = embed_texts_local([row['text']])[0]
            collection.add(
                documents=[row['text']],
                ids=[doc_id],
                metadatas=[{
                    "file_name": row['file_name'],
                    "page": row['page_number'],
                    "chunk": row['chunk_number']
                }],
                embeddings=[embedding]
            )
        except Exception as e:
            print(f"❌ Error at row {idx}: {e}")
    print("✅ Chunks stored in ChromaDB using local embeddings.")


In [None]:
def query_chromadb_local(question, collection_name="drx_docs", persist_dir="./chroma_storage", top_k=1):
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    query_vector = embed_texts_local([question])[0]
    results = collection.query(query_embeddings=[query_vector], n_results=top_k)
    # print(results)

    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(f"📄 {meta['file_name']} (Page {meta['page']}, Chunk {meta['chunk']})")
        print(doc)
        print("------")


In [None]:
import ollama

def ask_mistral(question, context):
    prompt = f"""Answer the question based on the context below.

Context:
{context}

Question:
{question}

Answer:"""

    response = ollama.chat(
        model="mistral",
        messages=[{"role": "user", "content": prompt}]
    )
    return response['message']['content']


In [None]:
def rag_query(question, collection_name="drx_docs", persist_dir="./chroma_storage", top_k=5):
    # Embed the question
    query_vector = embed_texts_local([question])[0]

    # Query ChromaDB
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)
    results = collection.query(query_embeddings=[query_vector], n_results=top_k)

    # Concatenate top chunks
    retrieved_chunks = results["documents"][0]
    context = "\n\n".join(retrieved_chunks)

    # Generate answer with Ollama Mistral
    answer = ask_mistral(question, context)
    return answer


In [None]:
response = rag_query("summarize all papers?")
print("🤖 Mistral says:\n", response)