In [8]:
import os
import pandas as pd
import tiktoken
import fitz  # PyMuPDF
import docx
import csv

# Shared tiktoken encoder ("cl100k_base"); chunk_text uses it to encode and decode text.
tokenizer = tiktoken.get_encoding("cl100k_base")
# NOTE(review): not referenced in the visible cells; chunk_text carries its own default of 500.
max_tokens_per_chunk = 500

def chunk_text(text, max_tokens=500):
    """Split ``text`` into consecutive pieces of at most ``max_tokens`` tokens.

    The text is encoded with the module-level ``tokenizer``, the token list is
    cut into fixed-size windows, and each window is decoded back to a string.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.

    Returns:
        A list of decoded strings, one per window, in document order. If the
        text encodes to no tokens the list is empty.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    # A window of 0 crashes inside range(), and a negative window silently
    # produces no chunks at all (dropping the whole document), so reject both.
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    tokens = tokenizer.encode(text)
    return [
        tokenizer.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]

def extract_text_from_pdf(file_path):
    """Return the text of a PDF as a list with one string per page.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        A list containing ``page.get_text()`` for every page, in page order.
    """
    # Open through the context manager so the document handle is released
    # even if text extraction fails part-way through.
    with fitz.open(file_path) as doc:
        return [page.get_text() for page in doc]

def extract_text_from_docx(file_path):
    """Read a .docx file and return its paragraph text as a one-element list.

    Only ``paragraphs`` of the document is read; the paragraph texts are
    joined with newlines into a single string.

    Args:
        file_path: Path of the .docx file.

    Returns:
        A list holding exactly one string.
    """
    document = docx.Document(file_path)
    paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    body = "\n".join(paragraph_texts)
    return [body]

def extract_text_from_csv(file_path):
    """Return the raw contents of a CSV file as a one-element list.

    Args:
        file_path: Path of the CSV file.

    Returns:
        A list holding the whole file as a single string.
    """
    # "utf-8-sig" reads plain UTF-8 unchanged but strips a leading byte-order
    # mark, which would otherwise end up as "\ufeff" at the start of the text.
    with open(file_path, encoding="utf-8-sig") as handle:
        return [handle.read()]

def extract_text_from_excel(file_path):
    """Render every sheet of a workbook as text, one string per sheet.

    The workbook is loaded with ``sheet_name=None``, and the result is
    iterated as (sheet name, frame) pairs. Each string starts with a
    ``Sheet: <name>`` line followed by the frame printed without its index.

    Args:
        file_path: Path of the Excel file.

    Returns:
        A list of strings in the order the sheets are returned.
    """
    sheets = pd.read_excel(file_path, sheet_name=None)
    return [
        f"Sheet: {name}\n" + frame.to_string(index=False)
        for name, frame in sheets.items()
    ]

def extract_text(file_path):
    """Dispatch to the extractor that matches the file's extension.

    The extension is compared case-insensitively. Supported: ``.pdf``,
    ``.docx``, ``.csv``, ``.xls``, ``.xlsx`` and ``.xlsm``.

    Args:
        file_path: Path of the file to read.

    Returns:
        Whatever the selected extractor returns, or an empty list (after
        printing a notice) when the extension is not supported.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    if suffix == ".pdf":
        return extract_text_from_pdf(file_path)
    if suffix == ".docx":
        return extract_text_from_docx(file_path)
    if suffix == ".csv":
        return extract_text_from_csv(file_path)
    if suffix in (".xls", ".xlsx", ".xlsm"):
        return extract_text_from_excel(file_path)

    print(f"Unsupported file type: {suffix}")
    return []

def process_folder(folder_path):
    """Extract and chunk every regular file directly inside ``folder_path``.

    Each file is passed to ``extract_text``; every returned page is split with
    ``chunk_text``. Page and chunk numbers start at 1. Sub-directories are
    skipped, and a file that raises during processing is reported with a
    printed message and then skipped, so one bad file does not abort the run.

    Args:
        folder_path: Directory whose files should be processed.

    Returns:
        A DataFrame with the columns ``file_name``, ``page_number``,
        ``chunk_number`` and ``text``, one row per chunk. The columns are
        present even when no chunk was produced.
    """
    columns = ["file_name", "page_number", "chunk_number", "text"]
    records = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the row
    # order (and therefore the exported CSV) reproducible between runs.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue

        try:
            pages = extract_text(file_path)
            for page_num, text in enumerate(pages, 1):
                for chunk_num, chunk in enumerate(chunk_text(text), 1):
                    records.append({
                        "file_name": filename,
                        "page_number": page_num,
                        "chunk_number": chunk_num,
                        "text": chunk
                    })
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Error processing {filename}: {e}")

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=columns)

# 🔍 Usage
# The source folder can be overridden with the DRX_DATA_DIR environment variable so the
# notebook is not tied to one machine; the original local path stays the default.
folder_path = os.environ.get("DRX_DATA_DIR", "/Users/yasir/Desktop/Project/Dr.X Files")
df = process_folder(folder_path)
print(df.head())
df.to_csv("all_chunked_text.csv", index=False)


  warn(f"Print area cannot be set to Defined name: {defn.value}.")


                              file_name  page_number  chunk_number  \
0  Dataset summaries and citations.docx            1             1   
1  Dataset summaries and citations.docx            1             2   
2  Dataset summaries and citations.docx            1             3   
3  Dataset summaries and citations.docx            1             4   
4    Ocean_ecogeochemistry_A_review.pdf            1             1   

                                                text  
0  Table 1. Description of studies included in th...  
1  bock, Texas. Agronomy Journal, 112(1), 148–157...  
2  2010). Soil Organic Carbon Input from Urban Tu...  
3   (2018). Soil carbon and nitrogen accumulation...  
4  327\nOceanography and Marine Biology: An Annua...  


  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn(msg)


In [11]:
import os
import time
import pandas as pd
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import tiktoken
import ollama

# ========== Embedding Setup ==========
# Sentence-embedding model used by embed_texts_local.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
# Tokenizer used by log_performance to count the tokens of logged text.
tokenizer = tiktoken.get_encoding("cl100k_base")
# In-memory list of timing records; log_performance appends one dict per call.
performance_log = []

# ========== Performance Logger ==========
def log_performance(stage, text_or_token_list, start_time, end_time):
    """Record and print the token throughput of one timed operation.

    Args:
        stage: Label of the operation (for example "embedding"); stored in
            the log entry and used for grouping in the summary.
        text_or_token_list: What was processed. A string is tokenized with
            the module-level ``tokenizer``; a list whose first element is an
            int is treated as token ids and counted as-is; any other list is
            treated as a list of texts whose token counts are summed. An
            empty list counts as zero tokens.
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds; only ``end_time - start_time``
            is used.

    Returns:
        Tokens per second, or 0 when the elapsed time is not positive.
    """
    if isinstance(text_or_token_list, list):
        # Check emptiness before indexing: [] used to raise IndexError here.
        if not text_or_token_list or isinstance(text_or_token_list[0], int):
            total_tokens = len(text_or_token_list)
        else:
            # A list of texts cannot be passed to encode() in one call.
            total_tokens = sum(len(tokenizer.encode(piece)) for piece in text_or_token_list)
    else:
        total_tokens = len(tokenizer.encode(text_or_token_list))

    elapsed = end_time - start_time
    # Guard against division by zero when the timer did not advance.
    tokens_per_sec = total_tokens / elapsed if elapsed > 0 else 0

    log_entry = {
        "stage": stage,
        "tokens": total_tokens,
        "duration_sec": round(elapsed, 4),
        "tokens_per_sec": round(tokens_per_sec, 2),
        "timestamp": pd.Timestamp.now()
    }

    performance_log.append(log_entry)
    print(f"📊 [{stage}] {total_tokens} tokens in {elapsed:.2f}s → {tokens_per_sec:.2f} tokens/sec")

    return tokens_per_sec

def export_log(filename="performance_log.csv"):
    """Write all entries of ``performance_log`` to a CSV file.

    Args:
        filename: Destination path of the CSV; the index column is omitted.
    """
    pd.DataFrame(performance_log).to_csv(filename, index=False)
    print(f"✅ Performance log saved to {filename}")

def summarize_performance():
    """Print and return per-stage statistics of the logged throughput.

    Returns:
        A DataFrame with one row per stage holding the count, min, max and
        mean of ``tokens_per_sec``, or ``None`` when nothing has been logged.
    """
    log_frame = pd.DataFrame(performance_log)
    if not log_frame.empty:
        throughput_by_stage = log_frame.groupby("stage")["tokens_per_sec"]
        summary = throughput_by_stage.agg(["count", "min", "max", "mean"]).reset_index()
        print("\n📈 Performance Summary:")
        print(summary.to_string(index=False))
        return summary

    print("⚠️ No performance data to summarize.")
    return

# ========== Embedding Function ==========
def embed_texts_local(texts):
    """Embed ``texts`` with the shared ``embed_model`` and log the throughput.

    Args:
        texts: A list of strings; they are also joined with spaces to count
            the tokens reported to ``log_performance``.

    Returns:
        The result of ``embed_model.encode(texts)`` converted with
        ``.tolist()``; callers index it by input position.
    """
    # perf_counter() is monotonic and high-resolution, so sub-10 ms encodes
    # are not distorted by wall-clock adjustments the way time.time() can be.
    start = time.perf_counter()
    embeddings = embed_model.encode(texts).tolist()
    end = time.perf_counter()
    log_performance("embedding", " ".join(texts), start, end)
    return embeddings

# ========== Store to ChromaDB ==========
def store_df_in_chromadb_local(df, persist_dir="./chroma_storage", collection_name="docs"):
    """Embed every row of ``df`` and store it in a persistent Chroma collection.

    Each row is stored under the id ``<file_name>_<page_number>_<chunk_number>``
    with its text as the document and the file name, page and chunk numbers as
    metadata. A row that fails is reported and skipped so the remaining rows
    are still stored.

    Args:
        df: DataFrame with the columns ``file_name``, ``page_number``,
            ``chunk_number`` and ``text``.
        persist_dir: Directory handed to ``chromadb.PersistentClient``.
        collection_name: Collection to get or create.
    """
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    failed = 0
    for idx, row in df.iterrows():
        doc_id = f"{row['file_name']}_{row['page_number']}_{row['chunk_number']}"
        try:
            embedding = embed_texts_local([row['text']])[0]
            # upsert instead of add: re-running the cell against an existing
            # store refreshes the entry for an id rather than colliding with it.
            collection.upsert(
                documents=[row['text']],
                ids=[doc_id],
                metadatas=[{
                    "file_name": row['file_name'],
                    "page": row['page_number'],
                    "chunk": row['chunk_number']
                }],
                embeddings=[embedding]
            )
        except Exception as e:
            failed += 1
            print(f"❌ Error at row {idx}: {e}")

    # Only claim success when every row was actually stored.
    if failed:
        print(f"⚠️ Stored {len(df) - failed} of {len(df)} chunks; {failed} failed.")
    else:
        print("✅ Chunks stored in ChromaDB using local embeddings.")

# ========== Ask Mistral via Ollama ==========
def ask_mistral(question, context, model="mistral"):
    """Ask an Ollama chat model to answer ``question`` from ``context``.

    Args:
        question: The user question, inserted under "Question:" in the prompt.
        context: Supporting text, inserted under "Context:" in the prompt.
        model: Name of the Ollama model to query; defaults to "mistral".

    Returns:
        ``response['message']['content']`` of the chat response.
    """
    prompt = f"""Answer the question based on the context below.

Context:
{context}

Question:
{question}

Answer:"""
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments during a long generation.
    start = time.perf_counter()
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    end = time.perf_counter()
    # The logged token count covers the prompt only, not the generated answer.
    log_performance("RAG", prompt, start, end)
    return response['message']['content']

# ========== RAG Pipeline ==========
def rag_query(question, collection_name="docs", persist_dir="./chroma_storage", top_k=5):
    """Answer ``question`` from the chunks retrieved out of a Chroma collection.

    Args:
        question: The question to embed, look up and answer.
        collection_name: Collection to query (created if it does not exist).
        persist_dir: Directory handed to ``chromadb.PersistentClient``.
        top_k: Number of results requested from the collection.

    Returns:
        The answer returned by ``ask_mistral``.
    """
    question_embedding = embed_texts_local([question])[0]

    client = chromadb.PersistentClient(path=persist_dir)
    hits = client.get_or_create_collection(name=collection_name).query(
        query_embeddings=[question_embedding],
        n_results=top_k,
    )

    # One query embedding was sent, so only the first result list is used.
    context = "\n\n".join(hits["documents"][0])
    return ask_mistral(question, context)


📊 [embedding] 500 tokens in 0.13s → 3798.59 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 34050.76 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 35203.65 tokens/sec
📊 [embedding] 333 tokens in 0.01s → 25888.37 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 38928.42 tokens/sec
📊 [embedding] 249 tokens in 0.04s → 5739.21 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 30051.19 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 36352.72 tokens/sec
📊 [embedding] 143 tokens in 0.04s → 3604.46 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 34813.86 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 34578.51 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 33774.37 tokens/sec
📊 [embedding] 3 tokens in 0.04s → 70.55 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 34611.04 tokens/sec
📊 [embedding] 500 tokens in 0.02s → 31523.80 tokens/sec
📊 [embedding] 185 tokens in 0.04s → 4197.19 tokens/sec
📊 [embedding] 500 tokens in 0.01s → 34328.89 tokens/sec
📊 [embedding] 231 tokens in 0.04s → 5993.01 tokens/sec
📊 

In [12]:
# Run the RAG pipeline with its default collection and top_k, then show the answer.
response = rag_query("summarize all papers?")
print("🤖 Mistral says:\n", response)

📊 [embedding] 6 tokens in 0.15s → 39.19 tokens/sec
📊 [RAG] 1685 tokens in 106.13s → 15.88 tokens/sec
🤖 Mistral says:
  Here is a summary of the papers cited in the provided text:

1. Polunin et al., (2001) - The study's title and details are not provided, but it is assumed to be related to stress.
2. Stuck et al., (2001) - Again, no specifics about the study are given, but it is presumably related to stress as well.
3. Devenport & Bax, (2002); Hoekstra et al., (2002, 2003); Nyssen et al., (2002); Sato et al., (2002) - These four studies are not specified, but they are likely to be related to stress and coping mechanisms.
4. Schlitzer (2002); Smith et al. (2002); Bode et al. (2003, 2004, 2007); Das et al. (2003) - These studies do not have titles or specific details provided, but they are likely to be related to stress.
5. Estrada et al. (2003); Jennings & warr (2003); Kang et al. (2003); mcclelland et al. (2003); Quay et al. (2003); Schmidt et al. (2003); corbisier et al. (2004); mahaf

In [2]:
from sentence_transformers import SentenceTransformer

# Load once at module level so every call to embed_texts_local reuses the same model object.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_texts_local(texts):
    """Encode ``texts`` with the shared local ``embed_model``.

    Returns the encoder output converted with ``.tolist()``.
    """
    vectors = embed_model.encode(texts)
    return vectors.tolist()


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import chromadb
import pandas as pd

def store_df_in_chromadb_local(df, persist_dir="./chroma_storage", collection_name="drx_docs"):
    """Embed every row of ``df`` and store it in a persistent Chroma collection.

    Each row is stored under the id ``<file_name>_<page_number>_<chunk_number>``
    with its text as the document and the file name, page and chunk numbers as
    metadata. A row that fails is reported and skipped so the remaining rows
    are still stored.

    Args:
        df: DataFrame with the columns ``file_name``, ``page_number``,
            ``chunk_number`` and ``text``.
        persist_dir: Directory handed to ``chromadb.PersistentClient``.
        collection_name: Collection to get or create.
    """
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    failed = 0
    for idx, row in df.iterrows():
        doc_id = f"{row['file_name']}_{row['page_number']}_{row['chunk_number']}"
        try:
            embedding = embed_texts_local([row['text']])[0]
            # upsert instead of add: re-running the cell against an existing
            # store refreshes the entry for an id rather than colliding with it.
            collection.upsert(
                documents=[row['text']],
                ids=[doc_id],
                metadatas=[{
                    "file_name": row['file_name'],
                    "page": row['page_number'],
                    "chunk": row['chunk_number']
                }],
                embeddings=[embedding]
            )
        except Exception as e:
            failed += 1
            print(f"❌ Error at row {idx}: {e}")

    # Only claim success when every row was actually stored.
    if failed:
        print(f"⚠️ Stored {len(df) - failed} of {len(df)} chunks; {failed} failed.")
    else:
        print("✅ Chunks stored in ChromaDB using local embeddings.")


In [4]:
def query_chromadb_local(question, collection_name="drx_docs", persist_dir="./chroma_storage", top_k=1):
    """Print the stored chunks that best match ``question``.

    For each hit, a header line with file name, page and chunk number is
    printed, followed by the chunk text and a separator. Nothing is returned.

    Args:
        question: Text to embed and search for.
        collection_name: Collection to query (created if it does not exist).
        persist_dir: Directory handed to ``chromadb.PersistentClient``.
        top_k: Number of results requested from the collection.
    """
    client = chromadb.PersistentClient(path=persist_dir)
    collection = client.get_or_create_collection(name=collection_name)

    results = collection.query(
        query_embeddings=[embed_texts_local([question])[0]],
        n_results=top_k,
    )

    # One query embedding was sent, so only the first result list is used.
    documents = results["documents"][0]
    metadatas = results["metadatas"][0]
    for document, metadata in zip(documents, metadatas):
        header = f"📄 {metadata['file_name']} (Page {metadata['page']}, Chunk {metadata['chunk']})"
        print(header)
        print(document)
        print("------")


In [5]:
import ollama

def ask_mistral(question, context, model="mistral"):
    """Ask an Ollama chat model to answer ``question`` from ``context``.

    Args:
        question: The user question, inserted under "Question:" in the prompt.
        context: Supporting text, inserted under "Context:" in the prompt.
        model: Name of the Ollama model to query; defaults to "mistral".

    Returns:
        ``response['message']['content']`` of the chat response.
    """
    prompt = f"""Answer the question based on the context below.

Context:
{context}

Question:
{question}

Answer:"""

    # The whole prompt is sent as a single user message.
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response['message']['content']


In [6]:
def rag_query(question, collection_name="drx_docs", persist_dir="./chroma_storage", top_k=5):
    """Retrieve the chunks closest to ``question`` and let the model answer.

    Args:
        question: The question to embed, look up and answer.
        collection_name: Collection to query (created if it does not exist).
        persist_dir: Directory handed to ``chromadb.PersistentClient``.
        top_k: Number of results requested from the collection.

    Returns:
        The answer returned by ``ask_mistral``.
    """
    # Step 1: turn the question into a single embedding.
    question_embedding = embed_texts_local([question])[0]

    # Step 2: look up the nearest stored chunks.
    client = chromadb.PersistentClient(path=persist_dir)
    docs_collection = client.get_or_create_collection(name=collection_name)
    hits = docs_collection.query(query_embeddings=[question_embedding], n_results=top_k)

    # Step 3: build the context from the first (only) result list and ask the model.
    top_documents = hits["documents"][0]
    return ask_mistral(question, "\n\n".join(top_documents))


In [7]:
# Run the RAG pipeline with its default collection and top_k, then show the answer.
response = rag_query("summarize all papers?")
print("🤖 Mistral says:\n", response)

🤖 Mistral says:
  This summary provides an overview of the papers cited in the course materials for the Stress Management and Paulo Coelho's books:

* Polunin et al., Stuck et al., Devenport & Bax, Hoekstra et al. (2002, 2003), Nyssen et al., Sato et al., Schlitzer, Smith et al., Bode et al. (2003, 2004, 2007), Das et al., Estrada et al., Jennings & Warr, Kang et al., mcclelland et al., Quay et al., Schmidt et al., corbisier et al., mahaffey et al. (2004), abed-Navandi & Dworschak (2005), Iken et al., Kiriakoulakis et al. (2005), le loc'h & Hily (2005), Quillfeldt et al. (2005), Sommer et al. (2005), galimov et al. (2006), goni et al. (2006), Tamelander et al. (2006), carlier et al. (2007), Holl et al. (2007), cianco et al. (2008), Harmelin-vivien et al. (2008), lamb & Swart (2008), le loc'h et al. (2008), Petursdottir et al. (2008, 2010), Fanelli et al. (2009, 2011), Frederich et al. (2009), Hirch (2009), lysiak (2009), Richoux & Froneman (2009), laakmann & auel (2010), miller et al. 