In [1]:
import os
import pandas as pd
import tiktoken
import fitz  # PyMuPDF
import docx
import csv
from tqdm import tqdm  # 👈 import tqdm

tokenizer = tiktoken.get_encoding("cl100k_base")
max_tokens_per_chunk = 500

def chunk_text(text, max_tokens=500):
    tokens = tokenizer.encode(text)
    return [tokenizer.decode(tokens[i:i + max_tokens]) for i in range(0, len(tokens), max_tokens)]

def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    return [page.get_text() for page in doc]

def extract_text_from_docx(file_path):
    doc = docx.Document(file_path)
    return ["\n".join(para.text for para in doc.paragraphs)]

def extract_text_from_csv(file_path):
    with open(file_path, encoding="utf-8") as f:
        return [f.read()]

def extract_text_from_excel(file_path):
    df = pd.read_excel(file_path, sheet_name=None)
    pages = []
    for sheet, data in df.items():
        pages.append(f"Sheet: {sheet}\n" + data.to_string(index=False))
    return pages

def extract_text(file_path):
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        return extract_text_from_pdf(file_path)
    elif ext == ".docx":
        return extract_text_from_docx(file_path)
    elif ext in [".csv"]:
        return extract_text_from_csv(file_path)
    elif ext in [".xls", ".xlsx", ".xlsm"]:
        return extract_text_from_excel(file_path)
    else:
        print(f"Unsupported file type: {ext}")
        return []

def process_folder(folder_path):
    all_chunks = []
    file_list = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]

    for filename in tqdm(file_list, desc="📄 Processing files"):
        file_path = os.path.join(folder_path, filename)

        try:
            pages = extract_text(file_path)
            for page_num, text in enumerate(pages, 1):
                chunks = chunk_text(text)
                for chunk_num, chunk in enumerate(chunks, 1):
                    all_chunks.append({
                        "file_name": filename,
                        "page_number": page_num,
                        "chunk_number": chunk_num,
                        "text": chunk
                    })
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")

    return pd.DataFrame(all_chunks)

# 🔍 Usage
folder_path = "/Users/yasir/Desktop/Project/Dr.X Files"
df = process_folder(folder_path)
print(df.head())
df.to_csv("all_chunked_text.csv", index=False)


  warn(f"Print area cannot be set to Defined name: {defn.value}.")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn(msg)
📄 Processing files: 100%|██████████████████████| 10/10 [00:00<00:00, 12.88it/s]

                              file_name  page_number  chunk_number  \
0  Dataset summaries and citations.docx            1             1   
1  Dataset summaries and citations.docx            1             2   
2  Dataset summaries and citations.docx            1             3   
3  Dataset summaries and citations.docx            1             4   
4    Ocean_ecogeochemistry_A_review.pdf            1             1   

                                                text  
0  Table 1. Description of studies included in th...  
1  bock, Texas. Agronomy Journal, 112(1), 148–157...  
2  2010). Soil Organic Carbon Input from Urban Tu...  
3   (2018). Soil carbon and nitrogen accumulation...  
4  327\nOceanography and Marine Biology: An Annua...  





In [2]:
import os
import time
import pandas as pd
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
import tiktoken
import ollama
import re
from tqdm import tqdm  # ✅ Added tqdm for progress bars

# ========== Embedding Setup ==========
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
tokenizer = tiktoken.get_encoding("cl100k_base")
performance_log = []

# ========== Performance Logger ==========
def log_performance(stage, text_or_token_list, start_time, end_time):
    if isinstance(text_or_token_list, list) and isinstance(text_or_token_list[0], int):
        total_tokens = len(text_or_token_list)
    else:
        total_tokens = len(tokenizer.encode(text_or_token_list))

    elapsed = end_time - start_time
    tokens_per_sec = total_tokens / elapsed if elapsed > 0 else 0

    log_entry = {
        "stage": stage,
        "tokens": total_tokens,
        "duration_sec": round(elapsed, 4),
        "tokens_per_sec": round(tokens_per_sec, 2),
        "timestamp": pd.Timestamp.now()
    }

    performance_log.append(log_entry)
    print(f"📊 [{stage}] {total_tokens} tokens in {elapsed:.2f}s → {tokens_per_sec:.2f} tokens/sec")

    return tokens_per_sec

def export_log(filename="performance_log.csv"):
    df = pd.DataFrame(performance_log)
    df.to_csv(filename, index=False)
    print(f"✅ Performance log saved to {filename}")

def summarize_performance():
    df = pd.DataFrame(performance_log)
    if df.empty:
        print("⚠️ No performance data to summarize.")
        return
    summary = df.groupby("stage")["tokens_per_sec"].agg(["count", "min", "max", "mean"]).reset_index()
    print("\n📈 Performance Summary:")
    print(summary.to_string(index=False))
    return summary

# ========== Preprocessing DataFrame ==========
def remove_tables_and_numeric_lines(text):
    lines = text.splitlines()
    filtered_lines = []
    tables_and_numbers = []

    for line in lines:
        stripped = line.strip().lower()

        # Skip short chemical/symbol lines like "CO", "⇌", "+", etc.
        if len(line.strip()) < 10 and re.match(r"^[A-Za-z0-9()+−⇌\\s]*$", line):
            tables_and_numbers.append(line)
            continue
        # Remove lines like '.348136          NaN                                                              NaN'
        if re.fullmatch(r"[.\d\s]+nan\s*nan", line.strip().lower()):
            tables_and_numbers.append(line)
            continue
        # Remove lines like '.348136          NaN    NaN' or 'NaN                  0'
        if re.fullmatch(r"[.\d\s]*nan\s*\d*", line.strip().lower()):
            tables_and_numbers.append(line)
            continue


        # Skip lines that are too short and contain mostly numbers
        if len(line.strip()) < 40 and sum(c.isdigit() for c in line) / max(len(line), 1) > 0.3:
            tables_and_numbers.append(line)
            continue
        stripped = line.strip().lower()

        numbers = re.findall(r"\d", line)
        if len(numbers) / max(len(line), 1) > 0.4:
            tables_and_numbers.append(line)
            continue

        if re.search(r"\t", line) or re.search(r"\|", line):
            tables_and_numbers.append(line)
            continue

        if len(re.findall(r"\s{2,}", line)) > 2:
            tables_and_numbers.append(line)
            continue

        if any(keyword in stripped for keyword in ["author", "editor", "publisher", "published by", "phone", "email", "contact", "address"]):
            continue

        if re.search(r"\b\w+@\w+\.\w+\b", line):
            continue
        if re.search(r"\+?\d[\d\s().-]{7,}\d", line):
            continue

        filtered_lines.append(line)

    return "\n".join(filtered_lines), "\n".join(tables_and_numbers)
    
# ========== DataFrame Cleaning ==========
def clean_dataframe_text(df):
    df = df.dropna(subset=['text'])  # 🧹 Drop rows where 'text' is NaN
    df = df[df['text'].str.strip().astype(bool)]  # 🧹 Drop rows where 'text' is empty or only whitespace
    df = df[df['text'].str.len() >= 10]  # 🧹 Drop rows where 'text' is too short
    df = df.dropna(subset=['text'])  # 🧹 Drop rows where 'text' is NaN
    df = df.dropna(subset=['text'])  # 🧹 Drop rows where 'text' is NaN
    df = df[df['text'].str.strip().astype(bool)]  # 🧹 Drop rows where 'text' is empty or only whitespace
    df = df[df['text'].str.strip().astype(bool)]  # 🧹 Drop rows where 'text' is empty or only whitespace
    cleaned_rows = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc="🧹 Cleaning text in DataFrame"):
        cleaned_text, _ = remove_tables_and_numeric_lines(row['text'])
        cleaned_rows.append({
            "file_name": row['file_name'],
            "page_number": row['page_number'],
            "chunk_number": row['chunk_number'],
            "text": cleaned_text
        })
        cleaned_df = pd.DataFrame(cleaned_rows)
    cleaned_df = cleaned_df[cleaned_df['text'].str.strip().astype(bool)]  # 🧹 Drop rows with empty cleaned text
    return cleaned_df



# ========== Embedding Function ==========
def embed_texts_local(texts):
    start = time.time()
    embeddings = embed_model.encode(texts).tolist()
    end = time.time()
    log_performance("embedding", " ".join(texts), start, end)
    return embeddings

# ========== Store to ChromaDB ==========
def store_df_in_chromadb_local(df, persist_dir="./chroma_storage", collection_name="cleaned_docs"):
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="🔁 Storing in ChromaDB"):
        doc_id = f"{row['file_name']}_{row['page_number']}_{row['chunk_number']}"
        try:
            embedding = embed_texts_local([row['text']])[0]
            collection.add(
                documents=[row['text']],
                ids=[doc_id],
                metadatas=[{
                    "file_name": row['file_name'],
                    "page": row['page_number'],
                    "chunk": row['chunk_number']
                }],
                embeddings=[embedding]
            )
        except Exception as e:
            print(f"❌ Error at row {idx}: {e}")
    print("✅ Chunks stored in ChromaDB using local embeddings.")

# ========== Ask Mistral via Ollama ==========
def ask_mistral(question, context):
    prompt = f""""Write a clear, formal, human-like summary in academic tone.
    
Context:
{context}

Question:
{question}

Answer:"""
    start = time.time()
    response = ollama.chat(
        model="mistral",
        messages=[{"role": "user", "content": prompt}]
    )
    end = time.time()
    log_performance("RAG", prompt, start, end)
    return response['message']['content']



# ========== RAG Pipeline ==========
def rag_query(question, collection_name="cleaned_docs", persist_dir="./chroma_storage", top_k=1):
    query_vector = embed_texts_local([question])[0]

    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)
    results = collection.query(query_embeddings=[query_vector], n_results=top_k)

    retrieved_chunks = results["documents"][0]
    context = "\n\n".join(retrieved_chunks)

    return ask_mistral(question, context)

# ========== Summarize All Documents ==========
def summarize_all_documents(collection_name="cleaned_docs", persist_dir="./chroma_storage"):
    """
    Summarize all chunks grouped by document ID.
    """
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    results = collection.get(include=["documents", "metadatas"])
    summaries_by_file = {}

    for doc, meta in zip(results["documents"], results["metadatas"]):
        file_name = meta.get("file_name", "unknown")
        text = doc if isinstance(doc, str) else doc[0]
        if file_name not in summaries_by_file:
            summaries_by_file[file_name] = []
        summaries_by_file[file_name].append(text)

    final_summaries = {}
    for file, chunks in tqdm(list(summaries_by_file.items())[:1], desc="🧠 Summarizing files"):
        combined_text = "\n\n".join(chunks)
        question = """
        - Summarize using concise **bullet points** or **brief paragraphs**.
        - Emphasize the **most important aspects**.
        - Maintain an **academic tone** and ensure **clarity** throughout.
        - Avoid unnecessary details while capturing the **core message**.
        """
        summary = ask_mistral(question, combined_text)
        final_summaries[file] = summary



    return final_summaries
#  # uncomment these two lines to store data in chromadb
# clean_df= clean_dataframe_text(df) 
# store_df_in_chromadb_local(clean_df)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from rouge_score import rouge_scorer

# ========== Evaluate Summaries with ROUGE ==========
def evaluate_summaries_with_rouge(generated_summaries: dict, reference_summaries: dict):
    """
    Compare generated summaries to reference summaries using ROUGE scores.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    evaluation = {}

    for doc_id, generated in tqdm(generated_summaries.items(), desc="📏 Evaluating ROUGE"):
        reference = reference_summaries.get(doc_id, "")
        if not reference:
            print(f"⚠️ No reference summary found for document: {doc_id}")
            continue

        scores = scorer.score(generated, reference)
        evaluation[doc_id] = {
            metric: {
                "precision": round(scores[metric].precision, 4),
                "recall": round(scores[metric].recall, 4),
                "f1": round(scores[metric].fmeasure, 4)
            } for metric in scores
        }

    return evaluation


In [4]:
# ========== Preloaded Reference Summaries ==========
reference_summaries = {
    "Dataset summaries and citations.docx": """
    1. Numerous studies have investigated carbon accumulation and nitrogen cycling in various urban land uses including residential soils, golf courses, and home lawns.
    2. Some research emphasizes specific biophysical factors influencing soil carbon, such as:
       - Microbial processes (e.g., Shi et al., 2012),
       - Management intensity and duration (e.g., Wang et al., 2014),
       - Historical land use changes (e.g., Raccanello et al., 2011).
    3. A subset of studies analyzes regional influences on carbon sequestration, examining how soil characteristics and climatic conditions affect carbon dynamics in urban ecosystems (e.g., Selhorst & Lal, 2011; Smith et al., 2018).
    4. Comparative assessments have been conducted to evaluate differences in carbon sequestration across urban land typologies, including turfgrass systems and residential zones with varied development histories.
    5. Findings indicate that urban landscapes can serve as significant carbon sinks, occasionally surpassing the sequestration potential of rural or forested lands (Selhorst & Lal, 2011; Trammell et al., 2020).
    6. Despite their potential benefits, urban environments also contribute to greenhouse gas emissions, especially from impervious surfaces and vehicular traffic (Townsend-Small & Czimczik, 2010).
    7. Research suggests that residential lawns have a variable carbon sequestration potential, heavily influenced by climate, management practices, and underlying soil properties.
    8. In conclusion, while urban landscapes offer valuable opportunities for climate change mitigation through carbon storage, maximizing their potential requires integrated land management strategies that simultaneously limit greenhouse gas emissions.
    """
}


In [9]:
import datetime
import uuid

# 🔑 Generate or retrieve session ID
session_id = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + str(uuid.uuid4())[:8]

persist_dir="./chroma_storage"
chroma_client = chromadb.PersistentClient(path=persist_dir)
collection_chat_history = chroma_client.get_or_create_collection("chat_history")

def get_full_chat_history(session_id):
    
    results = collection_chat_history.get(
        where={"session_id": session_id},
        include=["documents", "metadatas"]
    )
    documents = results.get("documents", [])
    # Sort by timestamp if available
    metadatas = results.get("metadatas", [])
    if metadatas and "timestamp" in metadatas[0]:
        combined = sorted(zip(documents, metadatas), key=lambda x: x[1]["timestamp"])
        return [doc for doc, _ in combined]
    return documents


def store_chat_turn(session_id, user_msg, bot_msg):
    full_turn = f"User: {user_msg}\nAssistant: {bot_msg}"
    embedding = embed_model.encode(full_turn).tolist()
    doc_id = str(uuid.uuid4())
    collection_chat_history.add(
        documents=[full_turn],
        embeddings=[embedding],
        ids=[doc_id],
        metadatas=[{
            "session_id": session_id,
            "timestamp": datetime.datetime.now().isoformat()
        }]
    )
def get_chat_history(session_id, top_k=5):
    results = collection_chat_history.query(
        query_texts=["latest topic continuation"],  # or current user query
        n_results=top_k,
        where={"session_id": session_id}
    )
    return results["documents"][0]  # list of retrieved turns


In [17]:

# 🧠 Retrieve full session chat
chat_history_snippets = get_full_chat_history(session_id)
full_context = "\n\n".join(chat_history_snippets)
question = """
        - Summarize using concise **bullet points** or **brief paragraphs**.
        - Emphasize the **most important aspects**.
        - Maintain an **academic tone** and ensure **clarity** throughout.
        - Avoid unnecessary details while capturing the **core message**.
        """
# 🤖 Generate refined summary
prompt = f"{full_context}\nUser: {question}\nAssistant:"
response = ask_mistral(question, prompt)
print(prompt)


# store chat history
store_chat_turn(session_id,question, response)

# 📝 Wrap the response for ROUGE evaluation
filename = "Dataset summaries and citations.docx"  # or dynamically match the file
refined_summary = {filename: response}

# 📊 Evaluate the new summary
rouge_result = evaluate_summaries_with_rouge(refined_summary, reference_summaries)

# 📢 Display
print(f"\n📊 ROUGE Evaluation for Final Refined Summary:")
for metric, values in rouge_result[filename].items():
    print(f"{metric.upper():<10} Precision: {values['precision']:<10.4f} Recall: {values['recall']:<10.4f} F1: {values['f1']:<10.4f}")


📊 [RAG] 1848 tokens in 42.88s → 43.10 tokens/sec
User: Summarize all documents
Assistant:  The summarized account of the provided documents indicates a comprehensive investigation into the effects of climate change on various ecosystems and potential solutions to mitigate its impact. The documents are categorized as follows:

1. Document A discusses the significance of mangroves in carbon sequestration and their role in reducing greenhouse gas emissions, emphasizing their importance in maintaining coastal ecosystem health and combating climate change.

2. Document B examines the effects of increasing temperatures on terrestrial ecosystems, focusing on changes in species distribution patterns, shifts in plant phenology, and potential risks to biodiversity due to habitat destruction.

3. Document C explores renewable energy technologies as a means of reducing fossil fuel consumption, discussing their applications, advantages, challenges, and potential for widespread adoption in mitigatin

📏 Evaluating ROUGE: 100%|████████████████████████| 1/1 [00:00<00:00, 49.46it/s]


📊 ROUGE Evaluation for Final Refined Summary:
ROUGE1     Precision: 0.2423     Recall: 0.3716     F1: 0.2933    
ROUGE2     Precision: 0.0310     Recall: 0.0476     F1: 0.0375    
ROUGEL     Precision: 0.1101     Recall: 0.1689     F1: 0.1333    





In [None]:
# ----------------xxxxxxxxxxxx-----------------

In [8]:

# ========== Example Evaluation Run ==========


generated_summaries = summarize_all_documents()

results = evaluate_summaries_with_rouge(res, reference_summaries)
question= "Summarize all documents"
store_chat_turn(session_id,question, generated_summaries)
for filename, scores in results.items():
    print(f"\nROUGE Scores for: {filename}\n")
    print(f"{'Metric':<10} {'Precision':<10} {'Recall':<10} {'F1 Score':<10}")
    for metric, values in scores.items():
        print(f"{metric.upper():<10} {values['precision']:<10.4f} {values['recall']:<10.4f} {values['f1']:<10.4f}")
context_snippets = get_chat_history(session_id)
full_context = "\n".join(context_snippets)
prompt = f"{full_context}\nUser: {question}\nAssistant:"
# res = ask_mistral(question, prompt)


In [None]:
def summarize_all_documents(collection_name="docs", persist_dir="./chroma_storage", max_chunks=5):
    """
    Summarize a limited number of stored chunks to reduce load and return a list of summaries.
    """
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    results = collection.get(include=["documents"])
    all_chunks = results.get("documents", [])

    print(f"🧠 Summarizing up to {max_chunks} chunks...")
    summaries = []
    for idx, chunk in enumerate(all_chunks[:max_chunks]):
        if isinstance(chunk, str) and chunk.strip():
            summary = ask_mistral("Summarize the following document chunk.", chunk)
            summaries.append(summary)

    return summaries

In [None]:
from sentence_transformers import SentenceTransformer

# Load once
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_texts_local(texts):
    """Embed text using a local SentenceTransformer model."""
    return embed_model.encode(texts).tolist()


In [None]:
import chromadb
import pandas as pd

def store_df_in_chromadb_local(df, persist_dir="./chroma_storage", collection_name="drx_docs"):
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    for idx, row in df.iterrows():
        doc_id = f"{row['file_name']}_{row['page_number']}_{row['chunk_number']}"
        try:
            embedding = embed_texts_local([row['text']])[0]
            collection.add(
                documents=[row['text']],
                ids=[doc_id],
                metadatas=[{
                    "file_name": row['file_name'],
                    "page": row['page_number'],
                    "chunk": row['chunk_number']
                }],
                embeddings=[embedding]
            )
        except Exception as e:
            print(f"❌ Error at row {idx}: {e}")
    print("✅ Chunks stored in ChromaDB using local embeddings.")


In [None]:
def query_chromadb_local(question, collection_name="drx_docs", persist_dir="./chroma_storage", top_k=1):
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)

    query_vector = embed_texts_local([question])[0]
    results = collection.query(query_embeddings=[query_vector], n_results=top_k)
    # print(results)

    for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
        print(f"📄 {meta['file_name']} (Page {meta['page']}, Chunk {meta['chunk']})")
        print(doc)
        print("------")


In [None]:
import ollama

def ask_mistral(question, context):
    prompt = f"""Answer the question based on the context below.

Context:
{context}

Question:
{question}

Answer:"""

    response = ollama.chat(
        model="mistral",
        messages=[{"role": "user", "content": prompt}]
    )
    return response['message']['content']


In [None]:
def rag_query(question, collection_name="drx_docs", persist_dir="./chroma_storage", top_k=5):
    # Embed the question
    query_vector = embed_texts_local([question])[0]

    # Query ChromaDB
    chroma_client = chromadb.PersistentClient(path=persist_dir)
    collection = chroma_client.get_or_create_collection(name=collection_name)
    results = collection.query(query_embeddings=[query_vector], n_results=top_k)

    # Concatenate top chunks
    retrieved_chunks = results["documents"][0]
    context = "\n\n".join(retrieved_chunks)

    # Generate answer with Ollama Mistral
    answer = ask_mistral(question, context)
    return answer


In [None]:
response = rag_query("summarize all papers?")
print("🤖 Mistral says:\n", response)