In [None]:
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Download & save MiniLM
model = SentenceTransformer('all-MiniLM-L6-v2')
model.save('chatbot/models/minilm')

# Download & save MPNet
model = SentenceTransformer('all-mpnet-base-v2')
model.save('chatbot/models/mpnet')

# Download & save T5-small
t5 = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
t5.save_pretrained('chatbot/models/t5_model')
tokenizer.save_pretrained('chatbot/models/t5_model')


In [4]:
import os

def list_directory_tree_with_os_walk(starting_directory):
    for root, directories, files in os.walk(starting_directory):
        print(f"Directory: {root}")
        for file in files:
            print(f"  File: {file}")

In [5]:
list_directory_tree_with_os_walk('.')

Directory: .
  File: AeroViz.py
  File: foo.ipynb
  File: icon.png
  File: outstanding.txt
  File: README.md
  File: requirements.txt
  File: runtime.txt
  File: stylesheet.py
Directory: .\.streamlit
  File: config.toml
Directory: .\chatbot
  File: retrieval.py
  File: summarizer.py
  File: __init__.py
Directory: .\chatbot\data
  File: embedded_papers.json
  File: faiss.index
Directory: .\chatbot\models
Directory: .\chatbot\models\minilm
  File: config.json
  File: config_sentence_transformers.json
  File: model.safetensors
  File: modules.json
  File: README.md
  File: sentence_bert_config.json
  File: special_tokens_map.json
  File: tokenizer.json
  File: tokenizer_config.json
  File: vocab.txt
Directory: .\chatbot\models\minilm\1_Pooling
  File: config.json
Directory: .\chatbot\models\minilm\2_Normalize
Directory: .\chatbot\models\mpnet
  File: config.json
  File: config_sentence_transformers.json
  File: model.safetensors
  File: modules.json
  File: README.md
  File: sentence_bert

In [16]:
import os
import json
import faiss
import fitz  # PyMuPDF
import numpy as np
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import nltk
nltk.download('punkt')

# --- Constants --- #
MODEL_PATH = "chatbot/models/minilm"
RAW_BASE_DIR = "chatbot/papers_raw"
OUT_BASE_DIR = "chatbot/data"

# --- Helpers --- #
def chunk_text(text, chunk_size=5):
    sentences = sent_tokenize(text)
    return [" ".join(sentences[i:i+chunk_size]) for i in range(0, len(sentences), chunk_size)]

def pdf_to_text(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join(page.get_text() for page in doc)

# --- Load Model --- #
embedder = SentenceTransformer(MODEL_PATH)

# --- Process Each Topic Folder --- #
for topic in os.listdir(RAW_BASE_DIR):
    topic_dir = os.path.join(RAW_BASE_DIR, topic)
    if not os.path.isdir(topic_dir):
        continue

    print(f"\n🔍 Processing topic: {topic}")
    doc_chunks = {}
    all_embeddings = []
    chunk_id = 0

    # Convert PDFs to .txt
    for file in os.listdir(topic_dir):
        filepath = os.path.join(topic_dir, file)

        if file.endswith(".pdf"):
            print(f"📄 Converting {file}...")
            text = pdf_to_text(filepath)
            txt_path = filepath.replace(".pdf", ".txt")
            with open(txt_path, "w", encoding="utf-8") as f:
                f.write(text)
            print(f"✅ Saved text to {txt_path}")

    # Read all .txt files (original or converted)
    for file in os.listdir(topic_dir):
        if not file.endswith(".txt"):
            continue
        with open(os.path.join(topic_dir, file), "r", encoding="utf-8") as f:
            text = f.read()
            chunks = chunk_text(text)
            embeddings = embedder.encode(chunks)

            for emb, chunk in zip(embeddings, chunks):
                doc_chunks[str(chunk_id)] = chunk
                all_embeddings.append(emb)
                chunk_id += 1

    # Save output
    out_dir = os.path.join(OUT_BASE_DIR, topic)
    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, "embedded_papers.json"), "w", encoding="utf-8") as f:
        json.dump(doc_chunks, f)

    index = faiss.IndexFlatL2(len(all_embeddings[0]))
    index.add(np.array(all_embeddings).astype("float32"))
    faiss.write_index(index, os.path.join(out_dir, "faiss.index"))

    print(f"✅ Saved index for topic '{topic}' with {chunk_id} chunks.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ayode\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!



🔍 Processing topic: blisks
📄 Converting Bladed Disks VKI Roque Coral.pdf...
✅ Saved text to chatbot/papers_raw\blisks\Bladed Disks VKI Roque Coral.txt
✅ Saved index for topic 'blisks' with 128 chunks.

🔍 Processing topic: cbt_flutter
📄 Converting CBT Flutter Part 1 Notes VKI.pdf...
✅ Saved text to chatbot/papers_raw\cbt_flutter\CBT Flutter Part 1 Notes VKI.txt
✅ Saved index for topic 'cbt_flutter' with 91 chunks.


In [17]:
# 📁 project_root/
# ├── app.py
# ├── models/
# │   ├── minilm/  # from HuggingFace 'all-MiniLM-L6-v2'
# │   └── mpnet/   # from HuggingFace 'all-mpnet-base-v2'
# ├── data/
# │   ├── embedded_papers.json
# │   └── papers_raw/  # optional: raw text versions of PDFs
# ├── summarizer/
# │   └── t5_model/  # T5-small model
# └── requirements.txt

import streamlit as st
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer
import faiss
import json
import numpy as np

# --- Sidebar Model Toggle --- #
model_choice = st.sidebar.selectbox("Choose embedding model", ["MiniLM", "MPNet"])
model_path = f"chatbot/models/{'minilm' if model_choice == 'MiniLM' else 'mpnet'}"

st.title("📚 Research Assistant Chatbot")

# --- Load Embedding Model --- #
st.write(f"Loading embedding model: {model_choice}")
embedder = SentenceTransformer(model_path)

# --- Load FAISS index + chunk mapping --- #
st.write("Loading embedded documents...")
with open("chatbot/data/embedded_papers.json", "r") as f:
    doc_chunks = json.load(f)

index = faiss.read_index("data/faiss.index")

# --- Load T5 summarizer --- #
summarizer_model = T5ForConditionalGeneration.from_pretrained("summarizer/t5_model")
summarizer_tokenizer = T5Tokenizer.from_pretrained("summarizer/t5_model")

# --- User Input --- #
query = st.text_input("Ask a question about the papers:")

if query:
    
    query_vec = embedder.encode([query])
    D, I = index.search(np.array(query_vec).astype("float32"), k=3)

    # Retrieve top chunks
    relevant_chunks = [doc_chunks[str(i)] for i in I[0]]

    # Concatenate for summarizer input
    input_text = " ".join(relevant_chunks)
    input_ids = summarizer_tokenizer.encode("summarize: " + input_text, return_tensors="pt", max_length=512, truncation=True)

    summary_ids = summarizer_model.generate(input_ids, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    output = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    st.subheader("Answer:")
    st.write(output)




FileNotFoundError: [Errno 2] No such file or directory: 'chatbot/data/embedded_papers.json'

In [18]:
pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.10.1-cp311-cp311-win_amd64.whl (8.1 MB)
     ---------------------------------------- 8.1/8.1 MB 6.1 MB/s eta 0:00:00
Collecting contourpy>=1.0.1
  Downloading contourpy-1.3.1-cp311-cp311-win_amd64.whl (219 kB)
     -------------------------------------- 219.8/219.8 kB 4.5 MB/s eta 0:00:00
Collecting cycler>=0.10
  Downloading cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting fonttools>=4.22.0
  Downloading fonttools-4.56.0-cp311-cp311-win_amd64.whl (2.2 MB)
     ---------------------------------------- 2.2/2.2 MB 6.1 MB/s eta 0:00:00
Collecting kiwisolver>=1.3.1
  Downloading kiwisolver-1.4.8-cp311-cp311-win_amd64.whl (71 kB)
     ---------------------------------------- 72.0/72.0 kB 4.1 MB/s eta 0:00:00
Collecting pyparsing>=2.3.1
  Downloading pyparsing-3.2.3-py3-none-any.whl (111 kB)
     -------------------------------------- 111.1/111.1 kB 6.7 MB/s eta 0:00:00
Installing collected packages: pyparsing, kiwisolver, fonttools, cycler


[notice] A new release of pip available: 22.3 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
