In [12]:
!pip install langchain langchain-community langchain-text-splitters chromadb faiss-cpu transformers sentence-transformers accelerate langchain_chroma langchain_huggingface pypdf

Collecting pypdf
  Downloading pypdf-6.2.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.2.0-py3-none-any.whl (326 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m326.6/326.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.2.0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os
import json
from tqdm import tqdm
from langchain_community.document_loaders import PyPDFLoader, TextLoader, WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document
import hashlib

In [24]:
# initialize embedding model
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True, "batch_size": 256},
)

In [29]:
# config
persistDir = "content/drive/MyDrive/bytestrike all data/chroma_store/"
dataDir = "/content/drive/MyDrive/bytestrike all data/dataToEmbed/"

driveDataDir = os.path.join(dataDir, "ByteStrikeDrive/")
website_urls = ["https://byte-strike.com/"]
emailsJson = os.path.join(dataDir, "emailsCleaned/cleaned_emails.json")
slackDir = os.path.join(dataDir, "cleanedSlack/")
minutesLinkDir = os.path.join(dataDir, "minuteslink/")

In [8]:
#website
web_loader = WebBaseLoader(website_urls)
web_docs = web_loader.load()
for d in web_docs:
    d.metadata["source"] = "website"
print(f"Loaded {len(web_docs)} pages from website.")


Loaded 1 pages from website.


In [13]:
#google docs
pdf_docs = []
for filename in os.listdir(driveDataDir):
    if filename.lower().endswith(".pdf"):
        loader = PyPDFLoader(os.path.join(driveDataDir, filename))
        docs = loader.load()
        for d in docs:
            d.metadata["source"] = "pdf"
            d.metadata["filename"] = filename
        pdf_docs.extend(docs)
print(f"Loaded {len(pdf_docs)} PDF documents.")

Loaded 387 PDF documents.


In [14]:
#emails
json_docs = []
if os.path.exists(emailsJson):
    print(f"Loading JSON file: {emailsJson}")
    with open(emailsJson, "r", encoding="utf-8") as f:
        data = json.load(f)

    for i, msg in enumerate(data):
        body_text = msg.get("body", "").strip()
        if body_text:
            json_docs.append(Document(
                page_content=body_text,
                metadata={
                    "source": "email",
                    "from": msg.get("from", ""),
                    "to": msg.get("to", ""),
                    "email_id": i
                }
            ))
    print(f"Loaded {len(json_docs)} email documents.")
else:
    print(f"JSON file not found: {emailsJson}")

Loading JSON file: /content/drive/MyDrive/bytestrike all data/dataToEmbed/emailsCleaned/cleaned_emails.json
Loaded 5983 email documents.


In [17]:
#minuteslink
minutes_docs = []
for filename in os.listdir(minutesLinkDir):
    if filename.lower().endswith(".txt"):
        loader = TextLoader(os.path.join(minutesLinkDir, filename), encoding="utf-8")
        docs = loader.load()
        for d in docs:
            d.metadata["source"] = "minutes"
            d.metadata["filename"] = filename
        minutes_docs.extend(docs)
print(f"Loaded {len(minutes_docs)} minutes link documents.")

Loaded 6 minutes link documents.


In [18]:
#slack
slack_docs = []
for filename in os.listdir(slackDir):
    if filename.lower().endswith(".txt"):
        with open(os.path.join(slackDir, filename), "r", encoding="utf-8") as f:
            lines = f.readlines()
        for line in lines:
            if line.strip():
                slack_docs.append(Document(
                    page_content=line.strip(),
                    metadata={"source": "slack", "channel": filename.replace(".txt", "")}
                ))
print(f"Loaded {len(slack_docs)} Slack messages.")


Loaded 223 Slack messages.


In [21]:
#combine all docs
all_docs = web_docs + pdf_docs + json_docs + minutes_docs + slack_docs
print(f"Total combined documents: {len(all_docs)}")

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150
)

chunks = splitter.split_documents(all_docs)
len(chunks)

Total combined documents: 6600


25014

In [27]:
vectorstore = Chroma(
    persist_directory=persistDir,
    embedding_function=embedding_model
)

In [30]:
batch_size = 256
for i in tqdm(range(0, len(chunks), batch_size), desc="Embedding new content"):
    batch = chunks[i : i + batch_size]
    vectorstore.add_documents(batch)
    # vectorstore.persist()

print(f"Stored all embeddings persistently in: {persistDir}")

Embedding new content: 100%|██████████| 98/98 [05:04<00:00,  3.11s/it]

Stored all embeddings persistently in: content/drive/MyDrive/bytestrike all data/chroma_store/



