<a href="https://colab.research.google.com/github/alice410451027/Multi-format-Document-Embedding-Pipeline/blob/main/Multi_format_Document_Embedding_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Embedding Documents into Vectors

### 1. Create Upload Folder

In [None]:
import os

# Folder that the later cells scan for input documents.
upload_dir = "uploaded_docs"
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(upload_dir, exist_ok=True)
# The loading cell also ingests .csv files, so list that format here as well.
print(f"Please place your .txt, .pdf, .docx, .csv files into this folder: {upload_dir}")

<font color="red">Please manually upload your files into the uploaded_docs folder before running the next cells.</font>

### 2. Install Required Packages and Import Modules

In [None]:
!pip install -U langchain langchain-community pypdf python-docx sentence-transformers faiss-cpu

In [None]:
from langchain_community.document_loaders import TextLoader, PyPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

### 3. Custom E5 Embedding Class (adds the `query:` / `passage:` prefixes recommended by the E5 documentation)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

class CustomE5Embedding(HuggingFaceEmbeddings):
    """HuggingFaceEmbeddings variant that prepends role prefixes to its inputs.

    Documents are embedded with a leading "passage: " and queries with a
    leading "query: "; the embedding itself is delegated to the parent class.
    """

    def embed_documents(self, texts):
        """Embed every item of `texts` after prefixing it with "passage: "."""
        prefixed_passages = []
        for t in texts:
            prefixed_passages.append(f"passage: {t}")
        return super().embed_documents(prefixed_passages)

    def embed_query(self, text):
        """Embed `text` after prefixing it with "query: "."""
        prefixed_query = f"query: {text}"
        return super().embed_query(prefixed_query)

### 4. Load Documents

In [None]:
from langchain.schema import Document
import pandas as pd

# Collect every supported file from the upload folder into `documents`.
folder_path = upload_dir
documents = []

# Sort the listing so documents are loaded in the same order on every run
# (os.listdir returns entries in arbitrary order).
for file in sorted(os.listdir(folder_path)):
    path = os.path.join(folder_path, file)
    # Match extensions case-insensitively so e.g. "REPORT.PDF" is not silently skipped.
    lowered_name = file.lower()

    if lowered_name.endswith(".txt"):
        documents.extend(TextLoader(path).load())
    elif lowered_name.endswith(".pdf"):
        documents.extend(PyPDFLoader(path).load())
    elif lowered_name.endswith(".docx"):
        documents.extend(UnstructuredWordDocumentLoader(path).load())
    elif lowered_name.endswith(".csv"):
        # Each CSV row becomes its own document; the text is the row's values
        # without the column labels (to_string(index=False)).
        df = pd.read_csv(path)
        for idx, row in df.iterrows():
            content = row.to_string(index=False)
            # Record the originating file and row so a retrieved chunk can be traced back.
            documents.append(
                Document(page_content=content, metadata={"source": path, "row": idx})
            )
    # Files with any other extension are ignored.

### 5. Create Vector Database

In [None]:
# Cut the loaded documents into chunks of up to 500 characters, with a
# 100-character overlap between neighbouring chunks.
chunking_options = {"chunk_size": 500, "chunk_overlap": 100}
splitter = RecursiveCharacterTextSplitter(**chunking_options)
split_docs = splitter.split_documents(documents)

In [None]:
import os
# List the folder through `upload_dir` so this check always inspects the same
# folder the loading cell scans, even if the folder name is changed.
print(f"Files inside {upload_dir}:", os.listdir(upload_dir))

In [None]:
# Report how many chunks the splitter produced.
chunk_count = len(split_docs)
print(chunk_count)

In [None]:
# Stop early with an actionable message when nothing was loaded.
# NOTE(review): an empty list is expected to fail inside FAISS.from_documents
# with an opaque indexing error — confirm against the installed version.
if not split_docs:
    raise ValueError(
        "No document chunks to embed. Upload supported files and re-run the "
        "document loading and splitting cells first."
    )

# Embed every chunk with the prefix-adding E5 wrapper and index the vectors in FAISS.
embedding_model = CustomE5Embedding(model_name="intfloat/multilingual-e5-small")
vectorstore = FAISS.from_documents(split_docs, embedding_model)

### 6. Save Vectorstore

In [None]:
import shutil

# Persist the index to the "faiss_db" folder.
vectorstore.save_local("faiss_db")
# Build faiss_db.zip with the standard library instead of the external `zip`
# command: this works where `zip` is not installed and writes a fresh archive
# on every run. base_dir keeps the entries under a top-level "faiss_db/"
# folder, matching the layout produced by `zip -r faiss_db.zip faiss_db`.
shutil.make_archive("faiss_db", "zip", root_dir=".", base_dir="faiss_db")
print("✅ Vector database has been compressed and saved as 'faiss_db.zip'.")