In [2]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_mistralai.embeddings import MistralAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.graphs import Neo4jGraph
from docx import Document
from langchain.schema import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from docx import Document as DocxDocument  

In [3]:
import os

# Resolve the data folder relative to the notebook's working directory rather
# than hard-coding a machine-specific absolute path; this is the same
# expression the indexing cell below uses, so both cells agree on the folder.
directory_path = os.path.abspath("word")
print("Files in directory:", os.listdir(directory_path))


Files in directory: ['chroma_db', '.ipynb_checkpoints', 'word1.docx']


In [4]:
import os
import faiss
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document as LangchainDocument
from sentence_transformers import SentenceTransformer
from docx import Document as WordDocument

# Initialize SentenceTransformer model.
# `emb_model` is used later both to embed the document chunks and to embed
# search queries, so the two always share one vector space.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
try:
    emb_model = SentenceTransformer(model_name)
except Exception as e:
    # Raise instead of calling exit(): in a notebook exit() ends the kernel
    # session (losing all state) rather than just stopping this cell, and it
    # hides the traceback. Chaining with `from e` keeps the root cause visible.
    raise RuntimeError(f"Error loading SentenceTransformer model: {e}") from e

# Manually load Word documents
def load_word_documents(directory):
    """Collect the paragraph text of every Word file found under ``directory``.

    The folder is walked recursively. Each ``.docx`` file is opened with
    ``WordDocument`` and the text of its non-blank ``doc.paragraphs`` entries
    is joined with newlines. Files that cannot be opened, or that yield no
    text, are reported with ``print`` and skipped, so one bad file never
    aborts the whole load.

    Args:
        directory: Root folder to search.

    Returns:
        A list of ``{"content": str, "metadata": {"source": full_path}}``
        dicts, one per document that produced text. Folders and files are
        visited in sorted order so the result is reproducible across runs.
    """
    all_docs = []
    for root, dirs, files in os.walk(directory):
        # Editing `dirs` in place steers os.walk: skip Jupyter's checkpoint
        # folder and visit the remaining sub-folders in a stable order.
        dirs[:] = sorted(d for d in dirs if d != ".ipynb_checkpoints")
        for file in sorted(files):
            # Case-insensitive match so "REPORT.DOCX" is picked up as well.
            if not file.lower().endswith(".docx"):
                continue
            # "~$name.docx" files are the lock files Word leaves next to an
            # open document; they are not readable documents.
            if file.startswith("~$"):
                continue
            full_path = os.path.join(root, file)
            print(f"Attempting to read: {full_path}")
            try:
                doc = WordDocument(full_path)
                text = "\n".join(p.text for p in doc.paragraphs if p.text.strip())
                if text.strip():
                    all_docs.append({"content": text, "metadata": {"source": full_path}})
                else:
                    print(f"Warning: File {full_path} contains no readable text.")
            except Exception as e:
                # Best effort by design: report the failure and keep going.
                print(f"Error reading {full_path}: {e}")
    return all_docs

# Load Word files from directory.
# "word" is resolved against the current working directory.
directory_path = os.path.abspath("word")
print(f"Looking for files in: {directory_path}")
if not os.path.isdir(directory_path):
    # Raise rather than exit(): exit() would end the notebook kernel session
    # instead of just failing this cell with a readable error.
    raise FileNotFoundError(f"Directory not found: {directory_path}")

print("Files in directory:", os.listdir(directory_path))

all_docs = load_word_documents(directory_path)
if not all_docs:
    # Nothing to index; stop this cell here with a clear error.
    raise ValueError("No Word documents found!")

# Text splitting: wrap each loaded file in a LangchainDocument (keeping its
# source path as metadata) and cut it into 1000-character chunks that overlap
# by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents([
    LangchainDocument(page_content=doc["content"], metadata=doc["metadata"])
    for doc in all_docs
])

# Generate embeddings
def create_embeddings(documents):
    """Embed each document chunk with ``emb_model``.

    Chunks are encoded one at a time; a chunk whose encoding raises is
    reported with ``print`` and skipped, so the returned vectors and metadata
    always stay aligned (row ``i`` belongs to ``metadata[i]``).

    Args:
        documents: Iterable of objects exposing ``page_content`` and
            ``metadata``.

    Returns:
        ``(embeddings, metadata)`` where ``embeddings`` is a 2-D float32
        array with one row per successfully embedded chunk and ``metadata``
        is the matching list of metadata objects. When nothing was embedded
        the array has shape ``(0, dim)`` so callers can still read
        ``embeddings.shape[1]``.
    """
    embeddings = []
    metadata = []
    for doc in documents:
        try:
            embedding = emb_model.encode(doc.page_content)
        except Exception as e:
            print(f"Error creating embedding for document: {doc.metadata}, Error: {e}")
            continue
        # Append both together, only after a successful encode.
        embeddings.append(embedding)
        metadata.append(doc.metadata)

    if not embeddings:
        # np.array([]) would be 1-D and break `embeddings.shape[1]` downstream;
        # return an empty matrix that still carries the model's vector width.
        dim = emb_model.get_sentence_embedding_dimension() or 0
        return np.empty((0, dim), dtype=np.float32), metadata

    # float32 is the dtype FAISS indexes expect.
    return np.asarray(embeddings, dtype=np.float32), metadata

# Embed every chunk. create_embeddings appends a vector and its metadata
# together, so row i of `embeddings` corresponds to `metadatas[i]`.
embeddings, metadatas = create_embeddings(documents)

# Initialize FAISS index
# The index width is taken from the second axis of the embedding matrix.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # L2 distance (Euclidean)
index.add(embeddings)

# Save the FAISS index
# Only the index is written here; `metadatas` is not saved by this cell.
faiss.write_index(index, "faiss_index")

# Query the FAISS index
def search_faiss(query, top_k=5, max_distance=10):
    """Return metadata for the indexed chunks closest to ``query``.

    The query is embedded with ``emb_model`` and looked up in the module-level
    FAISS ``index``; hits are mapped back to ``metadatas`` by position.

    Args:
        query: Text to search for.
        top_k: Maximum number of neighbours requested from the index.
        max_distance: Hits whose reported distance is not strictly below this
            value are dropped. NOTE: FAISS documents ``IndexFlatL2`` as
            reporting squared L2 distances, so the threshold is compared
            against squared values.

    Returns:
        A list of ``{"metadata": ..., "distance": ...}`` dicts in the order
        returned by the index (nearest first); may be empty.
    """
    # index.search expects a 2-D batch, hence the reshape to one row.
    query_embedding = emb_model.encode(query).reshape(1, -1)
    distances, indices = index.search(query_embedding, top_k)
    results = []
    for distance, idx in zip(distances[0], indices[0]):
        # FAISS fills unused result slots with label -1 when the index holds
        # fewer than top_k vectors. Without the lower bound, -1 would pass the
        # length check and silently select metadatas[-1].
        if 0 <= idx < len(metadatas) and distance < max_distance:
            results.append({"metadata": metadatas[idx], "distance": distance})
    return results

# Example query
# Runs one search against the index built above and prints, for each hit,
# the source file of the matching chunk and its distance to the query.
query = "Summarize the main points of the documents."
results = search_faiss(query, top_k=5, max_distance=10)
print(f"Query: {query}")
print("Results:")
for result in results:
    # Each result carries only the chunk's metadata and distance, not its text.
    print(f"Retrieved doc: {result['metadata']['source']}, Distance: {result['distance']}")


Looking for files in: /root/word-rag/word
Files in directory: ['chroma_db', '.ipynb_checkpoints', 'word1.docx']
Attempting to read: /root/word-rag/word/word1.docx
Query: Summarize the main points of the documents.
Results:
Retrieved doc: /root/word-rag/word/word1.docx, Distance: 1.456721544265747
Retrieved doc: /root/word-rag/word/word1.docx, Distance: 1.4963897466659546
Retrieved doc: /root/word-rag/word/word1.docx, Distance: 1.8577896356582642


In [None]:
from sentence_transformers import SentenceTransformer
# Load the all-MiniLM-L6-v2 model by name and write a copy of it to
# ./local_model (relative to the current working directory).
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model.save('./local_model')  


In [None]:
# Preview the first 100 characters of each loaded document's text.
# Relies on `all_docs` from the loading cell having been run first.
for doc in all_docs:
    print(f"Document content: {doc['content'][:100]}...")


In [None]:
from sentence_transformers import SentenceTransformer

# Load a local or pretrained model (all-MiniLM-L6-v2 is used here)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Smoke test: embed one short string and inspect the resulting vector
text = "test query"
embedding = model.encode(text)

print("Embedding vector:", embedding)
print("Vector length:", len(embedding))
