# Qdrant Database Import/Export with LangChain

Notebook này hướng dẫn cách sử dụng LangChain và Qdrant trên Google Colab, sau đó export vector database để sử dụng ở local.

## Phần 1: Cài đặt trên Colab

Đầu tiên, chúng ta cần cài đặt các thư viện cần thiết trên Colab:

In [None]:
# Install the libraries this notebook depends on (IPython shell escape)
!pip install langchain langchain-openai langchain-community langchain-huggingface langchain-qdrant\
    qdrant-client python-dotenv pydantic

## Phần 2: Upload Dữ liệu Sản phẩm

Chúng ta cần upload file dữ liệu sản phẩm lên Colab:

In [None]:
# Use Colab's file-upload widget to bring the product data file into the runtime
from google.colab import files

uploaded = files.upload()

# Fail with a clear message instead of an opaque IndexError when the upload
# produced no files (presumably what happens when the dialog is cancelled —
# TODO confirm against the google.colab docs).
if not uploaded:
    raise RuntimeError(
        "Chưa có file nào được upload. Hãy chạy lại cell này và chọn file dữ liệu."
    )

# Take the name of the first uploaded file
data_file = next(iter(uploaded))
print(f"Đã upload file: {data_file}")

## Phần 3: Thiết lập Qdrant trong Colab

Chúng ta sẽ sử dụng Qdrant locally trong Colab (không cần Docker):

In [None]:
import os

from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

# Configuration constants shared by the cells below
COLLECTION_NAME = "product_data"
EMBEDDING_MODEL_NAME = "AITeamVN/Vietnamese_Embedding"
VECTOR_SIZE = 1024  # Vector size for the AITeamVN/Vietnamese_Embedding model

# Set the OpenAI API key (replace the placeholder with your own key)
os.environ["OPENAI_API_KEY"] = "your-openai-api-key-here"  # Replace with your key

# Start a local Qdrant instance inside Colab (data is kept in RAM)
qdrant_client = QdrantClient(":memory:")

# Create the collection only when it is not already present
if qdrant_client.collection_exists(COLLECTION_NAME):
    print(
        f"Đã tồn tại collection '{COLLECTION_NAME}' trong Qdrant, sẽ sử dụng lại hoặc tạo lại nếu cần."
    )
else:
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
    )

    print(f"Đã tạo collection '{COLLECTION_NAME}' trong Qdrant")

## Phần 4: Triển khai code LangChain và Text Processor

Chúng ta sẽ port code từ project vào đây:

In [None]:
import json
from typing import Any, Dict, List

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking configuration
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200


# Text Processor class
class TextProcessor:
    """Turn raw product records into text chunks ready for embedding."""

    def __init__(
        self, chunk_size: int = CHUNK_SIZE, chunk_overlap: int = CHUNK_OVERLAP
    ):
        """Initialize the processor and its text splitter.

        Args:
            chunk_size: Chunk size passed to the text splitter.
            chunk_overlap: Chunk overlap passed to the text splitter.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=["\n\n", "\n", " ", ""],
        )

    def load_data(self, file_path: str) -> List[Dict[str, Any]]:
        """Load product data from a JSON file.

        Args:
            file_path: Path to a UTF-8 JSON file whose top level is a list of
                product dictionaries.

        Returns:
            The parsed list of products, or an empty list when the file cannot
            be read or parsed, or when its top level is not a list. The reason
            is printed in each failure case.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                data = json.load(file)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading data: {e}")
            return []

        # The rest of the pipeline iterates over product dicts; reject any
        # other top-level shape here instead of failing later.
        if not isinstance(data, list):
            print(
                f"Error loading data: expected a JSON array of products in "
                f"{file_path}, got {type(data).__name__}"
            )
            return []

        print(f"Loaded {len(data)} products from {file_path}")
        return data

    def product_to_text(self, product: Dict[str, Any]) -> str:
        """Convert a product dictionary to formatted text.

        The product name (key "Tên") comes first, followed by one
        "key: value" line for every other property in dictionary order.
        """
        product_name = product.get("Tên", "Unknown Product")
        lines = [f"Tên sản phẩm: {product_name}"]
        # The name is already on the first line, so skip it here.
        lines.extend(
            f"{key}: {value}" for key, value in product.items() if key != "Tên"
        )
        return "\n".join(lines)

    def chunk_text(self, text: str) -> List[str]:
        """Split text into chunks using the configured text splitter."""
        return self.text_splitter.split_text(text)

    def chunk_product(self, product: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Process a single product into chunks with metadata.

        Args:
            product: Product dictionary; it is not modified.

        Returns:
            One ``{"text", "metadata"}`` dict per chunk. The metadata holds
            every product field plus ``product_id`` (the product's "id", or
            None when it has none), ``product_name``, ``chunk_id`` and
            ``total_chunks``.
        """
        chunks = self.chunk_text(self.product_to_text(product))
        total_chunks = len(chunks)
        # Use the same ID for every chunk of the product; a per-chunk fallback
        # would give one product several different IDs.
        product_id = product.get("id")
        product_name = product.get("Tên", "Unknown")

        return [
            {
                "text": chunk,
                "metadata": {
                    # Product fields go first so the chunk bookkeeping keys
                    # below cannot be overwritten by same-named product fields.
                    **product,
                    "product_id": product_id,
                    "product_name": product_name,
                    "chunk_id": i,
                    "total_chunks": total_chunks,
                },
            }
            for i, chunk in enumerate(chunks)
        ]

    def process_all_products(
        self, products: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Process all products into chunks ready for embedding.

        Products without an "id" get their position in ``products`` as ID.
        The ID is added to a copy, so the caller's dictionaries are left
        untouched.
        """
        all_chunks = []

        for i, product in enumerate(products):
            if "id" not in product:
                # Copy instead of mutating the caller's data.
                product = {**product, "id": i}
            all_chunks.extend(self.chunk_product(product))

        print(f"Created {len(all_chunks)} chunks from {len(products)} products")
        return all_chunks


# VectorStore class
class VectorStore:
    """Manage embedding and indexing of documents in a Qdrant collection."""

    def __init__(
        self,
        client=None,
        collection_name: str = COLLECTION_NAME,
        embedding_model: str = EMBEDDING_MODEL_NAME,
        device=None,
        multi_process: bool = True,
    ):
        """Initialize the embedding model and the Qdrant client.

        Args:
            client: Existing Qdrant client to use. When None, a new in-memory
                client is created.
            collection_name: Name of the Qdrant collection to operate on.
            embedding_model: Hugging Face model name used for embeddings.
            device: Device passed to the embedding model (e.g. "cuda", "cpu").
                When None, "cuda" is used if torch reports it as available,
                otherwise "cpu".
            multi_process: Forwarded to ``HuggingFaceEmbeddings``.
        """
        self.collection_name = collection_name
        self.embedding_model = HuggingFaceEmbeddings(
            model_name=embedding_model,
            model_kwargs={"device": self._resolve_device(device)},
            multi_process=multi_process,
        )

        # Use the provided client or create an in-memory one. Compare against
        # None explicitly so a provided client is never replaced by accident.
        self.client = client if client is not None else QdrantClient(":memory:")

        # LangChain's Qdrant wrapper is created lazily on first use
        self.vectorstore = None

    @staticmethod
    def _resolve_device(device):
        """Return ``device`` if given, else "cuda" when available, else "cpu"."""
        if device is not None:
            return device
        try:
            import torch
        except ImportError:
            return "cpu"
        return "cuda" if torch.cuda.is_available() else "cpu"

    def initialize_vectorstore(self, force_recreate: bool = False) -> QdrantVectorStore:
        """Create the LangChain vector store wrapper for the collection.

        Args:
            force_recreate: Accepted for backward compatibility; it currently
                has no effect and the collection is never recreated here.

        Returns:
            The newly created ``QdrantVectorStore``, also stored on the instance.
        """
        self.vectorstore = QdrantVectorStore(
            client=self.client,
            collection_name=self.collection_name,
            embedding=self.embedding_model,
        )
        return self.vectorstore

    def get_vectorstore(self) -> QdrantVectorStore:
        """Return the vector store, initializing it on first access."""
        if self.vectorstore is None:
            return self.initialize_vectorstore()
        return self.vectorstore

    def prepare_documents(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Convert processed chunks to the document format used for indexing.

        Args:
            data: Chunk dictionaries with optional "text" and "metadata" keys.

        Returns:
            One document per chunk with "product_id", "product_name", "text"
            and "metadata". Missing IDs default to the chunk's position in
            ``data`` and missing names to "Unknown".
        """
        documents = []

        for i, product in enumerate(data):
            metadata = product.get("metadata", {})
            documents.append(
                {
                    "product_id": metadata.get("product_id", i),
                    "product_name": metadata.get("product_name", "Unknown"),
                    "text": product.get("text", ""),
                    "metadata": metadata,
                }
            )

        return documents

    def index_documents(self, documents: List[Dict[str, Any]]) -> None:
        """Embed the documents and add them to the Qdrant collection.

        Args:
            documents: Documents as produced by ``prepare_documents``; each
                must have "text" and "metadata" keys.
        """
        if not documents:
            # Nothing to embed; skip initializing the vector store.
            print("No documents to index.")
            return

        if self.vectorstore is None:
            # force_recreate is currently ignored by initialize_vectorstore
            self.initialize_vectorstore(force_recreate=True)

        texts = [doc["text"] for doc in documents]
        metadatas = [doc["metadata"] for doc in documents]

        self.vectorstore.add_texts(texts=texts, metadatas=metadatas)
        print(f"Indexed {len(documents)} documents into Qdrant.")

## Phần 5: Nạp dữ liệu vào Qdrant

Bây giờ chúng ta sẽ nạp dữ liệu JSON vào Qdrant:

In [None]:
# Build the pipeline objects; the vector store reuses the Qdrant client
# created in the setup cell so it writes to the collection created there.
text_processor = TextProcessor()
vector_store = VectorStore(client=qdrant_client)

# Load the uploaded JSON file (load_data returns [] when loading fails)
raw_data = text_processor.load_data(data_file)

# Convert every product to text and split it into chunks with metadata
processed_chunks = text_processor.process_all_products(raw_data)

# Reshape the chunks into the document dicts expected by index_documents
documents = vector_store.prepare_documents(processed_chunks)

# Embed the documents and add them to the Qdrant collection
vector_store.index_documents(documents)

## Phần 6: Export Qdrant Collection

Bây giờ chúng ta sẽ export collection từ Qdrant để có thể import vào local:

In [None]:
import pickle


def export_qdrant_collection(client, collection_name, output_file="qdrant_export.pkl"):
    """Export a Qdrant collection to a pickle file.

    Args:
        client: Qdrant client providing ``get_collection`` and ``scroll``.
        collection_name: Name of the collection to export.
        output_file: Path of the pickle file to write.

    Returns:
        ``output_file``, after the export has been written.

    Raises:
        ValueError: If the collection uses named vectors, which this export
            format (a single size and distance) cannot represent.

    The file holds a dict with "collection_info" (name, vector size, distance
    name) and "points" (id, vector and payload of every point). Pickle files
    can execute code when loaded, so only load exports you created yourself.
    """
    collection_info = client.get_collection(collection_name=collection_name)
    vectors_config = collection_info.config.params.vectors
    if isinstance(vectors_config, dict):
        raise ValueError(
            f"Collection '{collection_name}' uses named vectors, "
            "which this export format does not support"
        )

    # Retrieve points in pages to handle large collections. scroll() returns
    # (points, next_page_offset); the offset is a point ID cursor, not a row
    # count, so it must be fed back as-is rather than incremented by the page
    # size.
    page_size = 1000
    all_points = []
    next_offset = None

    while True:
        points, next_offset = client.scroll(
            collection_name=collection_name,
            limit=page_size,
            offset=next_offset,
            with_vectors=True,
            with_payload=True,
        )

        if points:
            all_points.extend(points)
            print(f"Retrieved {len(all_points)} points so far...")

        # A None cursor marks the last page; the empty-page check guards
        # against looping forever on a cursor that yields nothing.
        if next_offset is None or not points:
            break

    export_data = {
        "collection_info": {
            "name": collection_name,
            "vector_size": vectors_config.size,
            "vector_distance": vectors_config.distance.name,
        },
        "points": [
            {"id": point.id, "vector": point.vector, "payload": point.payload}
            for point in all_points
        ],
    }

    with open(output_file, "wb") as f:
        pickle.dump(export_data, f)

    print(f"Exported {len(all_points)} points to {output_file}")
    return output_file


# Run the export for the collection populated above and keep the output path
export_file = export_qdrant_collection(
    client=qdrant_client,
    collection_name=COLLECTION_NAME,
)

## Phần 7: Download Export File

Tải xuống file export để sử dụng trong môi trường local:

In [None]:
from google.colab import files

# Download the export file to the local machine; the name matches the
# default output_file of export_qdrant_collection.
files.download("qdrant_export.pkl")

## Phần 8: Test Vector Search trên Colab

Kiểm tra xem vector search có hoạt động chính xác không:

In [None]:
# Smoke-test retrieval with a sample query, asking for the top 3 matches
vectorstore = vector_store.get_vectorstore()
results = vectorstore.similarity_search_with_score(
    "điện thoại có camera tốt nhất",
    k=3,
)

# Print the rank, score, product name and a 150-character preview of each hit
for i, hit in enumerate(results):
    doc, score = hit
    print(f"\nKết quả #{i + 1} (score: {score:.4f})")
    print(f"Sản phẩm: {doc.metadata.get('product_name', 'Unknown')}")
    print(f"Nội dung: {doc.page_content[:150]}...")

# Hướng dẫn Import dữ liệu trên Local

Sau khi bạn đã export và download file `qdrant_export.pkl`, dưới đây là các bước để import nó vào môi trường local. Hãy tạo file Python mới trên local với nội dung sau:

In [None]:
# Code này sẽ chạy trên máy local của bạn, không phải trên Colab
# Lưu nội dung này vào file import_qdrant.py

import argparse
import os
import pickle

from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams


def import_to_qdrant(export_file, host="localhost", port=6333, batch_size=100):
    """Import a previously exported Qdrant collection into a Qdrant server.

    Args:
        export_file: Path to the pickle file holding a dict with
            "collection_info" (name, vector_size, vector_distance) and
            "points" (dicts with id, vector and payload).
        host: Host of the Qdrant server.
        port: Port of the Qdrant server.
        batch_size: Number of points sent per upsert request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    An existing collection with the same name is deleted and recreated
    before the points are imported.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    print(f"Connecting to Qdrant at {host}:{port}...")
    client = QdrantClient(host=host, port=port)

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only import export files that you created yourself.
    with open(export_file, "rb") as f:
        export_data = pickle.load(f)

    collection_info = export_data["collection_info"]
    points = export_data["points"]

    collection_name = collection_info["name"]
    vector_size = collection_info["vector_size"]
    distance_str = collection_info["vector_distance"]

    # The export stores the enum member name, so look it up by name. This
    # covers every metric the installed client defines instead of a fixed
    # subset; unknown names keep the previous fallback to COSINE, but loudly.
    distance = Distance.__members__.get(distance_str)
    if distance is None:
        print(f"Unknown distance '{distance_str}' in export; falling back to COSINE")
        distance = Distance.COSINE

    # Recreate the collection if it already exists
    existing_names = [c.name for c in client.get_collections().collections]
    if collection_name in existing_names:
        print(f"Collection '{collection_name}' already exists. Recreating...")
        client.delete_collection(collection_name=collection_name)

    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=distance),
    )

    print(
        f"Created collection '{collection_name}' with {vector_size} vector dimensions"
    )

    # Import points in batches
    total_points = len(points)

    for i in range(0, total_points, batch_size):
        batch = points[i : i + batch_size]

        # Keep only the fields the upsert call needs
        upsert_points = [
            {"id": point["id"], "vector": point["vector"], "payload": point["payload"]}
            for point in batch
        ]

        client.upsert(collection_name=collection_name, points=upsert_points)

        print(f"Imported {min(i + batch_size, total_points)}/{total_points} points")

    # Verify the import against the number of exported points
    count = client.count(collection_name=collection_name).count
    print(f"Import complete. Collection '{collection_name}' now has {count} points.")
    if count != total_points:
        print(
            f"Warning: expected {total_points} points from the export "
            f"but the collection reports {count}."
        )


if __name__ == "__main__":
    # Command-line entry point: a positional export file plus optional
    # --host/--port overrides for the Qdrant server.
    cli = argparse.ArgumentParser(description="Import Qdrant collection from export file")
    cli.add_argument("export_file", help="Path to the export file (qdrant_export.pkl)")
    cli.add_argument("--host", default="localhost", help="Qdrant host")
    cli.add_argument("--port", type=int, default=6333, help="Qdrant port")

    cli_args = cli.parse_args()
    import_to_qdrant(
        export_file=cli_args.export_file,
        host=cli_args.host,
        port=cli_args.port,
    )

## Thực hiện Import trên Local

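Sau khi bạn đã tạo script `import_qdrant.py` trên local và tải file `qdrant_export.pkl` từ Colab, hãy đảm bảo Qdrant server đang chạy trên máy local (ví dụ bằng Docker: `docker run -p 6333:6333 qdrant/qdrant`), rồi chạy lệnh sau trên terminal của bạn: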

```bash
python import_qdrant.py qdrant_export.pkl --host localhost --port 6333
```

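Sau đó, bạn có thể sử dụng collection đã import với code LangChain của mình. Lưu ý: khi truy vấn, hãy dùng đúng embedding model đã dùng để tạo vector (`AITeamVN/Vietnamese_Embedding`) để kết quả tìm kiếm chính xác.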