In [1]:
import os

from pymongo import MongoClient, ReplaceOne
from pymongo.errors import PyMongoError
from sentence_transformers import SentenceTransformer

In [2]:
# MongoDB connection details
# The URI is read from the MONGODB_URI environment variable (KeyError if it is
# unset), so no credentials are stored in the notebook itself.
MONGO_URI = os.environ["MONGODB_URI"]
SOURCE_DB = "sample_mflix"  # Database the movies are read from
SOURCE_COLLECTION = "movies"  # Collection the movies are read from
TARGET_DB = "vector_mflix1"  # Database the embedded copies are written to
TARGET_COLLECTION = "movies_with_vectors"  # Collection the embedded copies are written to

In [3]:
# Open a single client and resolve the source/target handles from it.
client = MongoClient(MONGO_URI)

# Database handles
source_db, target_db = client[SOURCE_DB], client[TARGET_DB]

# Collection handles: movies are read from the first and written to the second.
source_collection = source_db[SOURCE_COLLECTION]
target_collection = target_db[TARGET_COLLECTION]

In [4]:
# Load the pre-trained Sentence-Transformers model that generate_embeddings()
# uses to embed movie plots.
# NOTE(review): "all-MiniLM-L6-v2" is documented as producing 384-dimensional
# vectors — confirm any vector index built on `plot_embedding` uses that size.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

In [5]:
def generate_embeddings(text):
    """Embed ``text`` with the notebook-level ``embedding_model``.

    Args:
        text: The text to embed.

    Returns:
        The result of ``embedding_model.encode(text)`` converted with
        ``.tolist()``, or ``None`` when ``text`` is falsy (for example
        ``None`` or an empty string), in which case the model is not called.
    """
    # Guard clause: nothing to embed, so skip the model entirely.
    if not text:
        return None

    vector = embedding_model.encode(text)
    return vector.tolist()

In [6]:
# Copy every movie from the source collection into the target collection in
# batches, adding a vector embedding of the plot to each copied document.
batch_size = 500
processed_count = 0
failed_batches = 0

# Used for progress reporting only; the loop below stops when a page comes back empty.
total_records = source_collection.count_documents({})

# Only the fields that are copied are fetched (`_id` is returned by default).
projection = {"title": 1, "plot": 1, "genres": 1, "cast": 1, "directors": 1, "year": 1}

# Page through the source by `_id` range rather than skip()/limit().
# An unsorted skip()/limit() has no guaranteed order between queries, so
# documents can be missed or read twice, and skip() has to walk past every
# previously read document on each batch.
last_id = None

while True:
    page_filter = {} if last_id is None else {"_id": {"$gt": last_id}}
    batch = list(
        source_collection.find(page_filter, projection)
        .sort("_id", 1)
        .limit(batch_size)
    )

    if not batch:  # No documents left after `last_id`
        break

    last_id = batch[-1]["_id"]

    upserts = []
    for movie in batch:
        # Generate embeddings for the plot (None when the plot is missing/empty)
        plot = movie.get("plot", "")
        processed_document = {
            "_id": movie["_id"],  # Keep the same _id
            "title": movie.get("title", ""),
            "plot": plot,
            "plot_embedding": generate_embeddings(plot),
            "genres": movie.get("genres", []),
            "cast": movie.get("cast", []),
            "directors": movie.get("directors", []),
            "year": movie.get("year", None),
        }
        # Upsert keyed on `_id` so re-running the cell overwrites existing
        # copies instead of failing the whole batch on duplicate keys.
        upserts.append(ReplaceOne({"_id": movie["_id"]}, processed_document, upsert=True))

    # Write the processed batch to the target collection. A failed batch is
    # reported and counted, and the migration carries on with the next page.
    try:
        target_collection.bulk_write(upserts, ordered=False)
    except PyMongoError as e:
        failed_batches += 1
        print(f"An error occurred: {e}")
    else:
        processed_count += len(upserts)
        print(f"Processed and inserted {processed_count} records of {total_records}.")

# Only claim success when every batch was actually written.
if failed_batches:
    print(
        f"Finished with {failed_batches} failed batch(es); "
        f"{processed_count} of {total_records} records written."
    )
else:
    print("All batches processed and inserted into the new collection.")


Processed and inserted 500 records of 21349.
Processed and inserted 1000 records of 21349.
Processed and inserted 1500 records of 21349.
Processed and inserted 2000 records of 21349.
Processed and inserted 2500 records of 21349.
Processed and inserted 3000 records of 21349.
Processed and inserted 3500 records of 21349.
Processed and inserted 4000 records of 21349.
Processed and inserted 4500 records of 21349.
Processed and inserted 5000 records of 21349.
Processed and inserted 5500 records of 21349.
Processed and inserted 6000 records of 21349.
Processed and inserted 6500 records of 21349.
Processed and inserted 7000 records of 21349.
Processed and inserted 7500 records of 21349.
Processed and inserted 8000 records of 21349.
Processed and inserted 8500 records of 21349.
Processed and inserted 9000 records of 21349.
Processed and inserted 9500 records of 21349.
Processed and inserted 10000 records of 21349.
Processed and inserted 10500 records of 21349.
Processed and inserted 11000 reco

In [None]:
# Per-document alternative to the batched cell above: iterate through the
# movies, generate an embedding for each plot, and write one document at a time.
batch_size = 500  # Progress is printed every `batch_size` documents
processed_count = 0

# Only the fields that are copied are fetched (`_id` is returned by default).
copied_fields = {"title": 1, "plot": 1, "genres": 1, "cast": 1, "directors": 1, "year": 1}

for movie in source_collection.find({}, copied_fields):
    # Generate embeddings for the plot (None when the plot is missing/empty)
    plot = movie.get("plot", "")
    plot_embedding = generate_embeddings(plot)

    # Prepare the new document
    new_document = {
        "_id": movie["_id"],  # Keep the same _id
        "title": movie.get("title", ""),
        "plot": plot,
        "plot_embedding": plot_embedding,
        "genres": movie.get("genres", []),
        "cast": movie.get("cast", []),
        "directors": movie.get("directors", []),
        "year": movie.get("year", None),
    }

    # Upsert rather than insert_one: the target may already hold this `_id`
    # (from the batched cell or an earlier run of this one), and a plain
    # insert would then abort the loop with a duplicate-key error.
    target_collection.replace_one({"_id": movie["_id"]}, new_document, upsert=True)
    processed_count += 1

    # Print progress
    if processed_count % batch_size == 0:
        print(f"Processed {processed_count} movies...")

print(f"Completed processing {processed_count} movies.")
