In [None]:
import json
import pymongo
from bson.objectid import ObjectId
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import MongoDBAtlasVectorSearch
from langchain.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm
from typing import List, Dict

# MongoDB connection details.
# Values are read from the environment so real credentials never have to be
# pasted into the notebook; the empty-string fallbacks are the original
# placeholders, so behaviour is unchanged when the variables are not set.
import os  # needed only for the environment lookups in this config block

MONGO_URI = os.environ.get("MONGO_URI", "")
DB_NAME = os.environ.get("DB_NAME", "")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME", "")

# Initialize MongoDB client and the target collection that
# sync_with_mongodb() inserts into.
client = pymongo.MongoClient(MONGO_URI)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]
# NOTE(review): no command is sent to the server before this message, so it
# presumably does not prove the server is reachable -- confirm against the
# PyMongo documentation.
print("Connected to MongoDB:")

# Load the sentence-transformer model used by create_vector_embedding().
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("Loaded sentence-transformer model")

In [None]:
def load_json_data(file_path: str) -> Dict:
    """Load and parse a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content. Callers in this notebook expect a dict at
        the top level, but whatever ``json.load`` produces is returned as-is.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without ``encoding`` Python falls back to
    # the platform locale (e.g. cp1252 on Windows), which corrupts or rejects
    # non-ASCII text in an otherwise valid JSON file.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def split_large_elements(data: List[str], max_length: int = 512) -> List[str]:
    """Split over-long strings into fixed-size chunks based on character count.

    Strings no longer than ``max_length`` (including empty strings) are kept
    unchanged; longer strings are replaced, in place in the ordering, by
    consecutive slices of at most ``max_length`` characters.

    Args:
        data: Strings to normalise.
        max_length: Maximum number of characters per returned string.
            Must be positive.

    Returns:
        A new list with every element at most ``max_length`` characters long.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    # A negative max_length used to send every item down the split branch
    # with an empty range, silently dropping the data; reject it up front.
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")

    result: List[str] = []
    for item in data:
        if len(item) > max_length:
            result.extend(
                item[start:start + max_length]
                for start in range(0, len(item), max_length)
            )
        else:
            result.append(item)
    return result

def create_vector_embedding(text: str) -> List[float]:
    """Encode ``text`` with the module-level ``embedding_model``.

    Args:
        text: The string to embed.

    Returns:
        The model's encoding converted to a plain Python list via ``tolist()``.
    """
    return embedding_model.encode(text).tolist()

def process_json_data(data: Dict) -> List[Dict]:
    """Build one embedding record per text chunk of every list-valued key.

    Each list element is converted with ``str()``, passed through
    ``split_large_elements`` and embedded with ``create_vector_embedding``.
    Keys whose value is not a list are skipped.

    Args:
        data: Mapping of key -> value as loaded from the JSON file.

    Returns:
        A list of dicts with the fields ``text``, ``embedding``, ``key`` and
        ``index``, where ``index`` is the chunk's position within that key's
        chunk list (i.e. counted after splitting).
    """
    records = []
    for field_name, field_value in data.items():
        # Only array-valued keys are embedded.
        if not isinstance(field_value, list):
            continue
        chunks = split_large_elements([str(entry) for entry in field_value])
        records.extend(
            {
                "text": chunk,
                "embedding": create_vector_embedding(chunk),
                "key": field_name,
                "index": position,
            }
            for position, chunk in enumerate(chunks)
        )
    return records

def split_large_elements2(data: str, max_length: int = 512) -> List[str]:
    """Split a single string into chunks based on character count.

    Args:
        data: The string to split.
        max_length: Maximum number of characters per chunk. Must be positive.

    Returns:
        ``[data]`` when the string already fits (including the empty string),
        otherwise consecutive slices of at most ``max_length`` characters.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    # A negative max_length used to produce an empty chunk list, silently
    # discarding the whole string; reject it up front instead.
    if max_length <= 0:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")

    if len(data) <= max_length:
        return [data]
    return [data[start:start + max_length] for start in range(0, len(data), max_length)]

def process_collection_data(data: List[Dict]) -> List[Dict]:
    """Build one embedding record per text chunk of every document.

    The text of a document is its ``"text"`` field, or the string form of the
    whole document when that field is absent. The text is chunked with
    ``split_large_elements2`` and each chunk is embedded with
    ``create_vector_embedding``. Progress is reported through ``tqdm``.

    Args:
        data: Documents to process.

    Returns:
        A list of dicts with the fields ``text``, ``embedding``, ``key`` (the
        document's ``"label"`` value, ``None`` if missing) and ``index`` (the
        chunk's position within its document).
    """
    records = []
    for document in tqdm(data):
        body = document.get("text", str(document))
        label = document.get("label")
        for position, chunk in enumerate(split_large_elements2(body)):
            vector = create_vector_embedding(chunk)
            records.append({
                "text": chunk,
                "embedding": vector,
                "key": label,
                "index": position,
            })
    return records

from pymongo import MongoClient
from typing import List, Dict

def sync_with_mongodb(new_data: List[Dict]):
    """Insert ``new_data`` into the module-level ``collection``.

    Nothing is sent to the database when ``new_data`` is empty; in both cases
    a one-line status message is printed.

    Args:
        new_data: Records to insert with a single ``insert_many`` call.
    """
    # Guard clause: nothing to write.
    if not new_data:
        print("No new data to insert.")
        return

    collection.insert_many(new_data)
    print(f"Inserted {len(new_data)} new records into the database.")


def task1(json_file_path: str):
    """Ingest a JSON file: load it, embed its list entries, store the records.

    The loaded data, its ``items()`` view and the processed records are
    printed along the way.

    Args:
        json_file_path: Path of the JSON file passed to ``load_json_data``.
    """
    source = load_json_data(json_file_path)
    print("Loaded Data:", source)
    print(source.items())

    records = process_json_data(source)
    print("Processed Data:", records)

    sync_with_mongodb(records)
    
def run_data_aggr(collection):
    """Aggregate project data with associated articles.

    Runs a single-stage ``$lookup`` pipeline on ``collection`` that pulls in
    documents from the ``article`` collection, matching the local ``_id``
    against the foreign ``projectId``, and stores them under ``articles``.

    Args:
        collection: Object exposing ``aggregate(pipeline)``.

    Returns:
        The aggregation results materialised as a list.
    """
    join_articles = {
        "$lookup": {
            "from": "article",
            "localField": "_id",
            "foreignField": "projectId",
            "as": "articles",
        }
    }
    cursor = collection.aggregate([join_articles])
    return list(cursor)
    
    
def task2():
    """Embed documents from a source collection and store them in the vector store.

    Opens a separate connection to the source database, runs
    ``run_data_aggr`` on the source collection, embeds the resulting
    documents with ``process_collection_data`` and inserts the records via
    ``sync_with_mongodb``.
    """
    MONGO_URI2 = ""
    DB_NAME2 = ""
    COLLECTION_NAME2 = ""

    # The source connection is scoped to the read so it is always closed,
    # even if the aggregation fails (it was previously never closed).
    # Locals are prefixed with ``source_`` so they are not mistaken for the
    # module-level ``collection`` that sync_with_mongodb() writes to.
    with pymongo.MongoClient(MONGO_URI2) as source_client:
        source_collection = source_client[DB_NAME2][COLLECTION_NAME2]
        print("Connected to MongoDB:")

        # run_data_aggr() returns a fully materialised list, so nothing
        # below needs the source connection any more.
        raw_data = run_data_aggr(source_collection)

    processed_data = process_collection_data(raw_data)
    print("Processed Data:", processed_data)
    sync_with_mongodb(processed_data)
    
    


In [None]:
# Driver cell: ingest the JSON file first (task1), then the aggregated
# collection documents (task2).
# NOTE(review): both tasks end in sync_with_mongodb(), which inserts
# unconditionally -- nothing visible here checks for existing records, so
# re-running this cell presumably stores the same records again.
if __name__ == "__main__":
    # Absolute, machine-specific path; must be edited before running elsewhere.
    json_file_path = r"C:\Users\anant\Desktop\website repo\projects playground\about_me_llm\data_about_me_2.json"  # Replace with your JSON file path
    task1(json_file_path)
    task2()
    

In [None]:
# Query-side embedding wrapper; uses the same model name as the
# SentenceTransformer that produced the stored embeddings.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Vector store over the module-level `collection`. text_key / embedding_key
# name the "text" and "embedding" fields written by the processing functions.
vector_store = MongoDBAtlasVectorSearch(
        collection=collection,
        embedding=embeddings,
        text_key="text",
        embedding_key="embedding",
        index_name="default",  # Replace with your index name
        relevance_score_fn = "cosine" # NOTE(review): presumably must match the similarity metric of the Atlas index -- confirm
        )
# Fetch the 10 stored chunks most similar to the query text.
results = vector_store.similarity_search(query = "Tell me about ananths work experience", k=10)

# Print only the text of each hit.
for result in results:
    print(result.page_content)