In [85]:
!pip install chromadb
!pip install transformers
!pip install torch
!pip install scikit-learn



In [None]:
import chromadb
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [86]:
def initialize_chromadb(collection_name="enterprise_data"):
    """Create a ChromaDB client and hand back the requested collection.

    Args:
        collection_name: Name passed to ``get_or_create_collection``.

    Returns:
        The collection object returned by the client for that name.
    """
    return chromadb.Client().get_or_create_collection(name=collection_name)


In [87]:
def initialize_transformer_model():
    """Load the pretrained tokenizer and model used for embeddings.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the
        ``sentence-transformers/all-MiniLM-L6-v2`` checkpoint.
    """
    checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModel.from_pretrained(checkpoint),
    )

In [88]:
def generate_embedding(text, tokenizer, model):
    """Generate a mean-pooled embedding for ``text`` using a transformer model.

    The text is tokenized (with padding and truncation enabled), run through
    the model without gradient tracking, and the token vectors in
    ``last_hidden_state`` are averaged along dimension 1.

    When the tokenizer output contains an ``attention_mask``, positions whose
    mask is 0 are excluded from the average so that padding tokens do not
    dilute the embedding when several texts of different lengths are passed
    at once. For a single string nothing is padded, so the result matches a
    plain mean.

    Args:
        text: Input passed straight to ``tokenizer``.
        tokenizer: Callable returning a mapping of tensors accepted by
            ``model(**tokens)``.
        model: Callable whose result exposes ``last_hidden_state``
            (assumed to be shaped (batch, sequence, hidden) -- TODO confirm).

    Returns:
        The pooled embedding as a NumPy array with size-1 dimensions squeezed
        away.
    """
    tokens = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )

    with torch.no_grad():
        output = model(**tokens)
        hidden_states = output.last_hidden_state

        attention_mask = tokens["attention_mask"] if "attention_mask" in tokens else None
        if attention_mask is None:
            # No mask available: fall back to an unweighted mean over tokens.
            embeddings = torch.mean(hidden_states, dim=1)
        else:
            # Broadcast the mask over the hidden dimension and average only
            # over real (non-padding) tokens.
            weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
            token_sum = (hidden_states * weights).sum(dim=1)
            # Clamp guards against division by zero for an all-masked row.
            token_count = weights.sum(dim=1).clamp(min=1e-9)
            embeddings = token_sum / token_count

    return embeddings.squeeze().numpy()


In [89]:
def add_company_data(collection, tokenizer, model):
    """Seed the collection with the three built-in company documents.

    Each document is embedded with ``generate_embedding``; any records already
    stored under the same ids are deleted, and then documents, metadata, ids
    and embeddings are added in one call.

    Args:
        collection: Object exposing ``delete(ids=...)`` and ``add(...)``.
        tokenizer: Passed through to ``generate_embedding``.
        model: Passed through to ``generate_embedding``.

    Returns:
        A fixed confirmation message.
    """
    documents = [
        """Our organization, founded in 2015, has established itself as a leading technology solutions provider
            in the enterprise software sector. With a global presence across 25 countries and a workforce of over
            5,000 skilled professionals, we deliver cutting-edge solutions to Fortune 500 companies.""",

        """Our offices maintain standard business hours from 8:00 AM to 6:00 PM (local time) on weekdays.
            Executive offices are accessible by appointment. Remote work options are available for eligible positions,
            with 24/7 support services maintained through our global operations centers.""",

        """We offer a comprehensive suite of enterprise solutions including cloud infrastructure services,
            cybersecurity frameworks, AI-powered analytics platforms, and custom software development. Our products
            are backed by industry-leading SLAs and 24/7 technical support.""",
    ]
    metadatas = [
        {"category": "company_profile", "department": "corporate_communications"},
        {"category": "operations", "department": "facilities"},
        {"category": "products_services", "department": "product_management"},
    ]
    ids = ["corp_profile_001", "ops_hours_001", "products_001"]

    # One embedding per document, in document order.
    embeddings = [
        generate_embedding(document, tokenizer, model) for document in documents
    ]

    # Drop any records stored under these ids before re-adding them.
    collection.delete(ids=ids)
    collection.add(
        documents=documents,
        metadatas=metadatas,
        ids=ids,
        embeddings=embeddings,
    )
    return "Company data successfully added to the database."

In [90]:
def search_documents(collection, query, tokenizer, model, n_results=2):
    """Embed ``query``, look up the nearest documents, and format the hits.

    Args:
        collection: Object exposing ``query(query_embeddings=..., n_results=...)``.
        query: Search text; also forwarded to ``format_search_results``.
        tokenizer: Passed through to ``generate_embedding``.
        model: Passed through to ``generate_embedding``.
        n_results: Number of results requested from the collection.

    Returns:
        Whatever ``format_search_results`` produces for the raw query result.
    """
    query_vector = generate_embedding(query, tokenizer, model)
    raw_hits = collection.query(query_embeddings=[query_vector], n_results=n_results)
    return format_search_results(raw_hits, query)

In [91]:
def format_search_results(results, query):
    """Turn a raw query result into a list of per-document dictionaries.

    Only the first entry of ``results['documents']``, ``results['metadatas']``
    and ``results['distances']`` is read. Each output entry carries the
    document, its metadata, the ``calculate_similarity`` score against
    ``query`` and the vector distance, both rounded to three decimals.

    Args:
        results: Mapping with 'documents', 'metadatas' and 'distances' keys.
        query: Text the similarity score is computed against.

    Returns:
        A list of dicts with keys 'document', 'metadata', 'similarity_score'
        and 'vector_distance'.
    """
    documents, metadatas, distances = (
        results[field][0] for field in ('documents', 'metadatas', 'distances')
    )

    return [
        {
            'document': text,
            'metadata': metadata,
            'similarity_score': round(calculate_similarity(query, text), 3),
            'vector_distance': round(float(raw_distance), 3),
        }
        for text, metadata, raw_distance in zip(documents, metadatas, distances)
    ]


In [92]:
def calculate_similarity(query, text):
    """Calculate the TF-IDF cosine similarity between ``query`` and ``text``.

    A ``TfidfVectorizer`` is fitted on the two strings and the cosine
    similarity of their vectors is returned.

    Args:
        query: First string.
        text: Second string.

    Returns:
        The similarity as a plain ``float``. Returns ``0.0`` when the
        vectorizer cannot be fitted because neither string produces a token.
    """
    try:
        vectorizer = TfidfVectorizer().fit([query, text])
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when both inputs
        # are empty or contain nothing it tokenizes; with no shared vocabulary
        # there is no lexical overlap to report.
        return 0.0

    vectors = vectorizer.transform([query, text])
    # Cast so callers get a builtin float rather than a NumPy scalar.
    return float(cosine_similarity(vectors[0], vectors[1])[0][0])

In [93]:
def filter_by_category(collection, category):
    """Query the collection for documents whose metadata category matches.

    The lookup is issued as a text query with an empty query string, capped at
    10 results and restricted by a ``where`` filter on the ``category`` field.

    Args:
        collection: Object exposing ``query(query_texts=..., n_results=..., where=...)``.
        category: Value the ``category`` metadata field must equal.

    Returns:
        The raw result of ``collection.query``.
    """
    # NOTE(review): this is a similarity query against "" rather than a plain
    # metadata lookup; a `get(where=...)`-style call may be a better fit, but
    # it would presumably change the result layout callers index into -- confirm.
    metadata_filter = {"category": category}
    return collection.query(query_texts=[""], n_results=10, where=metadata_filter)


In [98]:
def update_document(collection, tokenizer, model, doc_id, new_text, new_metadata):
    """Replace the document stored under ``doc_id`` with new text and metadata.

    The replacement embedding is computed first; only then is the existing
    record deleted and the new one added. If embedding generation raises, the
    stored document is left untouched instead of being deleted with nothing
    to replace it.

    Args:
        collection: Object exposing ``delete(ids=...)`` and ``add(...)``.
        tokenizer: Passed through to ``generate_embedding``.
        model: Passed through to ``generate_embedding``.
        doc_id: Id of the document to replace.
        new_text: Replacement document text.
        new_metadata: Replacement metadata.

    Returns:
        A confirmation message naming ``doc_id``.
    """
    # Do the fallible work before touching the stored record.
    new_embedding = generate_embedding(new_text, tokenizer, model)

    collection.delete(ids=[doc_id])
    collection.add(
        documents=[new_text],
        metadatas=[new_metadata],
        ids=[doc_id],
        embeddings=[new_embedding]
    )
    return f"Document {doc_id} successfully updated."


In [97]:
def delete_document(collection, doc_id):
    """Remove the document stored under ``doc_id`` from the collection.

    Args:
        collection: Object exposing ``delete(ids=...)``.
        doc_id: Id of the document to remove.

    Returns:
        A confirmation message naming ``doc_id``.
    """
    target_ids = [doc_id]
    collection.delete(ids=target_ids)
    confirmation = f"Document {doc_id} successfully deleted."
    return confirmation

In [99]:
def main():
    """Run the demo: seed the collection, search, filter, update, delete.

    Every step prints its outcome to stdout; nothing is returned.
    """
    # Set up storage and the embedding model.
    collection = initialize_chromadb()
    tokenizer, model = initialize_transformer_model()

    # Seed the collection with the built-in documents.
    seed_status = add_company_data(collection, tokenizer, model)
    print(seed_status)

    # Semantic search demo.
    print("\nSearching for information about working hours:")
    hours_query = "What are the company's working hours?"
    results = search_documents(collection, hours_query, tokenizer, model)
    for idx, result in enumerate(results, start=1):
        print(f"\nResult {idx}:")
        print(f"Content: {result['document']}")
        print(f"Metadata: {result['metadata']}")
        print(f"Similarity Score: {result['similarity_score']}")

    # Metadata filter demo.
    print("\nFiltering documents by category 'products_services':")
    product_docs = filter_by_category(collection, "products_services")
    matched_documents = product_docs['documents'][0]
    matched_metadatas = product_docs['metadatas'][0]
    for doc, meta in zip(matched_documents, matched_metadatas):
        print(f"\nDocument: {doc}")
        print(f"Metadata: {meta}")

    # Update demo.
    print("\nUpdating a document:")
    profile_metadata = {"category": "company_profile", "department": "corporate_communications"}
    update_status = update_document(
        collection,
        tokenizer,
        model,
        "corp_profile_001",
        "Updated company profile text",
        profile_metadata,
    )
    print(update_status)

    # Delete demo.
    print("\nDeleting a document:")
    delete_status = delete_document(collection, "ops_hours_001")
    print(delete_status)

In [100]:
# Run the demo only when this code is executed as the top-level script
# (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


Company data successfully added to the database.

Searching for information about working hours:

Result 1:
Content: Our offices maintain standard business hours from 8:00 AM to 6:00 PM (local time) on weekdays. 
            Executive offices are accessible by appointment. Remote work options are available for eligible positions, 
            with 24/7 support services maintained through our global operations centers.
Metadata: {'category': 'operations', 'department': 'facilities'}
Similarity Score: 0.101

Result 2:
Content: Our organization, founded in 2015, has established itself as a leading technology solutions provider 
            in the enterprise software sector. With a global presence across 25 countries and a workforce of over 
            5,000 skilled professionals, we deliver cutting-edge solutions to Fortune 500 companies.
Metadata: {'category': 'company_profile', 'department': 'corporate_communications'}
Similarity Score: 0.033

Filtering documents by category 'products_services':




Document: We offer a comprehensive suite of enterprise solutions including cloud infrastructure services, 
            cybersecurity frameworks, AI-powered analytics platforms, and custom software development. Our products 
            are backed by industry-leading SLAs and 24/7 technical support.
Metadata: {'category': 'products_services', 'department': 'product_management'}

Updating a document:
Document corp_profile_001 successfully updated.

Deleting a document:
Document ops_hours_001 successfully deleted.
