In [28]:
!pip install pymilvus ollama dspy openai pandas nltk

I0000 00:00:1735807595.565955    3055 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers


Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.9.1


# Sentence level embeddings

In [1]:
from ollama import Client
import numpy as np
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility

ollama = Client(host='http://ollama:11434')

def get_ollama_embeddings(text):
    embedding_output = ollama.embeddings(
        model='mxbai-embed-large',
        prompt=text
    )
    return np.array(embedding_output.embedding, dtype=np.float32)  # Convert to float32

# Connect to Milvus server - remove http:// prefix
connections.connect(host='milvus', port='19530')

# Sample documents
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England."
]

# Generate embeddings using Ollama
embeddings = [get_ollama_embeddings(doc) for doc in docs]
embeddings = np.array(embeddings, dtype=np.float32)  # Ensure float32 type

# Define collection schema
fields = [
    FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=100),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=500),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=1024)
]

schema = CollectionSchema(fields, "Text embeddings collection")
collection_name = "text_embeddings"

# Drop existing collection if it exists
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)

# Create collection
collection = Collection(collection_name, schema)

# Insert data
collection.insert([
    [str(i) for i in range(len(docs))],  # id
    docs,  # text
    embeddings.tolist()  # embeddings as list
])

# Create index for vector field
index_params = {
    "metric_type": "COSINE",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128}
}
collection.create_index("embeddings", index_params)
collection.load()

print(f"Created collection with {collection.num_entities} entities")
connections.disconnect("default")


Created collection with 0 entities


# Paragraph Embeddings

In [2]:
from ollama import Client
import numpy as np
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility
import nltk

# Set NLTK data directory
nltk.data.path.append("./nltk_data")
nltk.download('punkt', download_dir="./nltk_data")
nltk.download('punkt_tab', download_dir="./nltk_data")

from nltk.tokenize import sent_tokenize

ollama = Client(host='http://ollama:11434')

def get_ollama_embeddings(text):
    embedding_output = ollama.embeddings(
        model='mxbai-embed-large',
        prompt=text
    )
    return np.array(embedding_output.embedding, dtype=np.float32)

def chunk_document(text, chunk_size=3):
    """Split document into chunks of sentences"""
    sentences = sent_tokenize(text)
    chunks = []
    
    for i in range(0, len(sentences), chunk_size):
        chunk = ' '.join(sentences[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

# Connect to Milvus
connections.connect(host='milvus', port='19530')

# Sample documents with multiple sentences
docs = [
    """Alan Turing was a brilliant mathematician and computer scientist. 
    He played a crucial role in breaking the Enigma code during World War II. 
    His work laid the foundation for modern computing and artificial intelligence. 
    The Turing test, which he proposed, remains influential in AI research today.""",
    
    """The development of artificial intelligence has transformed many industries. 
    Machine learning models can now perform complex tasks with high accuracy. 
    Deep learning has revolutionized fields like computer vision and natural language processing. 
    However, challenges remain in achieving human-like general intelligence."""
]

# Process documents into chunks
processed_docs = []
doc_ids = []
chunk_texts = []

for doc_id, doc in enumerate(docs):
    chunks = chunk_document(doc)
    for chunk_id, chunk in enumerate(chunks):
        processed_docs.append({
            'id': f"{doc_id}_{chunk_id}",
            'doc_id': str(doc_id),
            'text': chunk
        })
        chunk_texts.append(chunk)

# Generate embeddings for all chunks
embeddings = [get_ollama_embeddings(chunk) for chunk in chunk_texts]
embeddings = np.array(embeddings, dtype=np.float32)

# Define collection schema
fields = [
    FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=100),
    FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=100),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2000),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=1024)
]

schema = CollectionSchema(fields, "Text embeddings collection")
collection_name = "text_embeddings"

# Drop existing collection if it exists
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)

# Create and load collection
collection = Collection(collection_name, schema)

# Insert data
collection.insert([
    [doc['id'] for doc in processed_docs],         # id
    [doc['doc_id'] for doc in processed_docs],     # doc_id
    [doc['text'] for doc in processed_docs],       # text
    embeddings.tolist()                            # embeddings
])

# Create index
index_params = {
    "metric_type": "COSINE",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128}
}
collection.create_index("embeddings", index_params)
collection.load()

# Example search function
def search_documents(query_text, top_k=3):
    query_embedding = get_ollama_embeddings(query_text)
    
    search_params = {
        "metric_type": "COSINE",
        "params": {"nprobe": 10},
    }
    
    results = collection.search(
        data=[query_embedding.tolist()],
        anns_field="embeddings",
        param=search_params,
        limit=top_k,
        output_fields=["doc_id", "text"]
    )
    
    return results[0]

# Example usage
query = "What is Alan Turing's contribution to computing?"
results = search_documents(query)

print("\nSearch Results:")
for hit in results:
    print(f"\nScore: {hit.score}")
    print(f"Document ID: {hit.entity.get('doc_id')}")
    print(f"Text: {hit.entity.get('text')}")

connections.disconnect("default")

[nltk_data] Downloading package punkt to ./nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to ./nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!



Search Results:

Score: 0.8507258892059326
Document ID: 0
Text: Alan Turing was a brilliant mathematician and computer scientist. He played a crucial role in breaking the Enigma code during World War II. His work laid the foundation for modern computing and artificial intelligence.

Score: 0.6764159202575684
Document ID: 0
Text: The Turing test, which he proposed, remains influential in AI research today.

Score: 0.5447987914085388
Document ID: 1
Text: The development of artificial intelligence has transformed many industries. Machine learning models can now perform complex tasks with high accuracy. Deep learning has revolutionized fields like computer vision and natural language processing.


# Very Basic RAG

In [None]:
from ollama import Client
import numpy as np
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility
import nltk

# Set NLTK data directory
nltk.data.path.append("./nltk_data")
nltk.download('punkt', download_dir="./nltk_data")
nltk.download('punkt_tab', download_dir="./nltk_data")

from nltk.tokenize import sent_tokenize

ollama = Client(host='http://ollama:11434')

def get_ollama_embeddings(text):
    embedding_output = ollama.embeddings(
        model='mxbai-embed-large',
        prompt=text
    )
    return np.array(embedding_output.embedding, dtype=np.float32)

def chunk_document(text, chunk_size=3):
    """Split document into chunks of sentences"""
    sentences = sent_tokenize(text)
    chunks = []
    
    for i in range(0, len(sentences), chunk_size):
        chunk = ' '.join(sentences[i:i + chunk_size])
        chunks.append(chunk)
    return chunks

# Connect to Milvus
connections.connect(host='milvus', port='19530')

# Sample documents with multiple sentences
docs = [
    """Alan Turing was a brilliant mathematician and computer scientist. 
    He played a crucial role in breaking the Enigma code during World War II. 
    His work laid the foundation for modern computing and artificial intelligence. 
    The Turing test, which he proposed, remains influential in AI research today.""",
    
    """The development of artificial intelligence has transformed many industries. 
    Machine learning models can now perform complex tasks with high accuracy. 
    Deep learning has revolutionized fields like computer vision and natural language processing. 
    However, challenges remain in achieving human-like general intelligence."""
]

# Process documents into chunks
processed_docs = []
doc_ids = []
chunk_texts = []

for doc_id, doc in enumerate(docs):
    chunks = chunk_document(doc)
    for chunk_id, chunk in enumerate(chunks):
        processed_docs.append({
            'id': f"{doc_id}_{chunk_id}",
            'doc_id': str(doc_id),
            'text': chunk
        })
        chunk_texts.append(chunk)

# Generate embeddings for all chunks
embeddings = [get_ollama_embeddings(chunk) for chunk in chunk_texts]
embeddings = np.array(embeddings, dtype=np.float32)

# Define collection schema
fields = [
    FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=100),
    FieldSchema(name="doc_id", dtype=DataType.VARCHAR, max_length=100),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=2000),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=1024)
]

schema = CollectionSchema(fields, "Text embeddings collection")
collection_name = "text_embeddings"

# Drop existing collection if it exists
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)

# Create and load collection
collection = Collection(collection_name, schema)

# Insert data
collection.insert([
    [doc['id'] for doc in processed_docs],         # id
    [doc['doc_id'] for doc in processed_docs],     # doc_id
    [doc['text'] for doc in processed_docs],       # text
    embeddings.tolist()                            # embeddings
])

# Create index
index_params = {
    "metric_type": "COSINE",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 128}
}
collection.create_index("embeddings", index_params)
collection.load()

# Example search function
def search_documents(query_text, top_k=3):
    query_embedding = get_ollama_embeddings(query_text)
    
    search_params = {
        "metric_type": "COSINE",
        "params": {"nprobe": 10},
    }
    
    results = collection.search(
        data=[query_embedding.tolist()],
        anns_field="embeddings",
        param=search_params,
        limit=top_k,
        output_fields=["doc_id", "text"]
    )
    
    return results[0]

# Example usage
query = "What is Alan Turing's contribution to computing?"
 
print("\nSearch Results:")
for hit in results:
    print(f"\nScore: {hit.score}")
    print(f"Document ID: {hit.entity.get('doc_id')}")
    print(f"Text: {hit.entity.get('text')}")

connections.disconnect("default")

context = "\n".join([hit.entity.get('text') for hit in results])
question = "What is Alan Turing's contribution to computing?"

prompt = f"""Context: {context}\n\nQuestion: {question}\n\nAnswer:"""
response = ollama.chat(model='llama3.2:3b', messages=[
            {"role": "system", "content": "You are a helpful assistant. Answer the question based on the given context."},
            {"role": "user", "content": prompt}
        ])

print(response['message']['content'])

Alan Turing made two significant contributions to computing:

1. Breaking the Enigma code during World War II, which laid the foundation for modern cryptography.
2. Laying the groundwork for modern computing by proposing the concept of the "Turing Machine" and advocating for the development of practical computers.

Additionally, his work in artificial intelligence is also a notable contribution, as he proposed the Turing test, which remains influential in AI research today.
