# Generate the embeddings from the text files that need to be chunked and then ingested 


In [1]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import ollama
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os 
def load_text(file_path):
    """
    Read a text file and return its full contents.

    The first 10,000 characters are echoed to stdout as a quick preview.

    Args:
        file_path (str): Path of the text file to read

    Returns:
        str | None: The file contents, or None (after printing a message)
        when the file does not exist
    """
    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding; undecodable bytes are replaced
        # rather than aborting the whole ingestion run.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
            text = file.read()
    except FileNotFoundError:
        print("File not found. Please make sure {0} exists.".format(file_path))
        return None
    print(text[0:10000])
    return text
# text = load_text("DataLake/magma.txt")

In [3]:
# Text chunking with a sliding character window and overlap (defaults: 300-char window, 100-char overlap)

def chunk_text(text, window_size=300, overlap=100):
    """
    Chunk text into overlapping segments of specified window size

    Consecutive chunks start ``window_size - overlap`` characters apart.
    Chunks that contain only whitespace are dropped.

    Args:
        text (str): Input text to chunk
        window_size (int): Size of each chunk in characters
        overlap (int): Number of overlapping characters between chunks

    Returns:
        list: List of text chunks

    Raises:
        ValueError: If window_size is not positive, or overlap is not
            smaller than window_size (the window would never advance)
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")
    if overlap >= window_size:
        # A step of zero or less would never reach the end of the text.
        raise ValueError("overlap must be smaller than window_size")

    step = window_size - overlap
    # Slicing clamps at the end of the string, so the final window is simply
    # shorter when fewer than window_size characters remain.
    windows = (text[start:start + window_size]
               for start in range(0, len(text), step))
    return [window for window in windows if window.strip()]

# Create chunks with a 1000 char window and 200 char overlap
# data_chunks = chunk_text(text, window_size=1000, overlap=200)

# print(f"Created {len(data_chunks)} chunks")
# print(f"\nFirst chunk sample:\n{data_chunks[0][:200]}...")


In [4]:
# load the model 
def load_model(model_name):
    """
    Build the SentenceTransformer used to embed chunks and queries

    Args:
        model_name (str): Model identifier handed to SentenceTransformer

    Returns:
        SentenceTransformer: The instantiated model
    """
    # NOTE(review): trust_remote_code=True presumably allows Python code
    # shipped with the model repository to run locally -- confirm, and only
    # load models from sources you trust.
    return SentenceTransformer(model_name, trust_remote_code=True)

# model = load_model("nomic-ai/CodeRankEmbed")


In [5]:
# start the embedding process where the chunks that are created are embedded using the huggingface model 

def embed_chunks(chunks, model):
    """
    Turn every text chunk into an embedding with the given model

    Args:
        chunks (list): Text chunks to embed
        model: Object exposing ``encode`` (e.g. a SentenceTransformer)

    Returns:
        The value returned by ``model.encode(chunks)``; expected to be an
        np.ndarray of embeddings -- confirm against the model in use
    """
    return model.encode(chunks)

# embeddings = embed_chunks(data_chunks,model)
# print(embeddings)
# similarities = model.similarity(embeddings, embeddings)
# print(similarities.shape)

In [6]:
# Create the vector store 
def create_vector_store(embeddings, *, m=100, ef_construction=100, ef_search=100):
    """
    Create a vector store using FAISS HNSW index

    Args:
        embeddings (np.ndarray): 2-D array of embeddings, one row per vector
        m (int): Number of connections per layer, passed to
            faiss.IndexHNSWFlat
        ef_construction (int): Value assigned to ``hnsw.efConstruction``
            (higher = more accurate but slower construction)
        ef_search (int): Value assigned to ``hnsw.efSearch``
            (higher = more accurate but slower search)

    Returns:
        faiss.Index: HNSW vector store holding all the embeddings
    """
    # Each row is one vector, so the second axis is the dimensionality
    dimension = embeddings.shape[1]

    vector_store = faiss.IndexHNSWFlat(dimension, m)

    # Tuning knobs are set before add() so they apply while the graph is built
    vector_store.hnsw.efConstruction = ef_construction
    vector_store.hnsw.efSearch = ef_search

    # Make sure embeddings are float32 before adding to index
    vector_store.add(embeddings.astype('float32'))
    print(f"Added {len(embeddings)} vectors to HNSW index")

    return vector_store

# vector_store = create_vector_store(embeddings)


In [7]:
# store the vector store 
def store_vector_store(vector_store, file_path):
    """
    Store the vector store to a file

    The object is serialized with pickle. Missing parent directories of
    file_path are created first, so a freshly built index is not lost just
    because the output folder does not exist yet.

    Args:
        vector_store: Object to persist (any picklable object)
        file_path (str): Destination file; overwritten if it exists
    """
    import os  # local import keeps this function self-contained

    parent_dir = os.path.dirname(os.path.abspath(file_path))
    os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'wb') as f:
        pickle.dump(vector_store, f)

def load_vector_store(file_path):
    """
    Read back a pickled vector store from disk

    Args:
        file_path (str): Path of the pickle file to read

    Returns:
        The object that was pickled into the file
    """
    # NOTE: unpickling can execute arbitrary code -- only load files that
    # were produced by a trusted source.
    with open(file_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored


In [8]:
def query_vector_store(query, vector_store, model, data_chunks, k=5):
    """
    Retrieve the text chunks most similar to a query

    The query is embedded with ``model``, the ``k`` nearest vectors are
    looked up in ``vector_store`` and the matching entries of
    ``data_chunks`` are concatenated in the order the search returned them.

    Args:
        query (str): Query to search for
        vector_store: FAISS index to search (must expose ``search``)
        model: SentenceTransformer model used to embed the query
        data_chunks (list): Text chunks, positionally aligned with the
            vectors stored in ``vector_store``
        k (int): Maximum number of chunks to retrieve

    Returns:
        str: The retrieved chunks, each followed by a newline
    """
    query_embedding = model.encode([query]).astype('float32')
    _distances, indices = vector_store.search(query_embedding, k=k)

    # FAISS pads the result with -1 when it finds fewer than k neighbours.
    # Those must be skipped: as a Python index, -1 would silently pull in
    # the last chunk instead of "no result".
    hits = (data_chunks[i] for i in indices[0] if i >= 0)
    return "".join(chunk + "\n" for chunk in hits)

# query = "merging the ss tables , how?"
# context = query_vector_store(query, vector_store, model, data_chunks)
# print(context)

# test out the embeddings 

In [9]:
# now add the context to the query for the LLM

def add_context_to_query(query, context):
    """
    Build the prompt text sent to the LLM

    Args:
        query (str): The user's question
        context (str): Retrieved text to accompany the question

    Returns:
        str: A "Query: ..." line followed by a "Context: ..." section
    """
    query_line = f"Query: {query}"
    context_section = f"Context: {context}"
    return "\n".join((query_line, context_section))

# query_with_context = add_context_to_query(query, context)
# print(query_with_context)
# # now use the LLM to answer the query 


# add_context_to_query(query, context)

In [10]:
# send the query to the LLM 
def get_ollama_suggestions(query_with_context, *, model='deepseek-r1:14b', temperature=0.8):
    """
    Send the prompt to an Ollama chat model and stream the reply to stdout

    Args:
        query_with_context (str): Full prompt (query plus retrieved context)
        model (str): Name of the Ollama model to chat with
        temperature (float): Sampling temperature passed in the request
            options

    Returns:
        str: The fixed marker "Streaming complete"; the reply itself is
        only printed, not returned
    """
    messages = [{'role': 'user', 'content': query_with_context}]
    response = ollama.chat(
        model=model,
        messages=messages,
        options={"temperature": temperature},
        stream=True,
    )

    # With stream=True the response is iterated piece by piece; each piece
    # is printed immediately so the answer appears as it is generated.
    for chunk in response:
        print(chunk["message"]["content"], end='', flush=True)

    return "Streaming complete"

# get_ollama_suggestions(query_with_context)

In [11]:
# text = load_text("DataLake/magma.txt")
# # Create chunks with a 1000 char window and 200 char overlap
# data_chunks = chunk_text(text, window_size=1000, overlap=200)

# print(f"Created {len(data_chunks)} chunks")
# print(f"\nFirst chunk sample:\n{data_chunks[0][:200]}...")

# model = load_model("nomic-ai/CodeRankEmbed")

# embeddings = embed_chunks(data_chunks,model)
# print(embeddings)
# similarities = model.similarity(embeddings, embeddings)
# print(similarities.shape)

# vector_store = create_vector_store(embeddings)
# store_vector_store(vector_store=vector_store, file_path="VectorStore/magma.pkl")

# vector_store = load_vector_store(file_path="VectorStore/magma.pkl")
# query = "merging the ss tables , how?"
# context = query_vector_store(query, vector_store, model, data_chunks)
# print(context)

# query_with_context = add_context_to_query(query, context)
# print(query_with_context)
# # now use the LLM to answer the query 


# add_context_to_query(query, context)

# get_ollama_suggestions(query_with_context)

In [None]:
# query = "what's the use of magma "
# context = query_vector_store(query, vector_store, model, data_chunks)
# print(context)

# query_with_context = add_context_to_query(query, context)
# print(query_with_context)
# # now use the LLM to answer the query 


# add_context_to_query(query, context)

# get_ollama_suggestions(query_with_context)

: 

In [None]:
# End-to-end run: ingest DataLake/TAF.txt, build and persist the HNSW index,
# then answer one query with the retrieved context.
text = load_text("DataLake/TAF.txt")
# load_text returns None when the file is missing; stop here with a clear
# error instead of failing later inside chunk_text.
if text is None:
    raise FileNotFoundError("DataLake/TAF.txt")

# Create chunks with a 300 char window and 100 char overlap
data_chunks = chunk_text(text, window_size=300, overlap=100)

print(f"Created {len(data_chunks)} chunks")
print(f"\nFirst chunk sample:\n{data_chunks[0][:200]}...")

model = load_model("nomic-ai/CodeRankEmbed")

embeddings = embed_chunks(data_chunks,model)
print(embeddings)
# NOTE(review): this builds a full N x N similarity matrix only to print its
# shape; memory grows quadratically with the number of chunks. Consider
# dropping it or sampling a subset once the pipeline is validated.
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)

vector_store = create_vector_store(embeddings)
store_vector_store(vector_store=vector_store, file_path="VectorStore/TAF_HNSW.pkl")

# Reload from disk so the query below exercises the persisted index
vector_store = load_vector_store(file_path="VectorStore/TAF_HNSW.pkl")
query = "what's the use of check history_retention function in magma?"
context = query_vector_store(query, vector_store, model, data_chunks)
print(context)

query_with_context = add_context_to_query(query, context)
print(query_with_context)

# now use the LLM to answer the query
get_ollama_suggestions(query_with_context)

├── .gitignore
├── .gitmodules
├── LICENSE
├── Makefile
├── README.md
├── TestInput.py
├── b
    ├── resources
    │   ├── 1-node-template.ini
    │   ├── 2-nodes-template.ini
    │   ├── 20-nodes-template.ini
    │   ├── 3-nodes-template.ini
    │   ├── 35-nodes-template.ini
    │   ├── 4-nodes-n1ql-index-template.ini
    │   ├── 4-nodes-n1ql-template.ini
    │   ├── 4-nodes-template-KV.ini
    │   ├── 4-nodes-template-cbas-multi-cluster.ini
    │   ├── 4-nodes-template-cbas.ini
    │   ├── 4-nodes-template-sanity.ini
    │   ├── 4-nodes-template.ini
    │   ├── 5-nodes-bkrs-2clusters.ini
    │   ├── 5-nodes-n1ql-index-template.ini
    │   ├── 5-nodes-template.ini
    │   ├── 6-nodes-template-cbas.ini
    │   ├── 6-nodes-template-ce.ini
    │   ├── 6-nodes-template-multi-cluster.ini
    │   ├── 6-nodes-template-n1ql-xdcr.ini
    │   ├── 6-nodes-template-n1ql.ini
    │   ├── 6-nodes-template-xdcr.ini
    │   ├── 6-nodes-template.ini
    │   ├── 7-nodes-template.ini
    │   ├── 8-nodes-

<All keys matched successfully>


[[-0.08511648  0.6917985  -0.5125859  ...  1.3397381  -0.8910495
  -1.3364283 ]
 [-0.45843035  0.7126437  -0.32616398 ...  0.80734634 -1.1409655
  -1.0520488 ]
 [-0.9916344   0.60744303 -0.4247943  ...  0.3345994  -1.1995372
  -0.9208341 ]
 ...
 [ 0.26201653 -0.3331136  -0.04144606 ... -2.0046778  -0.5560208
  -1.5676023 ]
 [-0.57169724  0.34139484 -0.38562724 ... -1.7988298   0.01153683
   0.2464078 ]
 [ 0.16628617 -0.78377783 -1.1569986  ...  0.3038962  -0.10967173
   0.24487083]]
torch.Size([112470, 112470])


In [65]:
# Follow-up query against the index, model and chunks built in the cell above
query = "How to use RemoteShellConnection in test and give me usage"
context = query_vector_store(query, vector_store, model, data_chunks)
print(context)

query_with_context = add_context_to_query(query, context)
print(query_with_context)

# now use the LLM to answer the query
get_ollama_suggestions(query_with_context)

oteMachineShellConnection(self.server)
 45 |         output, error = remote_client.execute_command(self.command)
 46 |         print(self.server.ip)
 47 |         print("\n".join(output))
 48 |         print("\n".join(error))
 49 |         remote_client.disconnect()
 50 | 
 51 | 
 52 | class ScriptRunner(object):
 53 |     def __init__(self, server, script):
 54 |         self.server = server
 55 |         with open(script) as  f:
 56 |             self.script_content = f.read()
 57 |         self.script_name = "/tmp/" + str(uuid.uuid4())
 58 | 
 59 |     def run(self):
 60 |         remote_client = RemoteMachineShellConnection(self.server)
 61 |         remote_client.create_file(self.script_name, self.script_content)
 62 |         output, error = remote_client.execute_command(
 63 |             "chmod 777 {0} ; {0} ; rm -f {0}".format(self.script_name))
 64 |         print(self.server.ip)
 65 |         print("\n".join(output))
 66 |         print("\n".join(error))
 67 |         remote

'Streaming complete'