In [69]:
import chromadb
import uuid
from chromadb.config import Settings
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
import ollama
import json

In [33]:
# Persistent Chroma client stored under ./chroma_vectorDB, with anonymized telemetry disabled
chroma_settings = Settings(anonymized_telemetry=False)
client = chromadb.PersistentClient(
    path="./chroma_vectorDB",
    settings=chroma_settings,
)

In [None]:
# Load the PDFs matched by the glob under ./docs, reading each one with PyPDFLoader
loader = DirectoryLoader(
    path="./docs",
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
)
documents = loader.load()

In [None]:
# documents

In [None]:
# Break the loaded documents into overlapping chunks (chunk_size=1000, chunk_overlap=150)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
)
chunked_docs = text_splitter.split_documents(documents=documents)

In [None]:
# chunked_docs

In [None]:
# Keep only the text of each chunk; this list is what gets embedded and stored
chunked_docs_list = [chunk.page_content for chunk in chunked_docs]

In [None]:
# chunked_docs_list

In [None]:
# Embedding model shared by the document-embedding and query-embedding cells
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
)

  from tqdm.autonotebook import tqdm, trange


In [None]:
# Embed the text of every chunk, in the same order as chunked_docs_list
document_embeddings = embedding_model.embed_documents(
    chunked_docs_list,
)

In [None]:
# len(document_embeddings[0])

In [37]:
# Open the "chatbot" collection, creating it only when it does not exist yet.
# The client is persistent, so the collection survives between kernel sessions;
# a plain create_collection would raise on every run after the first one
# because the name is already taken.
collection = client.get_or_create_collection(name="chatbot")

In [38]:
# Store each chunk together with its embedding in the collection.
# The id of a chunk is derived deterministically (uuid5) from its position and
# text, and records are written with upsert: re-running this cell rewrites the
# same records instead of appending a second, randomly-keyed copy of every
# chunk. The position is part of the id so that two chunks with identical text
# still get distinct ids.
chunk_ids = [
    str(uuid.uuid5(uuid.NAMESPACE_URL, f"{position}:{text}"))
    for position, text in enumerate(chunked_docs_list)
]
collection.upsert(
    ids=chunk_ids,
    documents=chunked_docs_list,
    embeddings=document_embeddings,
)

In [123]:
# Question to answer; it is embedded for retrieval and also inserted into the prompt
query = "Explain Energy Minimization Network."

In [124]:
# Embed the question with the same embedding model object used for the chunks
query_embedding = embedding_model.embed_query(
    query,
)

In [97]:
# len(query_embedding)

In [125]:
# Look up the stored chunks closest to the question embedding
query_result = collection.query(
    query_embeddings=query_embedding,
)

In [99]:
# query_result['documents']

In [126]:
# Build the LLM prompt from the question and the first retrieved document
# (query_result['documents'][0][0]); the other retrieved documents are not used.
prompt = (
    f"Given the query, '{query}' and the context, '{query_result['documents'][0][0]}'. Answer the query on the basis of given context or use your intelligence."
)

In [107]:
# prompt

In [127]:
# Send the prompt to the local gemma:2b model as a single user message.
# Request options such as `stream` and `format` are keyword arguments of
# ollama.chat; written as keys of a message dict they are not applied as
# options. `stream=False` is therefore passed as a keyword argument so one
# complete response is returned. `format="json"` is deliberately not set:
# the prompt asks for a prose answer and the next cell prints the text as-is.
response = ollama.chat(
    model='gemma:2b',
    messages=[
        {
            'role': 'user',
            'content': prompt,
        }
    ],
    stream=False,
)

In [89]:
# response

In [128]:
# Extract the generated answer from the chat response and print it.
# The response is indexed directly: a json.dumps/json.loads round-trip changes
# nothing when the response is a plain dict, and raises TypeError when the
# client returns a response object that is not JSON-serializable.
content = response['message']['content']
# print() rather than a bare expression so newlines in the answer are rendered
print(content)

Sure, here's the answer:

The Energy Minimization Network (EMN) is a machine learning model that aims to learn relationships between sub-sequences of words by minimizing the energy function E. The energy function takes in a sub-sequence of words (mapped to their feature vectors) and outputs an energy value, which measures how likely it is for that sub-sequence to appear in a natural language text.

EMN utilizes a feature vector representation for the output words, which allows it to consider the semantic and syntactic similarities between different words. This is in contrast to previous models that relied solely on lexical representations.

The EMN also takes into account the conditional probabilities of the output words, represented by the vector of biases b. These biases are learned during the training process and contribute to the model's ability to accurately predict the energy function.

EMN can be viewed as a non-normalized log-probability measure, where the energy function assig