In [3]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.memory import ChatMemoryBuffer
from time import time

documents = SimpleDirectoryReader("../data").load_data()

# bge-base embedding model
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# ollama
Settings.llm = Ollama(model="llama3.1", request_timeout=360.0)

index = VectorStoreIndex.from_documents(
    documents,
)

memory = ChatMemoryBuffer.from_defaults(token_limit=1500)

chat_engine = index.as_chat_engine(
    chat_mode = "context",
    memory = memory,
    system_prompt = (
        "You are my helpful assitant"
    )
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
t0 = time()
response = chat_engine.chat("From the databricks article, what is the ratio of logged to registered models? ")
t1 = time()

print(response)
print(f"Elapsed time {t1-t0}s")

According to the article, as of January 2023, the ratio of logged to registered models is approximately 2.9:1. This means that for every 2.9 experimental models (logged), about 1 model is registered and considered a candidate for production.
Elapsed time 30.31207275390625s


In [4]:
t0 = time()
response = chat_engine.chat("What does that means?")
t1 = time()

print(response)
print(f"Elapsed time {t1-t0}s")

The article doesn't explicitly mention what it means by "logged" and "registered" models.

However, based on the context, I can make an educated guess. In the context of machine learning and data science, being "logged" might mean that a model has been created, trained, or experimented with (i.e., logged into some kind of system), but not necessarily intended for production use. On the other hand, being "registered" might imply that a model has met certain criteria, such as quality standards, and is deemed suitable for deployment in a production environment.

If I had to hazard a guess based on this context, it's possible that logged models are essentially experimental or prototype models, whereas registered models are those that have been vetted and validated for use in actual applications.
Elapsed time 36.605592250823975s


In [5]:
t0 = time()
response = chat_engine.chat("If you were to explain that to business stakeholder with limited technical knowledge how would you rephrase it?")
t1 = time()

print(response)
print(f"Elapsed time {t1-t0}s")

I'd say something like:

"In the context of this report, 'logged' models refer to experiments or prototypes that have been created and tried out. On the other hand, 'registered' models are those that have met certain standards and are ready for use in real-world applications.

Think of it like a car test drive: logged models would be like taking a car for a spin to see how it handles, while registered models would be like buying a car that's been thoroughly inspected and certified to be road-worthy."

This analogy should give business stakeholders an idea of the difference between these two concepts without requiring technical expertise!
Elapsed time 42.56543779373169s


In [15]:
memory.chat_store.store['chat_history'][1]

ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='According to the article, as of January 2023, the ratio of logged to registered models is approximately 2.9:1. This means that for every 2.9 experimental models (logged), about 1 model is registered and considered a candidate for production.', additional_kwargs={'tool_calls': []})

In [18]:
memory.to_string

<bound method ChatMemoryBuffer.to_string of ChatMemoryBuffer(chat_store=SimpleChatStore(store={'chat_history': [ChatMessage(role=<MessageRole.USER: 'user'>, content='From the databricks article, what is the ratio of logged to registered models? ', additional_kwargs={}), ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='According to the article, as of January 2023, the ratio of logged to registered models is approximately 2.9:1. This means that for every 2.9 experimental models (logged), about 1 model is registered and considered a candidate for production.', additional_kwargs={'tool_calls': []}), ChatMessage(role=<MessageRole.USER: 'user'>, content='What does that means?', additional_kwargs={}), ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='The article doesn\'t explicitly mention what it means by "logged" and "registered" models.\n\nHowever, based on the context, I can make an educated guess. In the context of machine learning and data science, being "lo

In [20]:
memory.get_all()


[ChatMessage(role=<MessageRole.USER: 'user'>, content='From the databricks article, what is the ratio of logged to registered models? ', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='According to the article, as of January 2023, the ratio of logged to registered models is approximately 2.9:1. This means that for every 2.9 experimental models (logged), about 1 model is registered and considered a candidate for production.', additional_kwargs={'tool_calls': []}),
 ChatMessage(role=<MessageRole.USER: 'user'>, content='What does that means?', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='The article doesn\'t explicitly mention what it means by "logged" and "registered" models.\n\nHowever, based on the context, I can make an educated guess. In the context of machine learning and data science, being "logged" might mean that a model has been created, trained, or experimented with (i.e., logged into some kind o

In [6]:
t0 = time()
response = chat_engine.chat("Which are the fastest growing Data and AI products?")
t1 = time()

print(response)
print(f"Elapsed time {t1-t0}s")

Based on the conversation, it appears that Superconductive is one of the emerging challenger tools in the data integration category, along with Great Expectations as a specific product, although it didn't explicitly state that Superconductive is an emerging challenger tool.
Elapsed time 343.28652358055115s


## Testing Query Engine

In [4]:
import sys
import os
import json

# Add the parent directory (my_project) to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [6]:
from llama_index.core import VectorStoreIndex, get_response_synthesizer, SimpleDirectoryReader, Settings
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.memory import ChatMemoryBuffer
from time import time


documents = SimpleDirectoryReader("../data").load_data()

# bge-base embedding model
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# ollama
Settings.llm = Ollama(model="llama3.1", request_timeout=360.0)

# build index
index = VectorStoreIndex.from_documents(documents)

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=2,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize",
)

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

# query
response = query_engine.query("What did the author do growing up?")
print(response)

  from .autonotebook import tqdm as notebook_tqdm


There is no mention of the author's childhood or upbringing in the provided context. The text appears to be a report on the state of data and AI, with discussions on language models, LLMs, and their applications. Therefore, it is not possible to answer the query about what the author did growing up based on this information.


In [14]:
for doc in response.source_nodes:
    print(doc.node.extra_info['file_path'])

c:\Users\Carlos\Desktop\RAG-chatbot\code project\notebooks\..\data\databricks-state-of-data-report-010524-v9-FINAL.pdf
c:\Users\Carlos\Desktop\RAG-chatbot\code project\notebooks\..\data\databricks-state-of-data-report-010524-v9-FINAL.pdf


In [16]:
response.source_nodes[0]

NodeWithScore(node=TextNode(id_='fc1a5aab-59cf-41ea-b174-6546d9a3f492', embedding=None, metadata={'page_label': '12', 'file_name': 'databricks-state-of-data-report-010524-v9-FINAL.pdf', 'file_path': 'c:\\Users\\Carlos\\Desktop\\RAG-chatbot\\code project\\notebooks\\..\\data\\databricks-state-of-data-report-010524-v9-FINAL.pdf', 'file_type': 'application/pdf', 'file_size': 772429, 'creation_date': '2024-09-08', 'last_modified_date': '2024-03-14'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='eadc58e2-a69f-44fd-8cbb-53b07ccf5d98', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '12', 'file_name': 'databricks-state-of-data-report-010524-v9-FINAL.pdf', 'file_path': 'c:\\Users\\Carlos\\Desktop\\

## Testing Memory Objects

In [5]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core.memory import ChatMemoryBuffer
from time import time

documents = SimpleDirectoryReader("../data").load_data()

# bge-base embedding model
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# ollama
Settings.llm = Ollama(model="llama3.1", request_timeout=360.0)

index = VectorStoreIndex.from_documents(
    documents,
)

memory = ChatMemoryBuffer.from_defaults(token_limit=1500)

chat_engine = index.as_chat_engine(
    chat_mode = "context",
    memory = memory,
    system_prompt = (
        "You are my helpful assitant"
    )
)

In [23]:
t0 = time()
response = chat_engine.chat("What articles do I have in the docstore? ")
t1 = time()

print(response)
print(f"Elapsed time {t1-t0}s")

Let me check the document at `c:\Users\Carlos\Desktop\RAG-chatbot\code project\notebooks..\data\databricks-state-of-data-report-010524-v9-FINAL.pdf` for you.

According to my analysis, the document contains two articles:

1. "STATE OF DATA + AI" (Page 1)
2. "STATE OF DATA + AI2" (Page 2)
Elapsed time 25.415090322494507s


In [25]:
print(response)

Let me check the document at `c:\Users\Carlos\Desktop\RAG-chatbot\code project\notebooks..\data\databricks-state-of-data-report-010524-v9-FINAL.pdf` for you.

According to my analysis, the document contains two articles:

1. "STATE OF DATA + AI" (Page 1)
2. "STATE OF DATA + AI2" (Page 2)


In [29]:
memory_dict = memory.to_dict()
memory_dict

{'chat_store': {'store': {'chat_history': [{'role': <MessageRole.USER: 'user'>,
     'content': 'What articles do I have in the docstore? ',
     'additional_kwargs': {}},
    {'role': <MessageRole.ASSISTANT: 'assistant'>,
     'content': 'Let me check the document at `c:\\Users\\Carlos\\Desktop\\RAG-chatbot\\code project\\notebooks..\\data\\databricks-state-of-data-report-010524-v9-FINAL.pdf` for you.\n\nAccording to my analysis, the document contains two articles:\n\n1. "STATE OF DATA + AI" (Page 1)\n2. "STATE OF DATA + AI2" (Page 2)',
     'additional_kwargs': {'tool_calls': []}}]},
  'class_name': 'SimpleChatStore'},
 'chat_store_key': 'chat_history',
 'token_limit': 1500,
 'class_name': 'ChatMemoryBuffer'}

In [36]:
memory_dict['chat_store']['store']['chat_history'][-1]['content']

'Let me check the document at `c:\\Users\\Carlos\\Desktop\\RAG-chatbot\\code project\\notebooks..\\data\\databricks-state-of-data-report-010524-v9-FINAL.pdf` for you.\n\nAccording to my analysis, the document contains two articles:\n\n1. "STATE OF DATA + AI" (Page 1)\n2. "STATE OF DATA + AI2" (Page 2)'

In [28]:
new_memory = ChatMemoryBuffer.from_dict(memory_dict)

In [40]:
t0 = time()
streaming_response = chat_engine.stream_chat("Best AI practices")
for token in streaming_response.response_gen:
    print(token, end="")
t1 = time()

print(f"Elapsed time {t1-t0}s")

Based on my knowledge and the context of the document, here are some best AI practices that might be relevant:

1. **Use specialized libraries**: When working with NLP tasks, use specialized libraries like NLTK, Transformers, or FuzzyWuzzy to get the most out of your data.
2. **Leverage domain expertise**: Apply AI and ML techniques to specific industries like Retail and CPG for time series forecasting, or utilize domain-specific knowledge when building chatbots.
3. **Monitor and iterate**: Continuously monitor the performance of your AI models and iteratively improve them based on feedback from users.
4. **Transparency and explainability**: Provide transparent and explainable results to build trust with stakeholders and users.
5. **Human oversight and review**: Implement human oversight and review processes for critical decisions made by AI systems.

Please note that these best practices are general guidelines, and the specific context of your project might require additional consider

In [41]:
for message in memory.get_all():
    print(f"Role: {message.role}, Content: {message.content}")

Role: user, Content: What articles do I have in the docstore? 
Role: assistant, Content: Let me check the document at `c:\Users\Carlos\Desktop\RAG-chatbot\code project\notebooks..\data\databricks-state-of-data-report-010524-v9-FINAL.pdf` for you.

According to my analysis, the document contains two articles:

1. "STATE OF DATA + AI" (Page 1)
2. "STATE OF DATA + AI2" (Page 2)
Role: user, Content: Best AI practices
Role: assistant, Content: Based on the document at `c:\Users\Carlos\Desktop\RAG-chatbot\code project\notebooks..\data\databricks-state-of-data-report-010524-v9-FINAL.pdf`, I found that one of the most popular data science use cases is Natural Language Processing (NLP), which accounted for 49% of all libraries used.

Some best AI practices mentioned in the document include:

* Using specialized Python libraries such as NLTK, Transformers, and FuzzyWuzzy to leverage NLP capabilities
* Applying NLP to use cases like chatbots, research assistance, fraud detection, content generati

In [58]:
from llama_index.core.llms import ChatMessage

history =[]
for message in memory.get_all():
    history.append({'role':message.role.value,'content':message.content})

In [59]:
history

[{'role': 'user', 'content': 'What articles do I have in the docstore? '},
 {'role': 'assistant',
  'content': 'Let me check the document at `c:\\Users\\Carlos\\Desktop\\RAG-chatbot\\code project\\notebooks..\\data\\databricks-state-of-data-report-010524-v9-FINAL.pdf` for you.\n\nAccording to my analysis, the document contains two articles:\n\n1. "STATE OF DATA + AI" (Page 1)\n2. "STATE OF DATA + AI2" (Page 2)'},
 {'role': 'user', 'content': 'Best AI practices'},
 {'role': 'assistant',
  'content': 'Based on the document at `c:\\Users\\Carlos\\Desktop\\RAG-chatbot\\code project\\notebooks..\\data\\databricks-state-of-data-report-010524-v9-FINAL.pdf`, I found that one of the most popular data science use cases is Natural Language Processing (NLP), which accounted for 49% of all libraries used.\n\nSome best AI practices mentioned in the document include:\n\n* Using specialized Python libraries such as NLTK, Transformers, and FuzzyWuzzy to leverage NLP capabilities\n* Applying NLP to use 

In [57]:
print(message.role.value)

assistant


In [60]:
new_mem = ChatMemoryBuffer.from_defaults(token_limit=1500)
for message in history:
    buff = ChatMessage(role=message['role'],content=message['content'])
    new_mem.put(buff)
new_mem


ChatMemoryBuffer(chat_store=SimpleChatStore(store={'chat_history': [ChatMessage(role=<MessageRole.USER: 'user'>, content='What articles do I have in the docstore? ', additional_kwargs={}), ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='Let me check the document at `c:\\Users\\Carlos\\Desktop\\RAG-chatbot\\code project\\notebooks..\\data\\databricks-state-of-data-report-010524-v9-FINAL.pdf` for you.\n\nAccording to my analysis, the document contains two articles:\n\n1. "STATE OF DATA + AI" (Page 1)\n2. "STATE OF DATA + AI2" (Page 2)', additional_kwargs={}), ChatMessage(role=<MessageRole.USER: 'user'>, content='Best AI practices', additional_kwargs={}), ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='Based on the document at `c:\\Users\\Carlos\\Desktop\\RAG-chatbot\\code project\\notebooks..\\data\\databricks-state-of-data-report-010524-v9-FINAL.pdf`, I found that one of the most popular data science use cases is Natural Language Processing (NLP), which a

In [61]:
new_mem.get_all()

[ChatMessage(role=<MessageRole.USER: 'user'>, content='What articles do I have in the docstore? ', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='Let me check the document at `c:\\Users\\Carlos\\Desktop\\RAG-chatbot\\code project\\notebooks..\\data\\databricks-state-of-data-report-010524-v9-FINAL.pdf` for you.\n\nAccording to my analysis, the document contains two articles:\n\n1. "STATE OF DATA + AI" (Page 1)\n2. "STATE OF DATA + AI2" (Page 2)', additional_kwargs={}),
 ChatMessage(role=<MessageRole.USER: 'user'>, content='Best AI practices', additional_kwargs={}),
 ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content='Based on the document at `c:\\Users\\Carlos\\Desktop\\RAG-chatbot\\code project\\notebooks..\\data\\databricks-state-of-data-report-010524-v9-FINAL.pdf`, I found that one of the most popular data science use cases is Natural Language Processing (NLP), which accounted for 49% of all libraries used.\n\nSome best AI practice

In [6]:
import os

documents = SimpleDirectoryReader("../data").load_data()

# bge-base embedding model
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

# ollama
Settings.llm = Ollama(model="llama3.1", request_timeout=360.0)

index = VectorStoreIndex.from_documents(
    documents,
)

memory = ChatMemoryBuffer.from_defaults(token_limit=1500)

chat_engine = index.as_chat_engine(
    chat_mode = "context",
    memory = memory,
    system_prompt = (
        "You are my helpful assitant"
    )
)

stream = chat_engine.stream_chat("Which AI is preffered in the paper")

for token in stream.response_gen:
    print(token, end="")


def get_relative_source_file_paths(response):
    cwd = os.getcwd()
    source_nodes = response.source_nodes
    file_paths = set()
    for node in source_nodes:
        if 'file_path' in node.metadata:
            # Convert absolute path to relative path
            rel_path = os.path.relpath(node.metadata['file_path'], cwd)
            file_paths.add(rel_path)
    return list(file_paths)

source_files = get_relative_source_file_paths(stream)
if source_files:
    print("\nSource documents:")
    for file in source_files:
        print(f"- {file}")



Based on the provided context, it appears that the paper is discussing a "State of Data" report, but it doesn't specify which type of AI (e.g. Machine Learning, Deep Learning, Natural Language Processing, etc.) is preferred.

However, I can suggest that you look for keywords such as "Machine Learning", "Deep Learning", "NLP", or "Computer Vision" in the paper to get a better understanding of the types of AI mentioned. If you provide me with more context or information about the content of the paper, I'd be happy to try and assist you further!
Source documents:
- ..\data\databricks-state-of-data-report-010524-v9-FINAL.pdf


In [79]:

fp = documents[0].metadata['file_path']
cwd = os.getcwd()
rel_path = os.path.relpath(fp, cwd)
rel_path

'..\\data\\databricks-state-of-data-report-010524-v9-FINAL.pdf'

In [80]:
for doc in documents:
    fp = doc.metadata['file_path']
    doc.metadata['file_path'] = os.path.relpath(fp, cwd)

In [81]:
documents

[Document(id_='ede2e5c7-12e0-475a-a8ca-a1da05e522b8', embedding=None, metadata={'page_label': '1', 'file_name': 'databricks-state-of-data-report-010524-v9-FINAL.pdf', 'file_path': '..\\data\\databricks-state-of-data-report-010524-v9-FINAL.pdf', 'file_type': 'application/pdf', 'file_size': 772429, 'creation_date': '2024-09-08', 'last_modified_date': '2024-03-14'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='STATE OF DATA + AI1\n State of  \nData + AI', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 Document(id_='de5a08d1-24e9-43ed-9346-49b0dfa4f766', embedding=None, metadata={'page_label': '2', 'file_name': 'databricks-state-of-d

In [84]:
documents = SimpleDirectoryReader("../data").load_data()


for doc in documents:
    fp = doc.metadata['file_path']
    doc.metadata['file_path'] = os.path.relpath(fp, cwd)

index = VectorStoreIndex.from_documents(
    documents,
)

memory = ChatMemoryBuffer.from_defaults(token_limit=1500)

chat_engine = index.as_chat_engine(
    chat_mode = "context",
    memory = memory,
    system_prompt = (
        "You are my helpful assitant"
    )
)

stream = chat_engine.stream_chat("Best aproach to handle ETL pipelines according to the paper")

for token in stream.response_gen:
    print(token, end="")

Based on the provided context, it seems that the paper recommends a data lakehouse approach for handling ETL (Extract, Transform, Load) pipelines.

Here are some key takeaways:

1. **Migrating to a Lakehouse**: The paper suggests that companies are increasingly moving away from traditional data warehouses and migrating to a lakehouse platform, which supports advanced use cases and DS/ML.
2. **Unified Data Platform**: The report mentions the importance of unifying data platforms to reduce costs, suggesting that a lakehouse approach can help streamline ETL pipelines by consolidating multiple data sources into one platform.
3. **Data Integration**: The paper highlights the growth in demand for data integration products, which enable companies to integrate vast amounts of upstream and downstream data in one consolidated view.

Considering these points, I would recommend the following best practices for handling ETL pipelines:

1. **Migrate to a Lakehouse Platform**: Consider moving your ET

In [8]:
documents = SimpleDirectoryReader("../data").load_data()
cwd = cwd = os.getcwd()

for doc in documents:
    fp = doc.metadata['file_path']
    doc.metadata['file_path'] = os.path.relpath(fp, cwd)

index = VectorStoreIndex.from_documents(
    documents,
)

memory = ChatMemoryBuffer.from_defaults(token_limit=1500)

chat_engine = index.as_chat_engine(
    chat_mode = "context",
    memory = memory,
    system_prompt = (
        "You are my helpful assitant"
    )
)

stream = chat_engine.stream_chat("Retrieve the filepath")

for token in stream.response_gen:
    print(token, end="")

The file path is: ..\data\databricks-state-of-data-report-010524-v9-FINAL.pdf

## test model

In [1]:
import sys
import os
import json

# Add the parent directory (my_project) to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from models import *
slm = SLM(model_name='llama3.1')
conversation = [{'role': 'user', 'content': "Hello"}]
try:
    stream = slm.chat(chat_history=conversation, data_folder=os.getcwd(),is_stream=True)
except Exception as e:
    print(e)

print(slm.create_title("Hello"))
out=""
for chunk in stream.response_gen:
    print(chunk, end="")
    out+= chunk

  from .autonotebook import tqdm as notebook_tqdm


No cuda found, falling back to cpu

        Context:

        User input: Hello
        
"HELLO"
Based on the context, I can help you with a few things.

Since we have a conversation ID created by `ss.create_conversation('test', chat_history)`, it seems like we're working with a conversation API. 

If you'd like to continue the conversation and add another message to the chat history, you could do something like this:

```python
new_message = {'role': 'user', 'content': "How are you?"}
ss.add_message(conversation_id, new_message)
```

This would add a new message to the conversation with the role of 'user' and content 'How are you?'