In [25]:
import os
from dotenv import load_dotenv
load_dotenv()
OPEN_API_KEY = os.getenv("OPENAI_API_KEY")
#MODEL = "gpt-3.5-turbo"
MODEL = "llama2"

In [26]:
from langchain_openai.chat_models import ChatOpenAI
from langchain_community.llms import Ollama 
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.embeddings import OllamaEmbeddings


if MODEL.startswith('gpt'):
    model= ChatOpenAI(openai_api_key=OPENAI_API_KEY, model=MODEL)
    embeddings = OpenAIEmbeddings()

else:
    model= Ollama(model=MODEL)
    embeddings = OllamaEmbeddings()
model.invoke("Tell me a joke")


"\nWhy don't scientists trust atoms? Because they make up everything! üòÇ"

In [27]:
from langchain_core.output_parsers import StrOutputParser

parser = StrOutputParser()

chain = model | parser
chain.invoke("What MLB team won the World Series during the COVID-19 pandemic?")

'During the COVID-19 pandemic, which started in 2020 and lasted for several months, there was no Major League Baseball (MLB) World Series played. The MLB season was cancelled in March 2020 due to the pandemic, and the World Series was not held that year.\n\nThe COVID-19 pandemic had a significant impact on sports events worldwide, including MLB, as many countries implemented lockdowns and travel restrictions to slow the spread of the virus. As a result, many sporting events were cancelled or postponed, including the MLB World Series.\n\nIn 2021, the MLB season resumed with a shortened schedule and reduced attendance at games due to ongoing pandemic concerns. The Los Angeles Dodgers went on to win the World Series that year, defeating the Tampa Bay Rays in the Fall Classic.'

In [28]:
#PDF loading for LLM to answer based on the document.
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("Azure_DataBricks and Microsoft Fabric.pdf")
pages = loader.load_and_split()
pages

[Document(metadata={'source': 'Azure_DataBricks and Microsoft Fabric.pdf', 'page': 0}, page_content='Synergy in Analytics: \nUnifying Azure Databricks \nand Microsoft Fabric'),
 Document(metadata={'source': 'Azure_DataBricks and Microsoft Fabric.pdf', 'page': 1}, page_content='Synergy in Analytics: Unifying Azure Databricks and Microsoft Fabric  2\nSynergy in Analytics: \nUnifying Azure Databricks \nand Microsoft Fabric\n3 / \nEmpower modern data analytics with \nAzure Databricks and Microsoft Fabric \n4 / \nSimplify analytics workloads with Azure \nDatabricks and Microsoft Fabric\n10 / \nMedallion architecture in Azure Databricks \nand Microsoft Fabric\n15 / \nUse lakehouse data with Azure Databricks \nand Microsoft Fabric\n22 / \nBetter together: Azure Databricks, Unity \nCatalogue and Microsoft Fabric Purview 26 / \nData Factory and Azure Databricks activity \nin Microsoft Fabric\n28 / \nEnhance organisational capabilities with \ngenerative AI\n33 / \nExplore real-world use cases wi

In [29]:
from langchain.prompts import ChatPromptTemplate

template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt.format(context="Mary's sister is Susana", question="Who is Mary's sister?")

'Human: \nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Mary\'s sister is Susana\n\nQuestion: Who is Mary\'s sister?\n'

In [30]:
chain = prompt | model | parser
chain.invoke({
    "context": "Mary's sister is Susana",
    "question": "Who is Mary's sister?"
})

"Of course! Based on the context provided, Mary's sister is Susana."

In [31]:
chain.input_schema.schema()

{'title': 'PromptInput',
 'type': 'object',
 'properties': {'context': {'title': 'Context', 'type': 'string'},
  'question': {'title': 'Question', 'type': 'string'}},
 'required': ['context', 'question']}

In [32]:
from langchain_community.vectorstores import DocArrayInMemorySearch
vectorstore = DocArrayInMemorySearch.from_documents(pages, 
    embedding=embeddings
    )


In [33]:
retriver = vectorstore.as_retriever()
retriver.invoke("databricks")

[Document(metadata={'source': 'Azure_DataBricks and Microsoft Fabric.pdf', 'page': 37}, page_content='Synergy in Analytics: Unifying Azure Databricks and Microsoft Fabric  38\nCreating and modifying a Delta table from Parquet in Azure \nDatabricks with changes reflected in Fabric\nAzure Databricks and Fabric provide a data lakehouse environment that allows businesses to \naccess and analyse their data simultaneously, using different tools. This supports a wide range of \ndata processing activities on the same set of data, making it easier for organisations to manage \nand derive insights from their information efficiently.\n1. Open your Azure Databricks workspace in a browser of your choice and launch a new Azure \nDatabricks notebook.\nFigure 7: Example notebook'),
 Document(metadata={'source': 'Azure_DataBricks and Microsoft Fabric.pdf', 'page': 18}, page_content='Synergy in Analytics: Unifying Azure Databricks and Microsoft Fabric  19\nIntegrate Azure Databricks \nwith Power BI to e

In [34]:
from operator import itemgetter

chain = (
    {
        "context": itemgetter("question") | retriver, 
        "question": itemgetter("question") 
    }
    | prompt 
    | model 
    | parser 
) 

chain.invoke({"question": "what is databricks"})

'Based on the context provided in the document, "Databricks" appears to be a tool that enables users to create and manage data pipelines within a cloud environment. It integrates the capabilities of Azure Data Factory into a more unified and robust framework, allowing users to easily create and manage data pipelines directly within Microsoft Fabric. Additionally, Databricks provides a notebook-style interface for crafting Apache Spark jobs and conducting machine learning experiments, as well as offering advanced visualizations and Markdown text integration.'

In [36]:
questions = ["what is databricks?",
             "How databricks makes the processing fast?",
             "explain little bit more about spark"]
for question in questions:
    print(f"Questions: {question}")
    print(f"Answer: {chain.invoke({'question': question})}")
    print()

Questions: what is databricks?
Answer:  Based on the provided context, Databricks is a platform that combines Azure Databricks and Microsoft Fabric, providing a unified solution for data management and analytics. It allows users to seamlessly switch between different platforms, offering a cohesive and powerful solution for data management and AI projects.

Databricks offers centralized data storage through OneLake, which simplifies data access and governance, enabling Azure Databricks to directly utilize the data for analytical processes. The integration between Azure Databricks and Data Factory in Fabric facilitates streamlined workflows from data ingestion and validation to transformation, supporting data analytics, data science, and AI projects. Additionally, Premium Azure Databricks workspaces support credential passthrough, strengthening the security and ease of access to centralized data for further processing and analysis.

Questions: How databricks makes the processing fast?
An

In [37]:
for s in chain.stream({"question": "What is databricks"}):
    print(s, end="",flush=True)

Based on the context provided in the document, "Databricks" appears to be a product or service that offers a data lakehouse environment for businesses to access and analyze their data simultaneously using different tools. The document mentions that Azure Databricks and Microsoft Fabric provide a data lakehouse environment that allows organizations to manage and derive insights from their information efficiently, and that the integration of Azure Databricks with Power BI enhances data workflows by providing more secure and interactive data visualization experiences directly from the data lake. Additionally, the document mentions that Data Factory and Azure Databricks activity in Microsoft Fabric represents a significant evolution in data processing within cloud environments, integrating the extensive capabilities of Azure Data Factory into a more unified and robust framework.

Therefore, based on the context provided, "Databricks" appears to be a platform or service that enables organiz

In [38]:
chain.batch([{"question": q} for q in question])

["I don't know the answer to your question. The provided document does not contain enough information to answer your query. Could you please provide more context or clarify what you are asking?",
 "I don't know the answer to your question based on the provided document. The document provides information about the integration of Azure Databricks and Microsoft Fabric, but it doesn't contain any specific questions or answers related to your query. If you have any more context or information about the question you are asking, please let me know and I'll do my best to help.",
 'Based on the context provided in the document, the answer to the question "How does Azure Databricks help businesses simplify their data architecture and reduce infrastructure complexity?" is:\n\nAzure Databricks helps businesses simplify their data architecture and reduce infrastructure complexity by providing a unified storage layer that supports a wide range of data formats and structures, significantly boosting t