In [4]:
from langchain_qdrant import QdrantVectorStore
from langchain_ollama import OllamaEmbeddings
from langchain_qdrant import FastEmbedSparse, RetrievalMode
from langchain_groq import ChatGroq
from langchain.retrievers.document_compressors import FlashrankRerank

from mlops_agents.agent import MlOpsAgent
from dotenv import load_dotenv
import os

In [5]:
# --- Configuration -----------------------------------------------------------
# Load variables from a local .env file, then read the credentials/endpoints
# this notebook needs. Each value is None when the variable is not set.
load_dotenv()
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")

# --- Embedding models --------------------------------------------------------
# Dense embeddings via Ollama, sparse (BM25) embeddings via FastEmbed; both are
# handed to the vector store below for hybrid retrieval.
embeddings = OllamaEmbeddings(model="nomic-embed-text:v1.5")
sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

# --- Vector store ------------------------------------------------------------
# Attach to the already-populated "mlops_document" collection in HYBRID mode.
qdrant = QdrantVectorStore.from_existing_collection(
    collection_name="mlops_document",
    retrieval_mode=RetrievalMode.HYBRID,
    embedding=embeddings,
    sparse_embedding=sparse_embeddings,
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    prefer_grpc=True,
)

# Retriever that returns the top 20 hits per query.
# NOTE(review): the name says "rerank" but no reranker is attached in this cell;
# presumably reranking happens inside the agent — confirm.
hybrid_rerank_qdrant_retriever = qdrant.as_retriever(
    search_kwargs=dict(k=20),
    search_type="similarity",
)

# --- Chat model --------------------------------------------------------------
# temperature=0 is passed to the Groq-hosted chat model.
groq_llama3_1_70b = ChatGroq(
    api_key=GROQ_API_KEY,
    temperature=0,
    model="llama-3.1-70b-versatile",
)

Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

In [2]:
# Instantiate the agent imported from mlops_agents.agent. It is used below as a
# callable that takes a graph-state dict and returns the resulting state.
mlops_agent = MlOpsAgent()

In [6]:
# Run one question through the agent and print only the generated answer.
question = "What is data leakage ?"

# Initial graph state: the question, string placeholders for the fields the
# agent is expected to fill in, and the retriever / LLM objects it should use.
graph_state = dict(
    question=question,
    rag_answer="None",
    supervisor_route_choice="None",
    hybrid_rerank_qdrant_retriever=hybrid_rerank_qdrant_retriever,
    groq_llama3_1_70b=groq_llama3_1_70b,
)

result = mlops_agent(graph_state)
print(result["rag_answer"])

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"


**What is Data Leakage?**

Data leakage refers to the phenomenon where a form of the label or target variable "leaks" into the set of features used for making predictions, and this same information is not available during inference. This can cause machine learning models to fail in unexpected and spectacular ways, even after extensive evaluation and testing.

**Examples of Data Leakage**

1. **Predicting COVID-19 risks from medical scans**: A model was trained on a mix of scans taken when patients were lying down and standing up. The model learned to predict serious COVID-19 risk from a person's position, which is not a relevant feature for making predictions.
2. **Predicting COVID-19 risks from text font**: A model was trained on scans labeled with different fonts from various hospitals. The model learned to predict COVID-19 risk from the font used, which is not a relevant feature for making predictions.
3. **Predicting lung cancer from CT scans**: A model was trained on CT scans from

In [7]:
# Show the complete state returned by the agent, not just the answer text.
result

{'messages': [],
 'question': 'What is data leakage ?',
 'rag_answer': '**What is Data Leakage?**\n\nData leakage refers to the phenomenon where a form of the label or target variable "leaks" into the set of features used for making predictions, and this same information is not available during inference. This can cause machine learning models to fail in unexpected and spectacular ways, even after extensive evaluation and testing.\n\n**Examples of Data Leakage**\n\n1. **Predicting COVID-19 risks from medical scans**: A model was trained on a mix of scans taken when patients were lying down and standing up. The model learned to predict serious COVID-19 risk from a person\'s position, which is not a relevant feature for making predictions.\n2. **Predicting COVID-19 risks from text font**: A model was trained on scans labeled with different fonts from various hospitals. The model learned to predict COVID-19 risk from the font used, which is not a relevant feature for making predictions.\n

# 2. Test FastAPI

In [8]:
import requests

# Smoke-test the FastAPI endpoint: POST one question and keep the parsed JSON
# reply in `response` for the following cell.
user_query = "What is data distribution drift?"

url = "http://127.0.0.1:8000/chatResponse"  # Replace with your server URL if deployed

# Define the JSON payload
payload = {
    "user_query": user_query
}

# Make the POST request.
# - timeout: without it, requests waits indefinitely if the server stalls.
#   The value is generous because the endpoint has to wait on an LLM call.
# - raise_for_status(): surfaces 4xx/5xx as requests.HTTPError here instead of
#   a confusing KeyError later when the error body lacks the expected field.
http_response = requests.post(url, json=payload, timeout=120)
http_response.raise_for_status()
response = http_response.json()

In [10]:
# Print the answer text stored under the "response" key of the API's JSON reply.
print(response['response'])

Data distribution drift refers to a change in the underlying distribution of the data, which can affect the performance of a machine learning model. This can occur in various forms, including:

1. **Covariate shift**: When the probability density of the input (P(X)) changes, but the conditional probability of the output given the input (P(Y|X)) remains the same.
2. **Label shift**: When the probability density of the output (P(Y)) changes, but the conditional probability of the input given the output (P(X|Y)) remains the same.
3. **Concept drift**: When the conditional probability of the output given the input (P(Y|X)) changes, but the probability density of the input (P(X)) remains the same.

Data distribution drift can be detected by monitoring the model's performance metrics, such as accuracy, F1 score, recall, and AUC-ROC, in production. However, when ground truth labels are unavailable or delayed, other distributions of interest can be monitored, including the input distribution (