In [17]:
topics = {
    "Topic 1 Framing ML Problems": [
        "Introduction to Machine Learning in Business",
        "Translating Business Use Cases",
        "Machine Learning Approaches",
        "ML Success Metrics",
        "Responsible AI Practices"
    ],
    "Topic 2 Exploring Data and Building Data Pipelines": [
        "Data Collection and Cleaning",
        "Visualization Techniques",
        "Organizing Training Datasets",
        "TensorFlow Data Validation (TFDV)"
    ],
    "Topic 3 Feature Engineering and Processing": [
        "Feature Selection and Extraction",
        "Dimensionality Reduction",
        "Feature Engineering with BigQuery ML",
        "Handling Categorical and Numerical Data",
        "Feature Store with Vertex AI"
    ],
    "Topic 4 Model Training and Evaluation": [
        "Model Training with Vertex AI",
        "Hyperparameter Tuning with Vertex Vizier",
        "Evaluating Model Performance",
        "Managing Model Versioning",
        "Cross-validation and Regularization Techniques"
    ],
    "Topic 5 Deploying and Managing Models": [
        "Deployment with Vertex AI",
        "Batch and Online Predictions",
        "Monitoring and Managing Model Drift",
        "Model Interpretability and Explainability",
        "A/B Testing and Rollbacks"
    ],
    "Topic 6 Machine Learning Operations (MLOps)": [
        "Introduction to MLOps",
        "CI/CD for Machine Learning",
        "Data and Model Lineage",
        "ML Pipelines on Google Cloud",
        "Monitoring ML Pipelines"
    ],
    "Topic 7 Advanced ML Topics": [
        "Distributed Training",
        "Transfer Learning",
        "Federated Learning",
        "AutoML and Custom ML Models",
        "Reinforcement Learning"
    ]
}


RAG with AutoGen

In [18]:
import os
import chromadb
import json
import autogen
import streamlit as st
from autogen import AssistantAgent, UserProxyAgent
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
from autogen.agentchat.contrib.vectordb.chromadb import ChromaVectorDB
from chromadb.utils import embedding_functions

from dotenv import load_dotenv

load_dotenv()

True

In [2]:
OPENAI_API_KEY = os.getenv("PERSONAL_OAI_API_KEY")
MODEL_NAME = "gpt-3.5-turbo"

config_list = [{"model": MODEL_NAME,"api_key": OPENAI_API_KEY}]
llm_config = {"config_list": config_list, "timeout": 360}

In [4]:
CHROMA_DB_PATH="/tmp/chromadb"
CHROMA_COLLECTION="autogen-rag-chroma"

chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
collection = chroma_client.get_or_create_collection(name=CHROMA_COLLECTION)

In [5]:
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=os.getenv("OPENAI_API_KEY"),
                model_name="text-embedding-ada-002")

In [6]:
vector_db = ChromaVectorDB(path=CHROMA_DB_PATH, embedding_function=openai_ef)

In [7]:
assistant = AssistantAgent(
    name="assistant",
    system_message="You are a helpful assistant.",
    llm_config={
        "timeout": 600,
        "config_list": config_list,
    },
)



In [19]:
ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=10,
    retrieve_config={
        "task": "qa",
        "docs_path": [
            "./docs"
        ],
        "chunk_token_size": 2000,
        "model": MODEL_NAME,
        "vector_db": vector_db,
        "overwrite": False,  # set to True if you want to overwrite an existing collection
        "get_or_create": True,  # set to False if don't want to reuse an existing collection
        "collection_name": CHROMA_COLLECTION,
        "embedding_function": openai_ef,
        "context_max_tokens": 100000,
    },
    code_execution_config={"work_dir": "output", "use_docker": False}, # Takes data from a directory
)

In [20]:
# Initialize a dictionary to store the JSON output
output_json = {}

In [21]:
# Loop through each topic and subtopic to query the RAG agent
for topic, subtopics in topics.items():
    # Initialize the topic entry in the output JSON
    output_json[topic] = {}
    
    for subtopic in subtopics:
        # Define the query for each subtopic
        qa_problem = f"Provide 3 bullet points on {subtopic} in the context of {topic}"

        # Execute the retrieval using the RAG agent
        chat_result = ragproxyagent.initiate_chat(
            assistant, message=ragproxyagent.message_generator, problem=qa_problem, n_results=2
        )
        
        # Check for content in the chat history
        if chat_result.chat_history[-1]['content'] == 'UPDATE CONTEXT':
            content = None
            for entry in reversed(chat_result.chat_history[:-1]):  # Exclude the last entry
                if entry['role'] == 'user':
                    content = entry['content']
                    break
        else:
            content = chat_result.chat_history[-1]['content']
        
        # Parse the content into bullet points
        bullet_points = content.split("\n")[:3] if content else ["No data available"] * 3

        # Add the subtopic and its bullet points to the output JSON
        output_json[topic][subtopic] = bullet_points

# Check the last entry for "UPDATE CONTEXT", otherwise print the last entry
if chat_result.chat_history[-1]['content'] == 'UPDATE CONTEXT':
    for entry in reversed(chat_result.chat_history[:-1]):  # Exclude the last entry
        if entry['role'] == 'user':
            print(entry['content'])  # Output the content of the found entry
            break
else:
    print(chat_result.chat_history[-1]['content'])

2024-11-12 17:04:30,488 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - [32mUse the existing collection `autogen-rag-chroma`.[0m
2024-11-12 17:04:30,496 - autogen.agentchat.contrib.retrieve_user_proxy_agent - INFO - Found 0 chunks.[0m


Trying to create collection.
VectorDB returns doc_ids:  [['dc475658', '5e9b0715']]
[32mAdding content of doc dc475658 to context.[0m
[32mAdding content of doc 5e9b0715 to context.[0m
[33mragproxyagent[0m (to assistant):

You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the
context provided by the user.
If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.
You must give as short an answer as possible.

User's question is: Provide 3 bullet points on Introduction to Machine Learning in Business in the context of Topic 1 Framing ML Problems

Context is: Framing ML Problems Chapter  
1
Mona, Mona, and Pratap Ramamurthy. Official Google Cloud Certified Professional Machine Learning Engineer Study Guide, John Wiley & Sons, Incorporated,
         2023. ProQuest Ebook Central, http://ebookcentral.proquest.com/lib/singaporetechsg/detail.action?docID=30835935.
Created from singaporetec

In [22]:
# Output the JSON structure
output_json_str = json.dumps(output_json, indent=4)
print(output_json_str)
# Serializing json
 
# Writing to sample.json
with open("generated_content.json", "w") as outfile:
    outfile.write(output_json_str)

{
    "Topic 1 Framing ML Problems": {
        "Introduction to Machine Learning in Business": [
            "- Machine learning enables businesses to harness predictive analytics for decision-making.",
            "- Businesses can utilize machine learning for personalized recommendations and customer segmentation.",
            "- Machine learning applications in business include fraud detection, customer sentiment analysis, and demand forecasting."
        ],
        "Translating Business Use Cases": [
            "TERMINATE"
        ],
        "Machine Learning Approaches": [
            "- Supervised Learning",
            "- Unsupervised Learning",
            "- Semi-supervised Learning"
        ],
        "ML Success Metrics": [
            "1. Recall: Measures the percentage of positive data points correctly predicted.",
            "2. Precision: Quantifies the percentage of positive predictions that were actually correct.",
            "3. F1 Score: Harmonic mean of precisio