In [None]:
from IPython import get_ipython
from IPython.display import display
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
%%capture
!pip install -qU torch==2.3.1 \
transformers==4.41.2 \
accelerate==0.31.0 \
pycaret \
ipywidgets \
transitions

In [None]:
# Import necessary libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import traceback

import os
from pathlib import Path
from pydantic import BaseModel, validator, HttpUrl, constr, ValidationError
from typing import Union

import requests
from bs4 import BeautifulSoup

import pandas as pd

# Set random seed for reproducibility
torch.random.manual_seed(0)

<torch._C.Generator at 0x7c66a010f0b0>

In [None]:
class LanguageModel:
    """Thin wrapper that turns a chat history into generated text.

    The model and tokenizer are loaded once in ``__init__``. The
    ``text-generation`` pipeline is created lazily on the first call to
    ``generate_text`` and reused afterwards instead of being rebuilt per call.
    """

    def __init__(self, model_name):
        """Load the causal language model and tokenizer for ``model_name``.

        Args:
            model_name: Identifier passed to both ``from_pretrained`` calls.
        """
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype="auto",
            # Lets custom modelling code shipped with the checkpoint run locally.
            trust_remote_code=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Set to evaluation mode
        self.model.eval()
        # Built on first use by _get_pipeline().
        self._pipe = None

    def _get_pipeline(self):
        """Return the cached text-generation pipeline, creating it if needed."""
        # getattr keeps this safe for instances created without __init__.
        pipe = getattr(self, "_pipe", None)
        if pipe is None:
            pipe = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
            )
            self._pipe = pipe
        return pipe

    def generate_text(self, chat_history, generation_args):
        """Generate a reply for ``chat_history``.

        Args:
            chat_history: Input handed to the pipeline as-is (callers in this
                notebook pass a list of ``{"role", "content"}`` dicts).
            generation_args: Keyword arguments forwarded to the pipeline call.

        Returns:
            The ``generated_text`` of the first result with surrounding
            whitespace stripped.
        """
        pipe = self._get_pipeline()
        with torch.no_grad():
            output = pipe(chat_history, **generation_args)
        return output[0]['generated_text'].strip()

In [None]:
class DatasetLocationModel(BaseModel):
    """Validate that a dataset location is a CSV/Parquet file path or URL.

    A location is accepted when it is either an existing local path or a
    parseable URL, and its text ends with ``.csv`` or ``.parquet``
    (case-insensitive).
    """

    location: Union[HttpUrl, constr(strip_whitespace=True)]

    @validator('location', pre=True)
    def check_location(cls, v):
        """Reject anything that is not an existing path or URL to a CSV/Parquet file.

        Raises:
            ValueError: If ``v`` is not a string, is neither an existing local
                path nor a valid URL, or does not end in a supported extension.
        """
        # The helpers below assume text; fail as a validation error rather than
        # letting a TypeError/AttributeError escape from them.
        if not isinstance(v, str):
            raise ValueError('The provided location is not a valid URL or local path')

        if cls.is_valid_local_path(v):
            if cls.has_valid_extension(v):
                return v
            raise ValueError('The local path does not point to a CSV or Parquet file')

        if cls.is_valid_url(v):
            if cls.has_valid_extension(v):
                return v
            raise ValueError('The URL does not point to a CSV or Parquet file')

        raise ValueError('The provided location is not a valid URL or local path')

    @staticmethod
    def is_valid_url(url: str) -> bool:
        """Return True when ``HttpUrl`` accepts ``url``, False otherwise."""
        try:
            HttpUrl(url=url)
            return True
        except (ValidationError, ValueError, TypeError):
            # Any construction failure means "not a usable URL" for our purposes.
            return False

    @staticmethod
    def is_valid_local_path(path: str) -> bool:
        """Return True when ``path`` exists on the local filesystem."""
        try:
            return Path(path).exists()
        except (OSError, ValueError, TypeError):
            # e.g. a long free-text model reply ("file name too long") or an
            # embedded NUL byte: not a path we can use, so not valid.
            return False

    @staticmethod
    def has_valid_extension(path: str) -> bool:
        """Return True when ``path`` ends with ``.csv`` or ``.parquet`` (any case)."""
        valid_extensions = ('.csv', '.parquet')
        return path.lower().endswith(valid_extensions)

    @classmethod
    def validate_location(cls, location: str) -> bool:
        """Return True when ``location`` passes model validation, False otherwise."""
        try:
            cls(location=location)
            return True
        except ValidationError:
            return False

In [None]:
class Conversation:
    """Collect a dataset location, ML task and target column from free-text input.

    Each user message is sent to the language model with one extraction prompt
    per missing entity. Extracted values are validated before being kept:
    the dataset location through ``DatasetLocationModel``, the task against
    ``supported_ml_tasks`` and the target column against the dataset's columns.
    """

    _SYSTEM_PROMPT = "You are a helpful, and accurate, AI assistant. Always follow the instructions provided by user"
    _DATASET_INSTRUCTION = "If the context contains a url to a csv or parquet dataset, return the full url as response, otherwise only ouput one word False"
    _TASK_INSTRUCTION = "Identify if the context mentions a machine learning task on the target column in the dataset if yes then return the machine learning task as response, like regression or classification or clustering; otherwise only ouput one word False"
    _TARGET_INSTRUCTION = "Identify if the context mentions a target column to be used for the machine leraning problem, if yes then return the target column  as response, otherwise only ouput one word False"

    def __init__(self, language_model, generation_args, max_retries=5):
        """Store the model, generation settings and retry budget.

        Args:
            language_model: Object exposing ``generate_text(prompt, generation_args)``.
            generation_args: Passed unchanged to every ``generate_text`` call.
            max_retries: Maximum number of user messages ``chat`` will read.
        """
        self.lm = language_model
        self.generation_args = generation_args
        self.max_retries = max_retries
        self.dataset_url = None
        self.machine_learning_task = None
        self.target_column = None
        self.supported_ml_tasks = ['classification', 'regression', 'clustering']
        # Column index per dataset location, so the file is read once per location.
        self._columns_by_url = {}

    def _ask(self, user_input, instruction):
        """Send one extraction prompt to the language model and return its reply."""
        prompt = [
            {"role": "system", "content": self._SYSTEM_PROMPT},
            {"role": "user", "content": f"Given the context: {user_input}, {instruction}"},
        ]
        return self.lm.generate_text(prompt, self.generation_args)

    @staticmethod
    def _is_negative(reply):
        """Return True for an empty reply or the sentinel word ``False``."""
        if not reply:
            return True
        return reply.strip().rstrip(".").lower() == "false"

    def _dataset_columns(self):
        """Return the column index of the current dataset, reading it at most once."""
        # setdefault keeps this safe for instances created without __init__.
        cache = self.__dict__.setdefault("_columns_by_url", {})
        location = self.dataset_url
        if location not in cache:
            # Lower-case the check so it agrees with the case-insensitive
            # extension validation in DatasetLocationModel.
            if location.lower().endswith(".csv"):
                preview = pd.read_csv(location, nrows=10)
            else:
                preview = pd.read_parquet(location).head(10)
            cache[location] = preview.columns
        return cache[location]

    def extract_entities(self, user_input):
        """Given user input, extract the dataset URL, machine learning task, and target column.

        Only entities that are still missing are requested from the model.
        Invalid values are reset to ``None`` so they are asked for again on
        the next message. Always returns ``None``; results are stored on the
        instance.
        """
        if not self.dataset_url:
            candidate = self._ask(user_input, self._DATASET_INSTRUCTION)
            # Check if the URL is valid
            self.dataset_url = candidate if DatasetLocationModel.validate_location(candidate) else None

        if not self.machine_learning_task:
            candidate = self._ask(user_input, self._TASK_INSTRUCTION)
            # Check if the machine_learning_task is valid
            is_supported = bool(candidate) and candidate.lower() in self.supported_ml_tasks
            self.machine_learning_task = candidate if is_supported else None

        if not self.target_column:
            candidate = self._ask(user_input, self._TARGET_INSTRUCTION)
            # The prompt asks for the word "False" when nothing was found;
            # never store that sentinel as a column name.
            self.target_column = None if self._is_negative(candidate) else candidate

        # Check if the target_column is valid (only possible once a dataset is known).
        if self.dataset_url and self.target_column:
            try:
                columns = self._dataset_columns()
            except Exception as exc:
                # An unreadable dataset is unusable: report it and ask for the
                # location again instead of crashing the conversation.
                print(f"Could not read the dataset at {self.dataset_url}: {exc}")
                self.dataset_url = None
            else:
                if self.target_column not in columns:
                    self.target_column = None

        return None

    def is_chat_successful(self):
        """Return True once dataset location, task and target column are all set."""
        return bool(self.dataset_url and self.machine_learning_task and self.target_column)

    def chat(self):
        """Read user messages until all entities are known or retries run out.

        After every message the current status of each entity is printed.
        A failure message is printed only when the retry budget is exhausted
        and something is still missing. Always returns ``None``.
        """
        retries = 0
        while retries < self.max_retries and not self.is_chat_successful():
            user_input = input("")
            self.extract_entities(user_input)
            if self.dataset_url:
                print("Dataset URL:", self.dataset_url)
            else:
                print("Dataset location invalid try again")
            if self.machine_learning_task:
                print("Machine Learning Task:", self.machine_learning_task)
            else:
                print("Please choose machine task from the following: ", self.supported_ml_tasks)
            if self.target_column:
                print("Target:", self.target_column)
            else:
                print("Target columnn not found in the dataset.")

            retries += 1
            if retries == self.max_retries and not self.is_chat_successful():
                print("Failed to extract entities after multiple retries.")

        return None

## Test Cases:
# 1. The dataset I want to use is Titanic, and the column to classify on is Survived
# 2. I don't know where the data is
# 3. You can find data here: https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv
# 4. I'd like you to apply regression on the target column
# 5. Machine Learning Task: Clustering

In [None]:
from enum import Enum, auto
from transitions import Machine, State
import logging

class NodeState(Enum):
    """Status values stored in ``Node.state``.

    Nodes start in ``INITIAL``; ``CollectInputsNode`` moves through
    ``RUNNING`` to ``SUCCESS`` or ``FAILED``. ``ERROR`` and ``RETRY`` are
    declared but not assigned anywhere in the visible notebook code.
    """
    INITIAL = "initial"
    RUNNING = "running"
    SUCCESS = "success"
    ERROR = "error"
    FAILED = "failed"
    RETRY = "retry"

# # Define the states for the workflow
# class NodeState(Enum):
#     COLLECTING_INPUTS = auto()
#     GENERATING_CODE = auto()
#     EXECUTING_CODE = auto()
#     FIXING_ERRORS = auto()
#     FINISHED = auto()
#     MAX_RETRIES_REACHED = auto()

# Shared context for passing data between nodes
class WorkflowContext:
    """Mutable bag of data shared by every node of a workflow.

    Attributes:
        lm: Language model handed in by the caller.
        library_doc: Documentation text handed in by the caller.
        inputs, code, fixed_code, execution_success, errors: Slots filled in
            by nodes as the workflow progresses; all start as ``None``.
    """

    def __init__(self, lm, documentation):
        # Dependencies supplied by the caller.
        self.lm = lm
        self.library_doc = documentation
        # Results produced by the nodes; nothing has run yet.
        self.inputs = self.code = self.fixed_code = None
        self.execution_success = self.errors = None

# Base class for all nodes in the workflow
class Node:
    """Common base for workflow steps.

    Holds the node's name, the shared context, its current ``NodeState``,
    a retry budget and a free-form ``data`` dict. Subclasses provide ``run``.
    """

    def __init__(self, name, context, retries= 5):
        self.name, self.context = name, context
        self.max_retries = retries
        # Every node starts out idle with no node-local data.
        self.state = NodeState.INITIAL
        self.data = dict()

    def run(self):
        """Execute the node; must be overridden by subclasses."""
        raise NotImplementedError("Each node must implement the run method")

# Node for collecting inputs
class CollectInputsNode(Node):
    """Workflow step that gathers dataset, task and target from the user."""

    def run(self):
        """Collect inputs into the shared context.

        Returns:
            True when inputs were collected (state becomes ``SUCCESS``),
            False otherwise (state becomes ``FAILED``).
        """
        self.state = NodeState.RUNNING
        self.context.inputs = self.collect_inputs()
        collected = self.inputs_collected()
        self.state = NodeState.SUCCESS if collected else NodeState.FAILED
        return collected

    def collect_inputs(self):
        """Run an interactive ``Conversation`` and return what it extracted.

        Returns:
            A dict with ``dataset_url``, ``machine_learning_task`` and
            ``target_column`` when the conversation succeeded, else ``None``.
        """
        # Generation settings used for entity extraction.
        extraction_args = dict(
            max_new_tokens=100,
            return_full_text=False,
            temperature=0.0,
            do_sample=False,
        )
        conversation = Conversation(self.context.lm, extraction_args, self.max_retries)
        conversation.chat()
        if not conversation.is_chat_successful():
            return None
        return {
            'dataset_url': conversation.dataset_url,
            'machine_learning_task': conversation.machine_learning_task,
            'target_column': conversation.target_column,
        }

    def inputs_collected(self):
        """Return True when the shared context holds collected inputs."""
        return self.context.inputs is not None

# Node for generating code
class GenerateCodeNode(Node):
    """Workflow step that asks the language model to write the PyCaret code."""

    def run(self):
        """Build the code-generation prompt and store the model's reply.

        Reads the task, dataset URL and target column from
        ``context.inputs`` and the library documentation from
        ``context.library_doc``; writes the result to ``context.code``.

        Returns:
            Always True.
        """
        inputs = self.context.inputs
        task = inputs['machine_learning_task']
        dataset_url = inputs['dataset_url']
        target_column = inputs['target_column']

        # The documentation is supplied as a prior assistant turn so the final
        # user message can refer to it.
        messages = [
            {"role": "system", "content": "You are a helpful, and accurate, AI assistant, that generates bug free executable python code."},
            {"role": "user", "content": "Here is the documentation on how to use the pycaret library for finding best classification model and fit it on new dataset"},
            {"role": "assistant", "content": self.context.library_doc},
            {"role": "user", "content": f"Write code to find best model for {task} for dataset located at url: {dataset_url} and target column:{target_column} using pycaret library, don't fit it on new data. Only generate executable code and nothing else like explanation or reasoning"},
        ]
        generation_settings = dict(
            max_new_tokens=1000,
            return_full_text=False,
            temperature=0.0,
            do_sample=False,
        )
        self.context.code = self.generate_code(messages, generation_settings)
        return True

    def generate_code(self, code_gen_prompt, code_generation_args):
        """Return the language model's reply to ``code_gen_prompt``."""
        language_model = self.context.lm
        return language_model.generate_text(code_gen_prompt, code_generation_args)

# Node for executing code
class ExecuteCodeNode(Node):
    """Workflow step that executes the code stored in the shared context."""

    def run(self):
        """Execute ``context.code`` and record the outcome on the context.

        Returns:
            True when the code ran without raising, False otherwise. The
            same flag is stored in ``context.execution_success`` and the
            captured tracebacks (or ``None``) in ``context.errors``.
        """
        print('inside execute code')
        print(self.context.code)
        success, errors = self.execute_code(self.context.code)
        self.context.execution_success = success
        self.context.errors = errors
        return success

    def execute_code(self, code):
        """Run ``code`` with ``exec`` and report whether it succeeded.

        SECURITY: ``code`` is model-generated and runs with the full
        privileges of this process. Only use this in a disposable or
        sandboxed environment.

        Args:
            code: Python source to execute.

        Returns:
            ``(True, None)`` on success, or ``(False, [traceback_text])``
            when executing the code raised an exception.
        """
        # Execute in ONE namespace (a copy of this module's globals). With
        # separate globals/locals, functions defined by the generated code
        # cannot see the generated code's own top-level names. Copying keeps
        # notebook names visible without letting the code overwrite them.
        namespace = dict(globals())
        try:
            exec(code, namespace)
            return True, None  # Indicate successful execution
        except Exception:
            error_message = traceback.format_exc()
            print("Error message:\n", error_message)
            return False, [error_message]

# Node for fixing errors
class FixErrorsNode(Node):
    """Workflow step that asks the language model to repair failing code."""

    def __init__(self, name, context, max_retries=3):
        """Create the node with a budget of ``max_retries`` fix attempts."""
        super().__init__(name, context, retries=max_retries)
        self.retries = 0
        self.max_retries = max_retries

    def run(self):
        """Produce a fixed version of the failing code, if budget remains.

        The budget is checked before generating, so every fix that is
        generated is handed back for execution and exactly ``max_retries``
        fixes are attempted.

        Returns:
            True when a new fix was stored in ``context.code`` (and
            ``context.fixed_code``); False when the retry budget is exhausted.
        """
        if self.retries >= self.max_retries:
            return False
        self.context.fixed_code = self.fix_errors(self.context.errors)
        self.context.code = self.context.fixed_code
        self.retries += 1
        return True

    def fix_errors(self, errors):
        """Ask the language model for a corrected version of ``context.code``.

        Args:
            errors: Iterable of error/traceback texts from the failed run.

        Returns:
            The model's corrected code, or the current code unchanged when
            there is no code or no error to work from.
        """
        failing_code = self.context.code
        if not errors or not failing_code:
            return failing_code

        error_report = "\n".join(str(error) for error in errors)
        fix_prompt = [
            {"role": "system", "content": "You are a helpful, and accurate, AI assistant, that generates bug free executable python code."},
            {"role": "user", "content": f"The following python code failed when it was executed.\n\nCode:\n{failing_code}\n\nError:\n{error_report}\n\nWork out why the error occurs and fix it. Only generate the complete corrected executable code and nothing else like explanation or reasoning"},
        ]
        # Same deterministic settings GenerateCodeNode uses for code generation.
        fix_generation_args = {
            "max_new_tokens": 1000,
            "return_full_text": False,
            "temperature": 0.0,
            "do_sample": False,
        }
        return self.context.lm.generate_text(fix_prompt, fix_generation_args)

In [None]:
# The workflow graph using the transitions library
class Workflow:
    """State machine that drives input collection, code generation and execution.

    Each non-terminal state has one trigger; the trigger's condition callback
    runs the matching node and the transition only happens when that
    callback returns True.
    """

    states = [
        'collecting_inputs',
        'generating_code',
        'executing_code',
        'fixing_errors',
        'finished',
        'max_retries_reached'
    ]

    # Trigger to fire for each non-terminal state.
    _TRIGGER_BY_STATE = {
        'collecting_inputs': 'collect_inputs',
        'generating_code': 'generate_code',
        'executing_code': 'execute_code',
        'fixing_errors': 'fix_errors',
    }
    _TERMINAL_STATES = ('finished', 'max_retries_reached')

    def __init__(self, lm, documentation):
        """Create the shared context, the nodes and the state machine.

        Args:
            lm: Language model stored on the shared context.
            documentation: Library documentation stored on the shared context.
        """
        self.context = WorkflowContext(lm, documentation)
        self.nodes = {
            'collecting_inputs': CollectInputsNode('collect_inputs', self.context),
            'generating_code': GenerateCodeNode('generate_code', self.context),
            'executing_code': ExecuteCodeNode('execute_code', self.context),
            'fixing_errors': FixErrorsNode('fix_errors', self.context),
        }

        # Set up the state machine
        self.machine = Machine(model=self, states=Workflow.states, initial='collecting_inputs')

        # Define transitions between states
        self.machine.add_transition('collect_inputs', 'collecting_inputs', 'generating_code', conditions='run_collecting_inputs')
        self.machine.add_transition('generate_code', 'generating_code', 'executing_code', conditions='run_generating_code')
        self.machine.add_transition('execute_code', 'executing_code', 'finished', conditions='run_executing_code')
        self.machine.add_transition('execution_failed', 'executing_code', 'fixing_errors')
        self.machine.add_transition('fix_errors', 'fixing_errors', 'executing_code', conditions='run_fixing_errors')
        self.machine.add_transition('max_retries', '*', 'max_retries_reached')

    def run_collecting_inputs(self):
        """Condition for ``collect_inputs``: run the input-collection node."""
        return self.nodes['collecting_inputs'].run()

    def run_generating_code(self):
        """Condition for ``generate_code``: run the code-generation node."""
        return self.nodes['generating_code'].run()

    def run_executing_code(self):
        """Condition for ``execute_code``: run the code; on failure go fix it."""
        if self.nodes['executing_code'].run():
            return True
        # Move to 'fixing_errors' ourselves, then veto the pending transition.
        self.execution_failed()
        return False

    def run_fixing_errors(self):
        """Condition for ``fix_errors``: produce a fix, or give up when out of retries."""
        if self.nodes['fixing_errors'].run():
            return True
        self.max_retries()
        return False

    def run(self):
        """Fire triggers until the workflow finishes or gives up.

        If a trigger leaves the state unchanged (for example input
        collection failed), the workflow moves to ``max_retries_reached``
        instead of repeating the same step forever.
        """
        while self.state not in self._TERMINAL_STATES:
            print('Inside Workflow, current state is:', self.state)
            state_before = self.state
            trigger_name = self._TRIGGER_BY_STATE.get(state_before)
            if trigger_name is not None:
                getattr(self, trigger_name)()
            if self.state == state_before:
                # No progress was made: stop rather than loop indefinitely.
                self.max_retries()

In [None]:
# Load the model once for the whole notebook. LanguageModel passes
# trust_remote_code=True to from_pretrained, and the download log below
# suggests pinning a revision to avoid pulling new versions of the remote code files.
model_name = "microsoft/Phi-3-mini-128k-instruct"
lm = LanguageModel(model_name)

config.json:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def fetch_raw_html(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The response text, or ``None`` when the request failed, timed out or
        returned an error status (the error is printed).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return None

def parse_html(html_content):
    """Extract paragraphs, code snippets and section divs from HTML as labelled text.

    Args:
        html_content: HTML markup to parse.

    Returns:
        A string with one entry per extracted element, in document order.
        Each entry starts with a newline and has the form
        ``"<Type> <n>: <text>"`` where ``<Type>`` is ``Paragraph``, ``Code``
        or ``Section`` and ``<n>`` is the element's 1-based position.
    """
    soup = BeautifulSoup(html_content, 'lxml')

    # (element_type, text) pairs in the order the tags appear.
    elements = []
    for tag in soup.find_all(['p', 'code', 'div'], recursive=True):
        if tag.name == 'p':
            elements.append(('paragraph', tag.get_text()))
        elif tag.name == 'code':
            elements.append(('code', tag.get_text()))
        elif tag.name == 'div' and 'section' in tag.get('class', []):
            # Only divs carrying the "section" class are kept.
            elements.append(('section', tag.get_text()))

    # Build the result in one pass instead of repeated string concatenation.
    return "".join(
        f"\n{element_type.capitalize()} {position}: {text}"
        for position, (element_type, text) in enumerate(elements, start=1)
    )

# Source of the PyCaret documentation used as context for code generation.
# Alternative (live docs site):
#url = "https://pycaret.gitbook.io/docs/get-started/quickstart"
url = "https://raw.githubusercontent.com/abhimanyu729/GenAIPlayground/main/aisc_demo/pycaret_documentation.html"

# Download the page; documentation_context stays None when nothing was fetched.
raw_html = fetch_raw_html(url)
documentation_context = parse_html(raw_html) if raw_html else None

In [None]:
workflow = Workflow(lm, documentation_context)

In [None]:
workflow.run()

Inside Workflow, current state is: collecting_inputs
here
The dataset I want to use is Titanic, and the column to classify on is Survived
Dataset location invalid try again
Machine Learning Task: Classification
Target: Survived
You can find data here: https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv
Dataset URL: https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv
Machine Learning Task: Classification
Target: Survived
Inside Workflow, current state is: generating_code
Inside Workflow, current state is: executing_code
inside execute code
import pandas as pd
from pycaret.classification import *

# Load the dataset
data = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')

# Set up the PyCaret environment
clf = setup(data=data, target='Survived')

# Train and compare models
best_model = compare_models()

# Tune the best model
tuned_model = tune_model(best_model)

# Evaluate the tuned model
eval

Unnamed: 0,Description,Value
0,Session id,8856
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 12)"
4,Transformed data shape,"(891, 14)"
5,Transformed train set shape,"(623, 14)"
6,Transformed test set shape,"(268, 14)"
7,Numeric features,6
8,Categorical features,5
9,Rows with missing values,79.5%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.8124,0.8675,0.6857,0.8011,0.7341,0.5912,0.5992,0.245
ridge,Ridge Classifier,0.7578,0.8563,0.4353,0.8681,0.5739,0.4327,0.4849,0.179
et,Extra Trees Classifier,0.6839,0.7789,0.2176,0.8171,0.3332,0.2197,0.3003,0.34
nb,Naive Bayes,0.6727,0.7986,0.1888,0.8123,0.3015,0.1893,0.2738,0.101
knn,K Neighbors Classifier,0.623,0.5918,0.3518,0.5189,0.4178,0.1535,0.161,0.121
lda,Linear Discriminant Analysis,0.6196,0.5263,0.0261,0.06,0.0364,0.0179,0.0208,0.103
dt,Decision Tree Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.101
rf,Random Forest Classifier,0.6164,0.8075,0.0,0.0,0.0,0.0,0.0,0.253
qda,Quadratic Discriminant Analysis,0.6164,0.5538,0.0,0.0,0.0,0.0,0.0,0.102
ada,Ada Boost Classifier,0.6164,0.5,0.0,0.0,0.0,0.0,0.0,0.103


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7778,0.8739,0.75,0.6923,0.72,0.5363,0.5375
1,0.6984,0.6709,0.4583,0.6471,0.5366,0.3226,0.3331
2,0.7778,0.8729,0.6667,0.7273,0.6957,0.5212,0.5224
3,0.7903,0.8796,0.6522,0.75,0.6977,0.5384,0.5415
4,0.8871,0.9211,0.75,0.9474,0.8372,0.7526,0.7646
5,0.871,0.8333,0.7083,0.9444,0.8095,0.7149,0.7319
6,0.8226,0.8925,0.75,0.7826,0.766,0.6232,0.6236
7,0.9194,0.8575,0.8333,0.9524,0.8889,0.826,0.8306
8,0.8387,0.8925,0.75,0.8182,0.7826,0.6548,0.6564
9,0.8387,0.8882,0.7083,0.85,0.7727,0.6493,0.6558


Fitting 10 folds for each of 10 candidates, totalling 100 fits


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [None]:
workflow.state

'finished'

In [None]:
# Why do you think the error exists? Provide reasoning.
# Use the reasoning, the error, and
# the input code to fix it.