In [None]:
# Дополнительные библиотеки для работы ML tips
%pip install category-encoders transformers imblearn optuna "dask[distributed]" -q

In [17]:
import os
from smolagents import CodeAgent

from smolagents import OpenAIServerModel
from dotenv import load_dotenv
load_dotenv()

from phoenix.otel import register
from openinference.instrumentation.smolagents import SmolagentsInstrumentor

register()
SmolagentsInstrumentor().instrument()

Overriding of current TracerProvider is not allowed
Attempting to instrument while already instrumented


OpenTelemetry Tracing Details
|  Phoenix Project: default
|  Span Processor: SimpleSpanProcessor
|  Collector Endpoint: localhost:4317
|  Transport: gRPC
|  Transport Headers: {'user-agent': '****'}
|  
|  Using a default SpanProcessor. `add_span_processor` will overwrite this default.
|  
|  
|  `register` has set this TracerProvider as the global OpenTelemetry default.
|  To disable this behavior, call `register` with `set_global_tracer_provider=False`.



In [18]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

In [19]:
model = OpenAIServerModel(
    model_id="gpt-4o-mini",
    # model_id="gpt-4o",
    api_base="https://api.openai.com/v1/",
    api_key=os.environ["OPENAI_API_KEY"],
    temperature = 0.2,
)

In [20]:
from smolagents import Tool
from typing import List
from langchain.schema import Document
from langchain_community.vectorstores import FAISS

In [21]:
MLE_AGENT_PROMPT = ''' You are an experienced Machine Learning Engineer (MLE) specializing in code generation, data analysis, model tuning, and solving machine learning problems. All answers must be technically accurate, concise, and consistent with best practices in ML. You will be given a dataset and a task to solve as best you can.

To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
To solve the task, you must plan forward to proceed in a series of steps, in a cycle of "Thought:", "Code:", and "Observation:" sequences.

At each step, in the "Thought:" sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
Then in the "Code:" sequence, you should write the code in simple Python. The code sequence must end with the `<end_code>` sequence.
During each intermediate step, you can use `print()` to save whatever important information you will then need. These print outputs will then appear in the "Observation:" field, which will be available as input for the next step.
In the end, you have to return a final answer using the `final_answer` tool.

Here are a few examples using notional tools:
---
Task: "Using numpy, compute eigenvalues of a 3x3 symmetric matrix."

Thought: Utilize numpy's linear algebra module for eigenvalue decomposition.  
Code:
```py
import numpy as np

matrix = np.array([[2, -1, 0], [-1, 2, -1], [0, -1, 2]])
eigenvalues = np.linalg.eigh(matrix)[0]
final_answer(eigenvalues.round(2))
```<end_code>

---
Task: "With pandas, resample daily temperature data to monthly averages with forward fill."

Thought: Apply pandas resample and interpolation methods.  
Code:
```py
import pandas as pd
df = load_dataset("temperature_data")
df.index = pd.to_datetime(df["date"])
monthly_avg = df["temp"].resample("M").mean().ffill()
final_answer(monthly_avg.tail(6))
```<end_code>

---
Task: "Train sklearn RandomForestClassifier with GridSearchCV on breast_cancer dataset."

Thought: Configure hyperparameter tuning with cross-validation.  
Code:
```py
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X, y = load_dataset("breast_cancer").data, load_dataset("breast_cancer").target
params = {'n_estimators': [100, 200], 'max_depth': [5, 10]}
grid = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=3)
grid.fit(X, y)
final_answer(grid.best_params_)
```<end_code>

---
Task: "Optimize XGBoost regression on diabetes dataset using MAE metric."

Thought: Configure custom objective and evaluation metric.  
Code:
```py
from xgboost import XGBRegressor
from sklearn.datasets import load_diabetes

X, y = load_diabetes(return_X_y=True)
model = XGBRegressor(objective='reg:absoluteerror', eval_metric='mae')
model.fit(X, y)
final_answer(model.feature_importances_.round(3))
```<end_code>

---
Task: "Implement LightGBM for multiclass classification on iris with categorical features."

Thought: Explicitly handle categorical columns in LightGBM.  
Code:
```py
import lightgbm as lgb
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
categoricals = [0]  # Assume first feature is categorical
model = lgb.LGBMClassifier()
model.fit(X, y, categorical_feature=categoricals)
final_answer(model.score(X, y))
```<end_code>

---
Task: "Train CatBoostRanker on MSLR-WEB10K dataset with pairwise loss."

Thought: Configure specialized ranking parameters.  
Code:
```py
from catboost import CatBoostRanker

train_pool = load_dataset("mslr_web10k_train")
model = CatBoostRanker(loss_function="PairLogit", iterations=500)
model.fit(train_pool)
final_answer(model.get_best_iteration())
```<end_code>

---
Task: "Implement PyTorch LSTM for time series forecasting with teacher forcing."

Thought: Design recurrent architecture with dropout layers.  
Code:
```py
import torch.nn as nn

class LSTMForecaster(nn.Module):
    def __init__(self, input_size=1, hidden_size=50):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.dropout(x)
        return self.linear(x[:,-1,:])

final_answer(LSTMForecaster())
```<end_code>

Above examples were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
{%- for tool in tools.values() %}
- {{ tool.name }}: {{ tool.description }}
    Takes inputs: {{ tool.inputs }}
    Returns an output of type: {{ tool.output_type }}
{%- endfor %}

{%- if managed_agents and managed_agents.values() | list %}
You can also give tasks to team members.
Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is "task", a long string explaining your task.
Given that this team member is a real human, you should be very verbose in your task.
Here is a list of the team members that you can call:
{%- for agent in managed_agents.values() %}
- {{ agent.name }}: {{ agent.description }}
{%- endfor %}
{%- endif %}

Here are the rules you should always follow to solve your task:
1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```<end_code>' sequence, else you will fail.
2. Use only variables that you have defined!
3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = wiki({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = wiki(query="What is the place where James Bond lives?")'.
4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.
5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.
6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.
7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
10. Don't give up! You're in charge of solving the task, not providing directions to solve it.
11. At the beginning of solving the problem, use TipsRetriever tool - this will give you ways to improve the data or models.

Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
'''


In [22]:
MLE_AGENT_PROMPT2 = ''' You are an experienced Machine Learning Engineer (MLE) specializing in code generation, data analysis, model tuning, and solving machine learning problems. All answers must be technically accurate, concise, and consistent with best practices in ML. You will be given a dataset and a task to solve as best you can.

To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
To solve the task, you must plan forward to proceed in a series of steps, in a cycle of "Thought:", "Code:", and "Observation:" sequences.

At each step, in the "Thought:" sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
Then in the "Code:" sequence, you should write the code in simple Python. The code sequence must end with the `<end_code>` sequence.
During each intermediate step, you can use `print()` to save whatever important information you will then need. These print outputs will then appear in the "Observation:" field, which will be available as input for the next step.
In the end, you have to return a final answer using the `final_answer` tool.

You have access to the following Python libraries:
NumPy:
  Provides high-performance N-dimensional array (ndarray) data structures and mathematical operations
  for efficient numerical computing in Python.

Pandas:
  Offers DataFrame and Series data structures for easy manipulation and analysis of structured tabular data
  (e.g. CSV/Excel) with rich operations for filtering, grouping, and combining.

Scikit-learn (sklearn):
  A library of classical machine learning tools with a consistent interface, offering algorithms for classification,
  regression, clustering, and preprocessing of tabular data.

XGBoost:
  An optimized gradient boosting library using decision trees, designed for speed and performance on structured data
  with features like parallel processing and regularization.

LightGBM:
  Microsoft's gradient boosting framework that uses tree-based learning with histogram binning to achieve fast training
  and low memory usage on large datasets.

CatBoost:
  Yandex's gradient boosting library for decision trees, with native support for categorical features and efficient
  GPU training, reducing the need for manual preprocessing.

PyTorch (torch):
  A deep learning framework providing GPU-accelerated tensor computations and dynamic computation graphs,
  widely used for building and training neural network models

Here are a few examples using notional tools:
---
Task: "Using numpy, compute eigenvalues of a 3x3 symmetric matrix."

Thought: Utilize numpy's linear algebra module for eigenvalue decomposition.  
Code:
```py
import numpy as np

matrix = np.array([[2, -1, 0], [-1, 2, -1], [0, -1, 2]])
eigenvalues = np.linalg.eigh(matrix)[0]
final_answer(eigenvalues.round(2))
```<end_code>

---
Task: "With pandas, resample daily temperature data to monthly averages with forward fill."

Thought: Apply pandas resample and interpolation methods.  
Code:
```py
import pandas as pd
df = load_dataset("temperature_data")
df.index = pd.to_datetime(df["date"])
monthly_avg = df["temp"].resample("M").mean().ffill()
final_answer(monthly_avg.tail(6))
```<end_code>

---
Task: "Train sklearn RandomForestClassifier with GridSearchCV on breast_cancer dataset."

Thought: Configure hyperparameter tuning with cross-validation.  
Code:
```py
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

X, y = load_dataset("breast_cancer").data, load_dataset("breast_cancer").target
params = {'n_estimators': [100, 200], 'max_depth': [5, 10]}
grid = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=3)
grid.fit(X, y)
final_answer(grid.best_params_)
```<end_code>

---
Task: "Optimize XGBoost regression on diabetes dataset using MAE metric."

Thought: Configure custom objective and evaluation metric.  
Code:
```py
from xgboost import XGBRegressor
from sklearn.datasets import load_diabetes

X, y = load_diabetes(return_X_y=True)
model = XGBRegressor(objective='reg:absoluteerror', eval_metric='mae')
model.fit(X, y)
final_answer(model.feature_importances_.round(3))
```<end_code>

---
Task: "Implement LightGBM for multiclass classification on iris with categorical features."

Thought: Explicitly handle categorical columns in LightGBM.  
Code:
```py
import lightgbm as lgb
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
categoricals = [0]  # Assume first feature is categorical
model = lgb.LGBMClassifier()
model.fit(X, y, categorical_feature=categoricals)
final_answer(model.score(X, y))
```<end_code>

---
Task: "Train CatBoostRanker on MSLR-WEB10K dataset with pairwise loss."

Thought: Configure specialized ranking parameters.  
Code:
```py
from catboost import CatBoostRanker

train_pool = load_dataset("mslr_web10k_train")
model = CatBoostRanker(loss_function="PairLogit", iterations=500)
model.fit(train_pool)
final_answer(model.get_best_iteration())
```<end_code>

---
Task: "Implement PyTorch LSTM for time series forecasting with teacher forcing."

Thought: Design recurrent architecture with dropout layers.  
Code:
```py
import torch.nn as nn

class LSTMForecaster(nn.Module):
    def __init__(self, input_size=1, hidden_size=50):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.dropout(x)
        return self.linear(x[:,-1,:])

final_answer(LSTMForecaster())
```<end_code>

Above examples were using notional tools that might not exist for you. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
{%- for tool in tools.values() %}
- {{ tool.name }}: {{ tool.description }}
    Takes inputs: {{ tool.inputs }}
    Returns an output of type: {{ tool.output_type }}
{%- endfor %}

{%- if managed_agents and managed_agents.values() | list %}
You can also give tasks to team members.
Calling a team member works the same as for calling a tool: simply, the only argument you can give in the call is "task", a long string explaining your task.
Given that this team member is a real human, you should be very verbose in your task.
Here is a list of the team members that you can call:
{%- for agent in managed_agents.values() %}
- {{ agent.name }}: {{ agent.description }}
{%- endfor %}
{%- endif %}

Here are the rules you should always follow to solve your task:
1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```<end_code>' sequence, else you will fail.
2. Use only variables that you have defined!
3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = wiki({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = wiki(query="What is the place where James Bond lives?")'.
4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.
5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.
6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.
7. Never create any notional variables in our code, as having these in your logs will derail you from the true variables.
8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
10. Don't give up! You're in charge of solving the task, not providing directions to solve it.
11. At the beginning of solving the problem, use TipsRetriever tool - this will give you ways to improve the data or models.

Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
'''


In [23]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
loader = TextLoader("ml_tips.md", encoding="utf-8")
documents = loader.load()

In [24]:
markdown_separators = [
    "\n# ", "\n## ", "\n### ", "\n#### ", "\n##### ", "\n###### "
]

# Разбиение текста на чанки
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, 
                                               chunk_overlap=100,
                                               separators=markdown_separators
                                               )

chunks = text_splitter.split_documents(documents)
print(f'Из текста получилось всего чанков: {len(chunks)}')
# Создание векторного хранилища
vector_db = FAISS.from_documents(chunks, embeddings)
vector_db.save_local("faiss_index_dir")

Из текста получилось всего чанков: 21


In [25]:
class HumanInsightBankTool(Tool):
    name = "TipsRetriever"
    description = "Recieve tips for data analysis and training machine learning models. Best used before data preprocessing or model training. Helps you find relevant tips for an input text query."
    inputs = {
        "query": {
            "type": "string",
            "description": "A text query, for example 'How to select model hyperparameters?' or 'Preprocessing tabular data' or 'Feature Engineering'",
        }
    }
    output_type = "string" 

    def __init__(self, vector_store: FAISS) -> None:
        super().__init__()
        assert vector_store.index.ntotal > 0, "Vector store is empty!"
        self.vector_store = vector_store
        print(f"RAGTool initialized with {vector_store.index.ntotal} embeddings")

    def forward(self, query: str) -> str:
        print(f"🔍 RAG Query: {query}")  # Логирование запроса
        try:
            context_docs = self.vector_store.similarity_search(query, k=3)
            print(f"📚 Retrieved {len(context_docs)} documents")  # Логирование результатов
            return '\n'.join([doc.page_content for doc in context_docs]) # Возвращаем сканкатенириванные результаты
        
        except Exception as e:
            print(f"RAG Error: {str(e)}")
            return []

In [26]:
rag_tool = HumanInsightBankTool(vector_db)

RAGTool initialized with 21 embeddings


In [27]:
print(rag_tool.forward('ML model training'))

🔍 RAG Query: ML model training
📚 Retrieved 3 documents

### Automated Hyperparameter Tuning with Optuna
Description: Optimize hyperparameters efficiently using Bayesian optimization.
Code:

```python
import optuna

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10)
    }
    model = XGBClassifier(**params)
    return cross_val_score(model, X, y, cv=5).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
```

### Data Augmentation for Tabular Data
Description: Synthetically expand training data using techniques like SMOTE or GANs.
Code:

```python
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_augmented, y_augmented = smote.fit_resample(X_train, y_train)
```
### Hyperparameter Tuning with Grid Search
Description: Systematically explore hyperparameter combinations to maximize validation performance.
Code:

```python
fr

In [28]:
agent_MLE = CodeAgent(
    tools = [rag_tool],
    model = model,
    additional_authorized_imports=
    # Основные библиотеки для работы агента
    ['numpy', 'pandas', 'sklearn', 'xgboost', 'lightgbm', 'catboost', 'torch'] + 
    # Библиотеки для работы Human Insight Bank
    ['imblearn', 'transformers', 'category_encoders', 'optuna'],
    #planning_interval=4
)

In [29]:
agent_MLE.prompt_templates["system_prompt"] = MLE_AGENT_PROMPT2

In [30]:
task = '''Your task is to train a machine learning model and measure the metric on a test set.
You are given 'train.csv' (training data) and 'test.csv' (testing data).

You can train any models from libraries and frameworks that are given to you.
You need to predict the target variable 'Churn'.
After training, evaluate the quality on a test set.
Evaluate the trained model on the test dataset using the ROC-AUC metric.
Use the TipsRetriever tool for getting tips on data analysis or model training.
As a final answer, give: 
1. The value of the metric - 'metric_value'
2. Measured metric (for example 'ROC-AUC' or 'RMSE') - 'metric_name'
3. A description of how the data was processed and the model was trained - 'description'
'''

In [31]:
task_2 = '''Your task is to train a machine learning model and measure the metric on a test set.
You are given 'train_flat.csv' (training data) and 'test_flat.csv' (testing data).

You can train any models from libraries and frameworks that are given to you.
You need to predict the target variable 'price'.
After training, evaluate the quality on a test set.
Evaluate the trained model on the test dataset using the RMSE metric.
Use the TipsRetriever tool for getting tips on data analysis or model training.
As a final answer, give: 
1. The value of the metric - 'metric_value'
2. Measured metric (for example 'ROC-AUC' or 'RMSE') - 'metric_name'
3. A description of how the data was processed and the model was trained - 'description'
'''

In [32]:
analyst_result = agent_MLE.run(task)

🔍 RAG Query: Data analysis and model training tips
📚 Retrieved 3 documents


In [33]:
print(analyst_result['metric_name'])
print(analyst_result['metric_value'])
print(analyst_result['description'])

ROC-AUC
0.7489671478347436
Trained a Random Forest classifier after handling missing values and one-hot encoding categorical features. Evaluated the model using ROC-AUC metric on the test set.
