# Setup

In [1]:
import os
import sys
import re

import numpy as np
import pandas as pd

from pprint import pprint

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from fedot_llm.data.zip import unzip_archive
from fedot_llm.fedot_util import run_example
from fedot_llm.language_models.actions import ModelAction
from fedot_llm.language_models.llms import CustomWebLLM
from fedot_llm.data.data import Dataset
import fedot_llm.language_models.prompts

# Загрузка данных

In [2]:
dataset_name = [
    'titanic', 
    'credit-g'
][0]
dataset_path = os.sep.join(['..', 'datasets', dataset_name])

# zip_filename = f"{dataset_path}.zip"
# os.makedirs(dataset_path, exist_ok=True)
# unzip_archive(zip_filename, dataset_path)

In [3]:
dataset = Dataset.load_from_path(dataset_path)
dataset_description = dataset.get_description()
dataset_metadata_description = dataset.get_metadata_description()

print(dataset_description)
print()
print(dataset_metadata_description)

Assume we have a dataset
The dataset contains the following splits:
 
The test_merged split stored in file "test_merged.csv" contains following columns: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Survived']. It is described as None
The test split stored in file "test.csv" contains following columns: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']. It is described as None
The predictions split stored in file "predictions.csv" contains following columns: ['Unnamed: 0', 'Survived']. It is described as None
The train split stored in file "train.csv" contains following columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']. It is described as None
The gender_submission split stored in file "gender_submission.csv" contains following columns: ['PassengerId', 'Survived']. It is described as None

name: Non

In [4]:
import json
with open('../datasets/big_descriptions.json', 'r') as json_file:
    dataset_big_descriptions = json.load(json_file)

In [5]:
dataset_big_descriptions

{'titanic': 'The sinking of the Titanic is one of the most infamous shipwrecks in history.\n\nOn April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.\n\nWhile there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.\n\nIn this challenge, we ask you to build a predictive model that answers the question: “what sorts of people were more likely to survive?” using passenger data (ie name, age, gender, socio-economic class, etc).\n\nIn this competition, you’ll gain access to two similar datasets that include passenger information like name, age, gender, socio-economic class, etc. One dataset is titled train.csv and the other is titled test.csv.\n\nTrain.csv will contain the details of a subset of the passengers on board (

# Выбор модели

In [6]:
model_type = ["8b", "70b"][0]
url = 'http://10.32.2.2:8672/v1/chat/completions'

if model_type == "70b":
    url = 'http://10.32.15.21:6672/generate'

model = CustomWebLLM(url, model=model_type, timeout=10)

# Уточнение данных о датасете

In [7]:
from fedot_llm.language_models import prompts

action = ModelAction(model=model)
# 1: Название датасета и определение тренировочного сплита
task_prompts = {
    "dataset_name": {
        "system": dataset_big_descriptions[dataset_name],
        "task": prompts.dataset_name_prompt,
        "context": None,
    },
    "train_split": {
        "system": dataset_big_descriptions[dataset_name],
        "task": prompts.train_split_definition_prompt,
        "context": dataset.get_description(),
    }
}

responses = action.run_model_multicall(
    task_prompts
)
operations = {
    "train_split": lambda x : x.split(".")[0]
}
responses = action.process_model_responses(responses, operations)
pprint(responses)

dataset.name = responses["dataset_name"]
dataset.train_split_name = responses["train_split"] 

{'dataset_name': 'Titanic Survivorship Prediction Dataset',
 'train_split': 'train'}


In [8]:
train = list(filter(lambda split: split.name == dataset.train_split_name, dataset.splits))[0]
train.data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [9]:
column_descriptions = action.generate_all_column_description(split=train, dataset=dataset)
train.set_column_descriptions(column_descriptions)

In [10]:
column_descriptions

{'PassengerId': 'sequential unique identifier for each passenger',
 'Survived': 'a binary value indicating whether a passenger survived (1) or not (0)',
 'Pclass': 'passenger class, with 1 being the first class and 3 being the third class',
 'Name': 'the names of passengers on the titanic, including titles and suffixes',
 'Sex': 'binary classification indicating the gender of a passenger: male or female.',
 'Age': 'numerical values representing ages of passengers, ranging from 0.75 to 74.',
 'SibSp': 'number of siblings and spouses on board the ship',
 'Parch': 'number of parents and/or children accompanied on the voyage',
 'Ticket': 'unique identifiers for passengers and crew, including ticket numbers and cabin information',
 'Fare': 'continuous value representing the fare paid by passengers in british pounds, ranging from £6.24 to £263',
 'Cabin': "a column containing cabin numbers or combinations of numbers, with some values missing (represented by 'nan').",
 'Embarked': "the embark

In [12]:
# 2: Цель всей задачи

task_prompts = {
    "dataset_description": {
        "system": dataset_big_descriptions[dataset_name],
        "task": prompts.dataset_description_prompt,
        "context": dataset.get_description(),
    },
    "dataset_goal": {
        "system": dataset_big_descriptions[dataset_name],
        "task": prompts.dataset_goal_prompt,
        "context": dataset.get_description(),
    },
}

responses = action.run_model_multicall(
    task_prompts
)
pprint(responses)

dataset.description = responses["dataset_description"]
dataset.goal = responses["dataset_goal"]

{'dataset_description': 'Here is a short description of the dataset:\n'
                        '\n'
                        'The Titanic dataset contains information about 2224 '
                        'passengers who boarded the ill-fated RMS Titanic on '
                        'its maiden voyage in 1912. The dataset includes '
                        'columns for passenger identification, survival '
                        'status, class, name, gender, age, family '
                        'relationships, ticket information, fare paid, cabin '
                        'number, and embarkation port. With a mix of '
                        'categorical and numerical variables, this dataset '
                        'provides insights into the demographics and '
                        'characteristics of the passengers on board, allowing '
                        'for analysis and prediction of who survived the '
                        'tragic event.',
 'dataset_goal': 'Formulate a 

In [13]:
# 2: Категориальные столбцы, таргет-столбец, тип задачи

task_prompts = {
    "categorical_columns": {
        "system": dataset_description,
        "task": prompts.categorical_definition_prompt,
        "context": prompts.categorical_definition_context,
    },
    "target_column": {
        "system": dataset_description,
        "task": prompts.target_definition_prompt,
        "context": None,
    },
    "task_type": {
        "system": dataset_description,
        "task": prompts.task_definition_prompt,
        "context": None,
    }
}

#Выбор модели

model_type = ["8b", "70b"][0]
url = 'http://10.32.2.2:8672/v1/chat/completions'

if model_type == "70b":
    url = 'http://10.32.15.21:6672/generate'

responses = action.run_model_multicall(
    task_prompts
)

pattern = r'[\'\"“”‘’`´]'
operations = {
    "categorical_columns": lambda x : x.split("\n"),
    "target_column" :  lambda x : re.sub(pattern, '', x),
    "task_type": lambda x : re.sub(pattern, '', x.lower())
}
responses = action.process_model_responses(responses, operations)
pprint(responses)

{'categorical_columns': ['Pclass', 'Sex', 'Embarked', 'SibSp', 'Parch'],
 'target_column': 'Survived',
 'task_type': 'classification'}


# Запуск фреймворка

In [14]:
print(dataset.splits[3].name)
print(dataset.splits[0].name)

train
test_merged


In [15]:
dataset.splits[3].data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [16]:
prediction = run_example(train_df=dataset.splits[3].data, test_df=dataset.splits[0].data, problem=responses['task_type'], target=responses['target_column'])

2024-07-24 15:53:46,959 - Topological features operation requires extra dependencies for time series forecasting, which are not installed. It can infuence the performance. Please install it by 'pip install fedot[extra]'


Generations:   0%|          | 0/10000 [00:00<?, ?gen/s]

2024-07-24 15:53:54,219 - Topological features operation requires extra dependencies for time series forecasting, which are not installed. It can infuence the performance. Please install it by 'pip install fedot[extra]'
2024-07-24 15:53:54,219 - Topological features operation requires extra dependencies for time series forecasting, which are not installed. It can infuence the performance. Please install it by 'pip install fedot[extra]'
2024-07-24 15:53:54,219 - Topological features operation requires extra dependencies for time series forecasting, which are not installed. It can infuence the performance. Please install it by 'pip install fedot[extra]'


Generations:   0%|          | 0/10000 [00:07<?, ?gen/s]


KeyboardInterrupt: 

In [14]:
prediction[:5]

array([[0],
       [0],
       [0],
       [0],
       [1]])

In [15]:
result_df = pd.DataFrame(prediction, columns=[dataset_metadata["target_column"]])

result_df.to_csv(f"{dataset_path}/predictions.csv")

NameError: name 'dataset_metadata' is not defined