# Setup

In [1]:
import os
import re
import sys

import pandas as pd
# Put the parent of the current working directory on the import path (once),
# so the `fedot_llm` imports that follow can be resolved from there.
# NOTE(review): presumably the parent directory is the repository root — confirm.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from fedot_llm.language_models.llms import HuggingFaceLLM
from fedot_llm.language_models.actions import ModelAction
from fedot_llm.data.data import Dataset
from fedot_llm.fedot_util import run_example
from pprint import pprint

# Loading the data

In [2]:
# Choose the dataset by changing the index: 0 -> 'titanic', 1 -> 'credit-g'.
dataset_name = ('titanic', 'credit-g')[0]

# Datasets are read from ../datasets/<dataset_name>, relative to the working directory.
dataset_path = os.path.join('..', 'datasets', dataset_name)
dataset = Dataset.load_from_path(dataset_path)

dataset_description = dataset.description
dataset_metadata_description = dataset.metadata_description

# Print the description, an empty separator line, then the metadata summary.
print(dataset_description, '', dataset_metadata_description, sep='\n')

None

name: None 
description: None 
goal: None 
train_split_name: None 
splits:

name: test_merged 
path: ../datasets/titanic/test_merged.csv 
description: None

name: test 
path: ../datasets/titanic/test.csv 
description: None

name: predictions 
path: ../datasets/titanic/predictions.csv 
description: None

name: train 
path: ../datasets/titanic/train.csv 
description: None

name: gender_submission 
path: ../datasets/titanic/gender_submission.csv 
description: None


In [3]:
import json

# Long free-text descriptions, keyed by dataset name (e.g. 'titanic').
# JSON is UTF-8 by specification and these texts contain typographic quotes,
# so decode explicitly instead of relying on the platform's locale encoding
# (which is not UTF-8 on every system and would garble or reject the file).
with open('../datasets/big_descriptions.json', 'r', encoding='utf-8') as json_file:
    dataset_big_descriptions = json.load(json_file)
dataset_big_descriptions

{'titanic': 'The sinking of the Titanic is one of the most infamous shipwrecks in history.\n\nOn April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.\n\nWhile there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.\n\nIn this challenge, we ask you to build a predictive model that answers the question: “what sorts of people were more likely to survive?” using passenger data (ie name, age, gender, socio-economic class, etc).\n\nIn this competition, you’ll gain access to two similar datasets that include passenger information like name, age, gender, socio-economic class, etc. One dataset is titled train.csv and the other is titled test.csv.\n\nTrain.csv will contain the details of a subset of the passengers on board (

In [None]:
# Build the LLM wrapper for the `microsoft/Phi-3-mini-4k-instruct` model id.
# NOTE(review): max_new_tokens=500 presumably caps the length of each generated
# answer — confirm against HuggingFaceLLM.
model = HuggingFaceLLM(
    model_id="microsoft/Phi-3-mini-4k-instruct",
    max_new_tokens=500,
)

# `action` is the object the later cells call to run prompts against `model`.
action = ModelAction(model)

# Refining the dataset information

In [None]:
from fedot_llm.language_models import prompts

action = ModelAction(model=model)

# 1: Dataset name and identification of the training split.
# Both prompts use the long free-text description of the selected dataset as
# the system message.
big_description = dataset_big_descriptions[dataset_name]

# NOTE(review): the "train_split" context is `dataset.description`, which was
# printed as None right after loading; `dataset.metadata_description` is the
# field that lists the split names — confirm which one was intended here.
task_prompts = {
    "dataset_name": {
        "system": big_description,
        "task": prompts.dataset_name_prompt,
        "context": None,
    },
    "train_split": {
        "system": big_description,
        "task": prompts.train_split_definition_prompt,
        "context": dataset.description,
    },
}

# Post-processing per response key: keep only the text before the first dot,
# so an answer such as "train.csv" is reduced to "train".
operations = {
    "train_split": lambda answer: answer.partition(".")[0],
}

responses = action.run_model_multicall(task_prompts)
responses = action.process_model_responses(responses, operations)
pprint(responses)

# Store the model's answers on the dataset object for the following cells.
dataset.name = responses["dataset_name"]
dataset.train_split_name = responses["train_split"]

In [None]:
# Find the split whose name equals the training split name stored on the
# dataset. That name comes from a model answer, so when nothing matches, fail
# with a message that shows what was looked for and what exists, instead of an
# opaque IndexError from indexing an empty list.
matching_splits = [
    split for split in dataset.splits
    if split.name == dataset.train_split_name
]
if not matching_splits:
    raise ValueError(
        f"No split named {dataset.train_split_name!r}; "
        f"available splits: {[split.name for split in dataset.splits]}"
    )
train = matching_splits[0]

# Generate a description for every column of the training split and attach
# the result to the split; the descriptions are also shown as the cell output.
column_descriptions = action.generate_all_column_description(split=train, dataset=dataset)
train.set_column_descriptions(column_descriptions)
column_descriptions

In [None]:
# 2: Categorical columns, target column, task type

# Both prompts share the same system message and carry no extra context;
# only the task text differs.
# NOTE(review): `dataset.description` was printed as None right after loading
# and no visible cell assigns it — confirm it is populated by this point.
task_prompts = {
    prompt_key: {
        "system": dataset.description,
        "task": task_text,
        "context": None,
    }
    for prompt_key, task_text in (
        ("target_column", prompts.target_definition_prompt),
        ("task_type", prompts.task_definition_prompt),
    )
}

responses = action.run_model_multicall(task_prompts)

# Straight and typographic quotes, backtick and acute accent: every one of
# these characters is removed from the model's answers.
pattern = r'[\'\"“”‘’`´]'


def _strip_quotes(text):
    """Return ``text`` with every character matched by ``pattern`` removed."""
    return re.sub(pattern, '', text)


operations = {
    "target_column": _strip_quotes,
    # The task type is additionally lower-cased before the quotes are removed.
    "task_type": lambda answer: _strip_quotes(answer.lower()),
}
responses = action.process_model_responses(responses, operations)

# Categorical columns are obtained through a separate helper call and stored
# next to the two prompt answers.
responses['categorical_columns'] = action.get_categorical_features(split=train, dataset=dataset)
pprint(responses)

In [None]:
# Select the splits by name rather than by position. In the recorded output of
# the loading cell the splits are listed as test_merged, test, predictions,
# train, gender_submission, so indices 3 and 0 meant `train` and `test_merged`
# on that run. That order is not sorted (it presumably follows the directory
# listing order — confirm in Dataset.load_from_path), so positional indices
# can silently pick a different file on another machine or dataset.
TEST_SPLIT_NAME = "test_merged"  # the split that sat at index 0 in the recorded run

splits_by_name = {split.name: split for split in dataset.splits}
if TEST_SPLIT_NAME not in splits_by_name:
    raise ValueError(
        f"No split named {TEST_SPLIT_NAME!r}; "
        f"available splits: {sorted(splits_by_name)}"
    )

# `train` is the split already resolved from `dataset.train_split_name` in the
# column-description cell above.
prediction = run_example(
    train_df=train.data,
    test_df=splits_by_name[TEST_SPLIT_NAME].data,
    problem=responses['task_type'],
    target=responses['target_column'],
)