# Setup

In [None]:
import os
import sys
import re

import numpy as np
import pandas as pd

from pprint import pprint

# Put the parent of the current working directory on the import path
# (presumably so the `fedot_llm` imports below resolve without installation).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from fedot_llm.data.zip import unzip_archive
from fedot_llm.fedot_util import run_example
from fedot_llm.language_models.actions import ModelAction
from fedot_llm.language_models.llms import OllamaLLM
from fedot_llm.data.data import Dataset
import fedot_llm.language_models.prompts

: 

# Загрузка данных

In [2]:
# Datasets this notebook knows about; change the index below to switch.
available_datasets = ('titanic', 'credit-g')
dataset_name = available_datasets[0]

# Dataset folder, relative to the notebook's working directory.
dataset_path = os.path.join('..', 'datasets', dataset_name)

# Optional one-time extraction of a zipped dataset; uncomment when the
# dataset folder has not been unpacked yet.
# zip_filename = f"{dataset_path}.zip"
# os.makedirs(dataset_path, exist_ok=True)
# unzip_archive(zip_filename, dataset_path)

In [3]:
# Build the Dataset object from the files located at dataset_path.
dataset = Dataset.load_from_path(dataset_path)

# Keep the two textual summaries under their own names and show them.
dataset_description, dataset_metadata_description = (
    dataset.description,
    dataset.metadata_description,
)

print(dataset_description, dataset_metadata_description, sep="\n")

None

name: None 
description: None 
goal: None 
train_split_name: None 
splits:

name: test_merged 
path: ../datasets/titanic/test_merged.csv 
description: None

name: test 
path: ../datasets/titanic/test.csv 
description: None

name: predictions 
path: ../datasets/titanic/predictions.csv 
description: None

name: train 
path: ../datasets/titanic/train.csv 
description: None

name: gender_submission 
path: ../datasets/titanic/gender_submission.csv 
description: None


In [4]:
import json

# Load the long free-text descriptions, keyed by dataset name.
# The texts contain non-ASCII punctuation (curly quotes, apostrophes), so the
# file is decoded explicitly as UTF-8 instead of relying on the platform's
# default encoding, which differs between operating systems.
with open('../datasets/big_descriptions.json', 'r', encoding='utf-8') as json_file:
    dataset_big_descriptions = json.load(json_file)

In [5]:
# Show the loaded mapping of dataset name -> long description.
dataset_big_descriptions

{'titanic': 'The sinking of the Titanic is one of the most infamous shipwrecks in history.\n\nOn April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.\n\nWhile there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.\n\nIn this challenge, we ask you to build a predictive model that answers the question: “what sorts of people were more likely to survive?” using passenger data (ie name, age, gender, socio-economic class, etc).\n\nIn this competition, you’ll gain access to two similar datasets that include passenger information like name, age, gender, socio-economic class, etc. One dataset is titled train.csv and the other is titled test.csv.\n\nTrain.csv will contain the details of a subset of the passengers on board (

# Выбор модели

In [6]:
# Model size selector; index 0 picks "8b".
model_type = ("8b", "70b")[0]

# Endpoint that corresponds to the selected model size.
# NOTE(review): `url` is not passed to OllamaLLM below — confirm whether the
# endpoint selection is still needed.
url = (
    'http://10.32.15.21:6672/generate'
    if model_type == "70b"
    else 'http://10.32.2.2:8672/v1/chat/completions'
)

model = OllamaLLM(model='llama3')

# Уточнение данных о датасете

In [7]:
from fedot_llm.language_models import prompts

action = ModelAction(model=model)

# 1: Dataset name and identification of the training split
big_description = dataset_big_descriptions[dataset_name]
task_prompts = {
    "dataset_name": {
        "system": big_description,
        "task": prompts.dataset_name_prompt,
        "context": None,
    },
    "train_split": {
        "system": big_description,
        "task": prompts.train_split_definition_prompt,
        # NOTE(review): dataset.description was printed as None right after
        # loading — confirm this is the intended context for this prompt.
        "context": dataset.description,
    },
}

raw_responses = action.run_model_multicall(task_prompts)

# Keep only the text before the first dot of the split answer
# (e.g. "train.csv" -> "train").
operations = {
    "train_split": lambda answer: answer.partition(".")[0],
}
responses = action.process_model_responses(raw_responses, operations)
pprint(responses)

# Store the answers on the dataset.
dataset.name = responses["dataset_name"]
dataset.train_split_name = responses["train_split"]

{'dataset_name': 'Titanic Survival Prediction Challenge',
 'train_split': 'train'}


In [8]:
# Resolve the training split by the name stored in dataset.train_split_name.
# That name comes from a model response, so a mismatch is reported with the
# list of available split names instead of an opaque IndexError.
train = next(
    (split for split in dataset.splits if split.name == dataset.train_split_name),
    None,
)
if train is None:
    available_split_names = [split.name for split in dataset.splits]
    raise ValueError(
        f"Train split {dataset.train_split_name!r} not found; "
        f"available splits: {available_split_names}"
    )
train.data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [9]:
# Generate a description for every column of the train split, then attach the
# resulting {column name: description} mapping to the split.
column_descriptions = action.generate_all_column_description(split=train, dataset=dataset)
train.set_column_descriptions(column_descriptions)

In [10]:
# Show the generated {column name: description} mapping.
column_descriptions

{'PassengerId': 'Unique identifiers assigned to each passenger',
 'Survived': 'Binary indicator of whether a passenger survived (1) or not (0)',
 'Pclass': 'Passenger class (First, Second, or Third)',
 'Name': 'List of passenger names with titles and surnames.',
 'Sex': "Binary variable indicating the gender of a passenger, with 'male' and 'female' as possible values.",
 'Age': 'The age of passengers, ranging from 0.42 to 74 years old.',
 'SibSp': 'Number of siblings or spouses traveling with the passenger.',
 'Parch': 'Presence of parents or children on the ship',
 'Ticket': "Ticket numbers or alphanumeric identifiers that uniquely identify each passenger's ticket",
 'Fare': 'The fare paid by passengers, ranging from $0 to $211.5.',
 'Cabin': "Cabin numbers or combinations, with some missing values denoted as 'nan'",
 'Embarked': 'Passenger embarkation point, can be Southampton (S), Cherbourg (C) or Queenstown (Q), with some missing values'}

In [11]:
# 2: Goal of the whole task

# Both prompts share the same system text and context; only the task differs.
# NOTE(review): this cell reads dataset.description as context and overwrites
# it at the end, so re-running the cell changes its own input.
big_description = dataset_big_descriptions[dataset_name]
task_prompts = {
    response_key: {
        "system": big_description,
        "task": task_prompt,
        "context": dataset.description,
    }
    for response_key, task_prompt in (
        ("dataset_description", prompts.dataset_description_prompt),
        ("dataset_goal", prompts.dataset_goal_prompt),
    )
}

responses = action.run_model_multicall(task_prompts)
pprint(responses)

# Store the answers on the dataset.
dataset.description = responses["dataset_description"]
dataset.goal = responses["dataset_goal"]

{'dataset_description': 'The Titanic passenger data set contains information '
                        'about the 2,224 people who boarded the ill-fated RMS '
                        'Titanic during its maiden voyage in April 1912. The '
                        'dataset includes demographics such as age, gender, '
                        'and socio-economic class, as well as other features '
                        'like cabin location and travel companions. With 1,502 '
                        'passengers tragically lost at sea, this data set is a '
                        'unique opportunity to analyze and predict survival '
                        'rates based on various characteristics. By training '
                        'models using the provided train.csv data (which '
                        'includes ground truth information) and testing them '
                        'against the test.csv data (without labels), '
                        'competitors can gain insights into w

In [12]:
# 3: Target column and task type
# (categorical columns are resolved in the next cell)

task_prompts = {
    "target_column": {
        "system": dataset.description,
        "task": prompts.target_definition_prompt,
        "context": None,
    },
    "task_type": {
        "system": dataset.description,
        "task": prompts.task_definition_prompt,
        "context": None,
    }
}

# The copy of the model-selection code that used to sit here was removed:
# it re-assigned `model_type` and `url` to the values they already hold from
# the model-selection cell and nothing in this cell reads them.

responses = action.run_model_multicall(
    task_prompts
)

# Strip straight and typographic quotes/backticks from the answers; the task
# type is additionally lower-cased.
pattern = r'[\'\"“”‘’`´]'
operations = {
    "target_column" :  lambda x : re.sub(pattern, '', x),
    "task_type": lambda x : re.sub(pattern, '', x.lower())
}
responses = action.process_model_responses(responses, operations)
pprint(responses)

{'target_column': 'Survived', 'task_type': 'classification'}


In [13]:
# Determine which columns of the train split are treated as categorical
# and print the resulting list of column names.
categorical_columns =  action.get_categorical_features(split=train, dataset=dataset)
pprint(categorical_columns)

['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']


# Запуск фреймворка

In [14]:
# Show which split names sit at positions 3 and 0 of dataset.splits.
for split_index in (3, 0):
    print(dataset.splits[split_index].name)

train
test_merged


In [15]:
# Display the data of the split at position 3 of dataset.splits.
dataset.splits[3].data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# Resolve the splits by name rather than by position in dataset.splits, so the
# run does not depend on the order in which the splits were discovered.
splits_by_name = {split.name: split for split in dataset.splits}
train_split = splits_by_name[dataset.train_split_name]
test_split = splits_by_name["test_merged"]

# Run the framework with the task type and target column taken from the
# model responses.
prediction = run_example(
    train_df=train_split.data,
    test_df=test_split.data,
    problem=responses["task_type"],
    target=responses["target_column"],
)

In [None]:
# Preview the first five entries of the prediction.
prediction[:5]