# Setup

In [11]:
import os
import re

import numpy as np
import pandas as pd

from pprint import pprint

from zip import unzip_archive
from fedot_util import run_example
from llm_util import run_model_multicall, process_model_responses
from web_api import WebAssistant
from data import Dataset
import prompts

# Загрузка данных

In [2]:
dataset_name = [
    'titanic', 
    'credit-g'
][0]
dataset_path = os.sep.join(['datasets', dataset_name])

# zip_filename = f"{dataset_path}.zip"
# os.makedirs(dataset_path, exist_ok=True)
# unzip_archive(zip_filename, dataset_path)

In [3]:
dataset = Dataset.load_from_path(dataset_path)
dataset_description = dataset.get_description()
dataset_metadata_description = dataset.get_metadata_description()

print(dataset_description)
print()
print(dataset_metadata_description)

Assume we have a dataset.
The dataset contains the following splits:
 
The gender_submission split stored in file "gender_submission.csv" contains following columns: ['PassengerId', 'Survived']. It is described as None
The test split stored in file "test.csv" contains following columns: ['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']. It is described as None
The train split stored in file "train.csv" contains following columns: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']. It is described as None

name: None 
description: None 
goal: None 
train_split_name: None 
splits:

name: gender_submission 
path: datasets\titanic\gender_submission.csv 
description: None

name: test 
path: datasets\titanic\test.csv 
description: None

name: train 
path: datasets\titanic\train.csv 
description: None


In [4]:
import json
with open('datasets/big_descriptions.json', 'r') as json_file:
    dataset_big_descriptions = json.load(json_file)

In [5]:
dataset_big_descriptions

{'titanic': 'The sinking of the Titanic is one of the most infamous shipwrecks in history.\n\nOn April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.\n\nWhile there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.\n\nIn this challenge, we ask you to build a predictive model that answers the question: “what sorts of people were more likely to survive?” using passenger data (ie name, age, gender, socio-economic class, etc).\n\nIn this competition, you’ll gain access to two similar datasets that include passenger information like name, age, gender, socio-economic class, etc. One dataset is titled train.csv and the other is titled test.csv.\n\nTrain.csv will contain the details of a subset of the passengers on board (

# Выбор модели

In [6]:
model_type = ["8b", "70b"][0]
url = 'http://10.32.2.2:8672/v1/chat/completions'

if model_type == "70b":
    url = 'http://10.32.15.21:6672/generate'

model = WebAssistant(url, model_type)

# Уточнение данных о датасете

In [7]:
# 1: Название датасета и определение тренировочного сплита
task_prompts = {
    "dataset_name": {
        "system": dataset_big_descriptions[dataset_name],
        "task": prompts.dataset_name_prompt,
        "context": None,
    },
    "train_split": {
        "system": dataset_big_descriptions[dataset_name],
        "task": prompts.train_split_definition_prompt,
        "context": dataset.get_description(),
    }
}

responses = run_model_multicall(
    model, task_prompts
)
operations = {
    "train_split": lambda x : x.split(".")[0]
}
responses = process_model_responses(responses, operations)
pprint(responses)

dataset.name = responses["dataset_name"]
dataset.train_split_name = responses["train_split"] 

{'dataset_name': 'Titanic Survival Dataset', 'train_split': 'train'}


In [8]:
# 2: Цель всей задачи

task_prompts = {
    "dataset_description": {
        "system": dataset_big_descriptions[dataset_name],
        "task": prompts.dataset_description_prompt,
        "context": dataset.get_description(),
    },
    "dataset_goal": {
        "system": dataset_big_descriptions[dataset_name],
        "task": prompts.dataset_goal_prompt,
        "context": dataset.get_description(),
    },
}

responses = run_model_multicall(
    model, task_prompts
)
pprint(responses)

dataset.description = responses["dataset_description"]
dataset.goal = responses["dataset_goal"]

{'dataset_description': 'Here is a short description of the Titanic Survival '
                        'Dataset:\n'
                        '\n'
                        'The Titanic Survival Dataset contains passenger '
                        "information from the RMS Titanic's ill-fated maiden "
                        'voyage in 1912. The dataset consists of three splits: '
                        'gender_submission, test, and train. The '
                        'gender_submission split contains only two columns: '
                        'PassengerId and Survived. The test and train splits '
                        'contain additional columns such as Pclass (social '
                        'class), Name, Sex, Age, SibSp (number of '
                        'siblings/spouses on board), Parch (number of '
                        'parents/children on board), Ticket, Fare, Cabin, and '
                        'Embarked (port of embarkation). The goal is to use '
                     

In [21]:
# 2: Категориальные столбцы, таргет-столбец, тип задачи

task_prompts = {
    "categorical_columns": {
        "system": dataset_description,
        "task": prompts.categorical_definition_prompt,
        "context": prompts.categorical_definition_context,
    },
    "target_column": {
        "system": dataset_description,
        "task": prompts.target_definition_prompt,
        "context": None,
    },
    "task_type": {
        "system": dataset_description,
        "task": prompts.task_definition_prompt,
        "context": None,
    }
}

#Выбор модели

model_type = ["8b", "70b"][0]
url = 'http://10.32.2.2:8672/v1/chat/completions'

if model_type == "70b":
    url = 'http://10.32.15.21:6672/generate'

model = WebAssistant(url, model_type)
responses = run_model_multicall(
    model, task_prompts
)

pattern = r'[\'\"“”‘’`´]'
operations = {
    "categorical_columns": lambda x : x.split("\n"),
    "task_type": lambda x : re.sub(pattern, '', x.lower())
}
responses = process_model_responses(responses, operations)
pprint(responses)

{'categorical_columns': ['Pclass', 'Sex', 'Embarked'],
 'target_column': "'Survived'",
 'task_type': 'classification'}


# Запуск фреймворка

In [4]:

if dataset_path == 'titanic':
    test_df = dataset_metadata["splits"]["test_X"].merge(dataset_metadata["splits"]["test_y"],
                                                         on='PassengerId', how='inner')
else:
    test_df = dataset_metadata["splits"]["test"]

train_df = dataset_metadata["splits"]["train"]

prediction = run_example(train_df = train_df, test_df = test_df,
                          dataset_metadata = dataset_metadata)

Generations:   0%|          | 0/10000 [02:16<?, ?gen/s]


{'roc_auc': 0.941, 'accuracy': 0.828}


In [5]:
prediction[:5]

array([[0],
       [0],
       [0],
       [0],
       [1]], dtype=int64)

In [None]:
result_df = pd.DataFrame(prediction, columns=[dataset_metadata["target_column"]])

result_df.to_csv(f"{dataset_path}/predictions.csv")