# Setup

In [3]:
import json
import os
import sys

import pandas as pd
# Put the parent of the current working directory on the import path
# (presumably so the `fedot_llm` imports that follow resolve without installing
# the package). The membership check keeps re-runs of this cell from adding
# duplicate entries.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from fedot_llm.language_models.llms import HuggingFaceLLM
from fedot_llm.language_models.actions import ModelAction
from fedot_llm.data.data import Dataset

# Загрузка данных

In [None]:
# Choose the dataset to work with by changing the index below.
dataset_candidates = ('titanic', 'credit-g')
dataset_name = dataset_candidates[0]

# The dataset directory is ../datasets/<dataset_name>, relative to the current
# working directory.
dataset_path = os.path.join(os.pardir, 'datasets', dataset_name)
dataset = Dataset.load_from_path(dataset_path)

dataset_description = dataset.get_description()
dataset_metadata_description = dataset.get_metadata_description()

# Print both descriptions with one blank line between them.
print(dataset_description, dataset_metadata_description, sep='\n\n')

In [None]:
# Load the extended dataset descriptions from the shared JSON file.
# `json` is already imported in the notebook's first cell, so it is not
# re-imported here. The encoding is given explicitly because JSON text is UTF-8
# and the platform default encoding (e.g. on Windows) may differ.
# NOTE(review): the structure of this JSON (presumably dataset name -> text) is
# not visible from this cell; confirm against the file.
with open('../datasets/big_descriptions.json', 'r', encoding='utf-8') as json_file:
    dataset_big_descriptions = json.load(json_file)
dataset_big_descriptions

In [5]:
# Hugging Face model id of the LLM used in this notebook.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# Instantiate the project's HuggingFaceLLM wrapper for that model id.
model = HuggingFaceLLM(model_id=MODEL_ID)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

# Метаданные датасетов

Titanic

In [None]:
# Titanic splits: split name -> (CSV file name, human-readable description).
# Insertion order matters: it fixes the order of "split_names" and of the keys
# in "split_paths" / "split_descriptions" in the written JSON.
titanic_splits = {
    "train": ("train.csv", "train dataset"),
    "test_X": ("test.csv", "test dataset without target"),
    "test_y": ("gender_submission.csv", "test dataset target"),
}

dataset_metadata = dict(
    name="titanic passengers survival",
    description="passengers survived the Titanic shipwreck",
    goal="predict which passengers survived the Titanic shipwreck",
    split_names=list(titanic_splits),
    split_paths={split: csv_file for split, (csv_file, _) in titanic_splits.items()},
    split_descriptions={split: text for split, (_, text) in titanic_splits.items()},
    train_split_name="train",
)

# Write the metadata as JSON to titanic/metadata.json; the path is relative to
# the current working directory and the `titanic` directory must already exist.
with open('titanic/metadata.json', 'w') as metadata_file:
    metadata_file.write(json.dumps(dataset_metadata))

Credit-G

In [27]:
# Credit-G splits: split name -> (CSV file name, human-readable description).
# Insertion order matters: it fixes the order of "split_names" and of the keys
# in "split_paths" / "split_descriptions" in the written JSON.
credit_g_splits = {
    "train": ("train.csv", "train dataset"),
    "test": ("test.csv", "test dataset"),
}

dataset_metadata = dict(
    name="German Credit dataset",
    description="classifies people described by a set of attributes as good or bad credit risks",
    goal="predict good or bad credit risks of people by attributes",
    split_names=list(credit_g_splits),
    split_paths={split: csv_file for split, (csv_file, _) in credit_g_splits.items()},
    split_descriptions={split: text for split, (_, text) in credit_g_splits.items()},
    train_split_name="train",
)

# Write the metadata as JSON to credit-g/metadata.json; the path is relative to
# the current working directory and the `credit-g` directory must already exist.
with open('credit-g/metadata.json', 'w') as metadata_file:
    metadata_file.write(json.dumps(dataset_metadata))

# Загружаем данные и вызываем модель

In [46]:
# Choose which dataset directory to process; index 1 selects 'credit-g'.
dataset_choices = ('titanic', 'credit-g')
dataset_path = dataset_choices[1]

# NOTE(review): `load_dataset_data` and `print_dataset_data` are neither defined
# nor imported in the visible part of this notebook - confirm where they come
# from, otherwise this cell fails on a fresh kernel.
# This also rebinds `dataset_metadata` to whatever the loader returns.
dataset_metadata = load_dataset_data(dataset_path)
print_dataset_data(dataset_metadata)

name : German Credit dataset
description : classifies people described by a set of attributes as good or bad credit risks
goal : predict good or bad credit risks of people by attributes
split_names : ['train', 'test']
split_paths : {'train': 'train.csv', 'test': 'test.csv'}
split_descriptions : {'train': 'train dataset', 'test': 'test dataset'}
train_split_name : train
splits : dict_keys(['train', 'test'])


In [48]:
# NOTE(review): `tokenizer`, `base_model_id`, `run_model_multicall` and
# `process_model_responses` are not defined in the visible part of this
# notebook - confirm they exist before running on a fresh kernel.

# Token ids at which generation should stop: the tokenizer's EOS token and the
# "<|eot_id|>" token.
# NOTE(review): "<|eot_id|>" may not exist in every tokenizer's vocabulary;
# verify what `convert_tokens_to_ids` returns for the model actually in use.
eos_id = tokenizer.eos_token_id
eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")

# For the Cohere Command-R+ 4-bit model only the plain EOS id is passed;
# every other model gets the [EOS, EOT] list.
uses_single_terminator = base_model_id == "CohereForAI/c4ai-command-r-plus-4bit"
terminators = eos_id if uses_single_terminator else [eos_id, eot_id]

# Sampling settings passed to the model for each call.
# NOTE(review): sampling is enabled and no random seed is set in this cell, so
# outputs may differ between runs.
generation_config = dict(
    eos_token_id=terminators,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.8,
    top_p=0.8,
    top_k=10,
)

# Query the model, then post-process the raw answers before printing them.
raw_responses = run_model_multicall(
    model=model,
    tokenizer=tokenizer,
    dataset_metadata=dataset_metadata,
    generation_config=generation_config,
)
responses = process_model_responses(raw_responses)
print(responses)

{'categorical_columns': ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone'], 'target_column': 'class', 'task_type': 'classification'}


In [None]:
# Hand the processed responses and the dataset path to the save helper
# (presumably it stores them alongside the dataset - confirm).
# NOTE(review): `save_model_responses` is not defined or imported in the visible
# part of this notebook - confirm where it comes from.
save_model_responses(responses, dataset_path)

In [49]:
# Zip the dataset directory into "<dataset_path>.zip".
# NOTE(review): `zip_archive` is not defined or imported in the visible part of
# this notebook - confirm where it comes from.
archive_name = f"{dataset_path}.zip"
zip_archive(dataset_path, archive_name)

credit-g.zip created successfully.
