# Setup

In [26]:
import json
import os

import pandas as pd
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

from dataset_util import load_dataset_data, print_dataset_data, zip_archive
from llm_util import process_model_responses, run_model_multicall, save_model_responses

# Загружаем модель

In [5]:
# Load the tokenizer and the causal LM selected by `base_model_id`.
#
# SECURITY: the Hugging Face access token must never be committed with the notebook.
# It is read from the HF_TOKEN environment variable instead; when the variable is
# unset, `token=None` lets the hub client fall back to locally stored credentials.
# The token that was previously hardcoded here has been exposed and should be revoked.
access_token = os.environ.get("HF_TOKEN")

# Download cache used for the Command R+ checkpoint.
hf_cache_dir = "/mnt/disk2/.cache/huggingface/hub"

# Change the index to switch between the supported checkpoints.
base_model_id = [
    "NousResearch/Meta-Llama-3-8B-Instruct",
    "CohereForAI/c4ai-command-r-plus-4bit"
][1]


if base_model_id == "NousResearch/Meta-Llama-3-8B-Instruct":
    # Quantize the full-precision checkpoint to 4-bit NF4 at load time.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    # The tokenizer is given the EOS token as its padding token.
    tokenizer.pad_token = tokenizer.eos_token
    # `low_cpu_mem_usage` is a `from_pretrained` argument, not a quantization
    # setting, so it is passed here rather than to BitsAndBytesConfig.
    model = AutoModelForCausalLM.from_pretrained(base_model_id,
                                                 quantization_config=bnb_config,
                                                 low_cpu_mem_usage=True,
                                                 device_map="auto")

elif base_model_id == "CohereForAI/c4ai-command-r-plus-4bit":
    # This checkpoint is already stored in 4-bit form, so no quantization config is passed.
    tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=access_token, cache_dir=hf_cache_dir)
    model = AutoModelForCausalLM.from_pretrained(base_model_id, token=access_token, cache_dir=hf_cache_dir)

else:
    # Fail loudly instead of leaving `tokenizer` / `model` undefined for later cells.
    raise ValueError(f"Unsupported base_model_id: {base_model_id!r}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

In [25]:
# Shell escape: print the NVIDIA driver's view of the GPU (memory usage, utilisation).
!nvidia-smi

Thu Jul  4 16:45:42 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:00:06.0 Off |                    0 |
| N/A   43C    P0              64W / 300W |  64790MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


# Метаданные датасетов

## Titanic

In [None]:
# Metadata record for the Titanic dataset: file name and human-readable description
# of every split, plus the task description. Written as JSON into the dataset folder.
titanic_split_paths = {
    "train": 'train.csv',
    "test_X": 'test.csv',
    "test_y": 'gender_submission.csv',
}
titanic_split_descriptions = {
    "train": 'train dataset',
    "test_X": 'test dataset without target',
    "test_y": 'test dataset target',
}

dataset_metadata = dict(
    name="titanic passengers survival",
    description="passengers survived the Titanic shipwreck",
    goal="predict which passengers survived the Titanic shipwreck",
    # Split names follow the insertion order of the paths mapping.
    split_names=list(titanic_split_paths),
    split_paths=titanic_split_paths,
    split_descriptions=titanic_split_descriptions,
    train_split_name="train",
)

with open('titanic/metadata.json', 'w') as metadata_file:
    json.dump(dataset_metadata, metadata_file)

## Credit-G

In [27]:
# Metadata record for the German Credit (credit-g) dataset: task description and the
# file name / description of each split. Written as JSON into the dataset folder.
credit_g_splits = [
    # (split name, file name, description)
    ("train", 'train.csv', 'train dataset'),
    ("test", 'test.csv', 'test dataset'),
]

dataset_metadata = {
    "name": "German Credit dataset",
    "description": "classifies people described by a set of attributes as good or bad credit risks",
    "goal": "predict good or bad credit risks of people by attributes",
    "split_names": [split_name for split_name, _, _ in credit_g_splits],
    "split_paths": {split_name: file_name for split_name, file_name, _ in credit_g_splits},
    "split_descriptions": {split_name: text for split_name, _, text in credit_g_splits},
    "train_split_name": "train",
}

with open('credit-g/metadata.json', 'w') as metadata_file:
    json.dump(dataset_metadata, metadata_file)

# Загружаем данные и вызываем модель

In [46]:
# Select the dataset directory to process (index 1 picks 'credit-g'), load its
# metadata through the project helper and print the result for inspection.
dataset_choices = ('titanic', 'credit-g')
dataset_path = dataset_choices[1]

dataset_metadata = load_dataset_data(dataset_path)
print_dataset_data(dataset_metadata)

name : German Credit dataset
description : classifies people described by a set of attributes as good or bad credit risks
goal : predict good or bad credit risks of people by attributes
split_names : ['train', 'test']
split_paths : {'train': 'train.csv', 'test': 'test.csv'}
split_descriptions : {'train': 'train dataset', 'test': 'test dataset'}
train_split_name : train
splits : dict_keys(['train', 'test'])


In [48]:
# Stop-token candidates: the tokenizer's EOS id plus the id of the "<|eot_id|>" token.
stop_token_ids = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# For the Command R+ checkpoint only the plain EOS id is used as terminator;
# otherwise both ids are passed.
uses_command_r = base_model_id == "CohereForAI/c4ai-command-r-plus-4bit"
terminators = stop_token_ids[0] if uses_command_r else stop_token_ids

# Sampling settings forwarded to the project helper that calls the model.
generation_config = dict(
    eos_token_id=terminators,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.8,
    top_p=0.8,
    top_k=10,
)

# Query the model about the dataset, then post-process the raw answers.
raw_responses = run_model_multicall(
    model=model,
    tokenizer=tokenizer,
    dataset_metadata=dataset_metadata,
    generation_config=generation_config,
)

responses = process_model_responses(raw_responses)
print(responses)

{'categorical_columns': ['checking_status', 'credit_history', 'purpose', 'savings_status', 'employment', 'personal_status', 'other_parties', 'property_magnitude', 'other_payment_plans', 'housing', 'job', 'own_telephone'], 'target_column': 'class', 'task_type': 'classification'}


In [None]:
# Hand the processed responses and the dataset directory name to the project helper
# for saving (presumably written alongside the dataset files; confirm in llm_util).
save_model_responses(responses, dataset_path)

In [49]:
# Pack the dataset directory into "<dataset_path>.zip" using the project helper.
archive_name = f"{dataset_path}.zip"
zip_archive(dataset_path, archive_name)

credit-g.zip created successfully.
