In [1]:
import pandas as pd
import pathlib, os
import torch
from clai.tooling import tool
from clai.modeling.module.model import model
from clai.tooling import io
from clai.processing import Processor

In [2]:
label_list = [
        '8800 СберМегаМаркет. Купить',
        '8800 СберМегаМаркет. Статус доставки и заказа',
        '8800 СберМегаМаркет. Другие вопросы',
        'Соединить с оператором',
        '8800 СберМегаМаркет. Не пришел заказ',
        '8800 СберМегаМаркет. Разводящий вопрос',
        'Да',
        'Прочее',
        '8800 СберМегаМаркет. Качество, комплектация, состав заказа',
        '8800 СберМегаМаркет. Программа лояльности',
        '8800 СберМегаМаркет. Отменить заказ',
        '8800 СберМегаМаркет. Узнать условия',
        '8800 СберМегаМаркет. Обращение',
        '8800 СберМегаМаркет. Изменить заказ',
        '8800 СберМегаМаркет. Изменить личные данные',
        '8800 СберМегаМаркет. Оплата',
        '8800 СберМегаМаркет. Изменить доставку',
        '8800 СберМегаМаркет. Заказ отменили',
        '8800 СберМегаМаркет. Вернуть деньги',
        '8800 СберМегаМаркет. Вернуть товар',
        'Нет',
        '8800 СберМегаМаркет. Получение заказа',
        '8800 СберМегаМаркет. Юридические лица',
        '8800 СберМегаМаркет. Промокод'
    ]

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Devices available: {}".format(device))

Devices available: cpu


In [4]:
lm = model.LanguageModel.load("coi")

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
tokenizer = io.load_tokenizer(pretrained_model_name_or_path="cointegrated/rubert-tiny")

In [6]:
tokenizer

PreTrainedTokenizerFast(name_or_path='cointegrated/rubert-tiny', vocab_size=29564, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [7]:
data_dir = pathlib.Path.home() / "Dataset" / "sber"
max_seq_len = 128
text_column_name = "input_text"
label_column_name = "topic"
metric = "f1_macro"
train_filename = "train.csv"
test_filename = "test.csv"

In [8]:
processor = Processor.load(
    "klass",
    max_seq_len=max_seq_len,
    tokenizer=tokenizer, 
    data_dir=data_dir, 
    train_filename=train_filename,
    test_filename=test_filename,
    dev_split=0.0,
    label_list=label_list,
    text_column_name=text_column_name, 
    label_column_name=label_column_name,
    metric=metric
)

In [9]:
from clai.flowing import flow

In [10]:
data_silo = flow.Flow.load("haski", processor=processor, batch_size=1)

06/24/2022 15:33:19 - INFO - clai.flowing.haski.flow -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
06/24/2022 15:33:19 - INFO - clai.flowing.haski.flow -   LOADING TRAIN DATA
06/24/2022 15:33:19 - INFO - clai.flowing.haski.flow -   Loading train set from: /Users/justatom/Dataset/sber/train.csv 
06/24/2022 15:33:19 - INFO - clai.flowing.base.flow -   Got ya 9 parallel workers to convert 19334 dictionaries to pytorch datasets (chunksize = 430)...


Preprocessing Dataset /Users/justatom/Dataset/sber/train.csv:   0%|          | 0/19334 [00:00<?, ? Dicts/s]

06/24/2022 15:33:23 - INFO - clai.processing.base.processor -   *** Show 1 random examples ***
06/24/2022 15:33:23 - INFO - clai.processing.base.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: None
Clear Text: 
 	text_classification_label: 8800 СберМегаМаркет. Не пришел заказ
 	text: я жду заказ у меня до сих пор нет заказа
Tokenized: 
 	None
Features: 
 	input_ids: [2, 343, 318, 3914, 650, 13361, 331, 17717, 745, 6249, 13276, 10030, 650, 13361, 603, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

06/24/2022 15:33:24 - INFO - clai.flowing.haski.flow -   
06/24/2022 15:33:24 - INFO - clai.flowing.haski.flow -   LOADING DEV DATA
06/24/2022 15:33:24 - INFO - clai.flowing.haski.flow -   No dev set is being loaded
06/24/2022 15:33:24 - INFO - clai.flowing.haski.flow -   
06/24/2022 15:33:24 - INFO - clai.flowing.haski.flow -   LOADING TEST DATA
06/24/2022 15:33:24 - INFO - clai.flowing.haski.flow -   Loading test set from: /Users/justatom/Dataset/sber/test.csv
06/24/2022 15:33:24 - INFO - clai.flowing.base.flow -   Got ya 9 parallel workers to convert 4784 dictionaries to pytorch datasets (chunksize = 107)...


Preprocessing Dataset /Users/justatom/Dataset/sber/test.csv:   0%|          | 0/4784 [00:00<?, ? Dicts/s]

06/24/2022 15:33:28 - INFO - clai.processing.base.processor -   *** Show 1 random examples ***
06/24/2022 15:33:28 - INFO - clai.processing.base.processor -   

      .--.        _____                       _      
    .'_\/_'.     / ____|                     | |     
    '. /\ .'    | (___   __ _ _ __ ___  _ __ | | ___ 
      "||"       \___ \ / _` | '_ ` _ \| '_ \| |/ _ \ 
       || /\     ____) | (_| | | | | | | |_) | |  __/
    /\ ||//\)   |_____/ \__,_|_| |_| |_| .__/|_|\___|
   (/\||/                             |_|           
______\||/___________________________________________                     

ID: None
Clear Text: 
 	text_classification_label: Соединить с оператором
 	text: да соедини специалистом
Tokenized: 
 	None
Features: 
 	input_ids: [2, 791, 1154, 24274, 958, 329, 26897, 761, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


06/24/2022 15:33:29 - INFO - clai.flowing.base.flow -   
06/24/2022 15:33:29 - INFO - clai.flowing.base.flow -   DATASETS SUMMARY
06/24/2022 15:33:29 - INFO - clai.flowing.base.flow -   Examples in train: 19334
06/24/2022 15:33:29 - INFO - clai.flowing.base.flow -   Examples in dev  : 0
06/24/2022 15:33:29 - INFO - clai.flowing.base.flow -   Examples in test : 4784
06/24/2022 15:33:29 - INFO - clai.flowing.base.flow -   
06/24/2022 15:33:29 - INFO - clai.flowing.base.flow -   Longest sequence length observed after clipping:     74
06/24/2022 15:33:29 - INFO - clai.flowing.base.flow -   Average sequence length after clipping: 11.315247750077583
06/24/2022 15:33:29 - INFO - clai.flowing.base.flow -   Proportion clipped:      0.0


In [11]:
from clai.modeling.module.head import PredictionHead

In [12]:
prediction_head = PredictionHead.load("cls", num_labels=len(label_list))

AttributeError: type object 'PredictionHead' has no attribute 'load'