# Fitting Description

## Mount Google Drive

In [1]:
# Mount Google Drive into the Colab filesystem; the project directory used
# below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install omegaconf

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 5.0 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 48.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 29.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 50.1 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

In [1]:
cd /content/drive/MyDrive/digital_breakthrough/task_3

/content/drive/MyDrive/digital_breakthrough/task_3


## Load Data

In [2]:
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
sys.path.append('.')
from definitions import ROOT_DIR

In [3]:
# Data locations, all derived from the project's ROOT_DIR.
DATA_PATH = ROOT_DIR / 'data'
# Directory listed below to find out which GUIDs have an image file.
TRAIN_IMAGES = DATA_PATH / 'images'
DOWNLOADED_TRAIN_IMAGES = DATA_PATH / 'downloaded_images'

In [4]:
# Load every competition table from DATA_PATH.
train = pd.read_csv(DATA_PATH / 'train.csv')
# Extra training rows; filtered below to the typologies that occur in `train`.
train_url_only = pd.read_csv(DATA_PATH / 'train_url_only.csv')
train_url_loaded_images = pd.read_csv(DATA_PATH / 'train_loaded_images.csv')
test = pd.read_csv(DATA_PATH / 'test.csv')
# Template that receives the predicted `typology` column at the end.
sample_submission = pd.read_csv(DATA_PATH / 'sample_submission.csv')

In [5]:
from os import listdir

# File names found in the image directory, and the GUID part of each name
# (everything before the first dot).
train_images = listdir(TRAIN_IMAGES)
guid_train_images = [file_name.partition('.')[0] for file_name in train_images]

In [6]:
# Split the test rows by which inputs are available: an image file on disk
# and/or a non-missing description.
has_image = test.guid.isin(guid_train_images)

dummy = test[has_image]
test_only_description = test[~has_image]

image_rows_without_text = dummy.description.isna()
test_only_images = dummy[image_rows_without_text]
test_images_and_description = dummy[~image_rows_without_text]

# Rows that have neither an image nor a description.
n_without_anything = int(test_only_description.description.isna().sum())

print('without image or description:', n_without_anything)
print('only description:', len(test_only_description))
print('only images:', len(test_only_images))
print('images and description:', len(test_images_and_description))

without image or description: 0
only description: 78
only images: 547
images and description: 598


## FitDesc

### Models

In [7]:
import tensorflow as tf
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Dense, GlobalAveragePooling1D, Input
from tensorflow.keras.models import Model
from transformers import TFDistilBertModel

In [20]:
import keras.backend as K
def f1_score(y_true, y_pred):
    """F1 score computed over one batch, usable as a Keras metric.

    Predictions and targets are clipped to [0, 1] and rounded, so a class
    counts as "predicted" only when its score rounds to 1. The counts are
    pooled over every element of the batch.

    Args:
        y_true: Tensor of target indicators.
        y_pred: Tensor of predicted scores, same shape as ``y_true``.

    Returns:
        Scalar tensor with the F1 score. It is 0 (not NaN) when the batch has
        no rounded positive prediction or no positive target.
    """
    # A small constant in every denominator avoids 0/0 -> NaN, which would
    # otherwise poison the running metric average for the whole epoch.
    eps = K.epsilon()

    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    # Share of predicted positives that are correct.
    precision = true_positives / (predicted_positives + eps)
    # Share of actual positives that were found.
    recall = true_positives / (actual_positives + eps)

    return 2 * (precision * recall) / (precision + recall + eps)

In [21]:
def distilbert_model(input_shape,
                     transformer_model,
                     output_shape=15,
                     output_activation='softmax',
                     optimizer='Adam',
                     optimizer_params=None,
                     loss='categorical_crossentropy',
                     metrics=None):
    """Build and compile a DistilBERT-based text classifier.

    The encoder is loaded from PyTorch weights, its per-token output is
    averaged over the sequence axis and fed to a single dense layer.

    Args:
        input_shape: Sequence length of the token id / attention mask inputs.
        transformer_model: Name or path passed to
            ``TFDistilBertModel.from_pretrained`` (loaded with ``from_pt=True``).
        output_shape: Number of units in the output layer.
        output_activation: Activation of the output layer.
        optimizer: Name of a class in ``tensorflow.keras.optimizers``.
        optimizer_params: Keyword arguments for the optimizer. Defaults to
            ``{'learning_rate': 1e-5}``. A legacy ``'lr'`` key is accepted and
            forwarded as ``'learning_rate'``.
        loss: Loss passed to ``model.compile``.
        metrics: Metrics passed to ``model.compile``.

    Returns:
        A compiled Keras ``Model`` taking ``[input_ids, input_mask]``.
    """
    # Build the optimizer kwargs per call: no shared mutable default, and the
    # caller's mapping is never modified.
    if optimizer_params is None:
        optimizer_params = {'learning_rate': 1e-5}
    else:
        optimizer_params = dict(optimizer_params)
        if 'lr' in optimizer_params:
            # `lr` is the deprecated spelling of `learning_rate`.
            legacy_lr = optimizer_params.pop('lr')
            optimizer_params.setdefault('learning_rate', legacy_lr)

    input_ids = Input((input_shape,), dtype=tf.int32)
    input_mask = Input((input_shape,), dtype=tf.int32)

    transformer_encoder = TFDistilBertModel.from_pretrained(
        transformer_model,
        from_pt=True,
        output_hidden_states=True
    )
    outputs = transformer_encoder.distilbert(input_ids,
                                             attention_mask=input_mask)

    # First element of the encoder output: the per-token hidden states.
    x = outputs[0]
    # Plain mean over the sequence axis; no mask is passed to the pooling
    # layer, so padded positions are part of the average.
    x = GlobalAveragePooling1D()(x)
    output = Dense(output_shape,
                   activation=output_activation)(x)

    model = Model(inputs=[input_ids, input_mask],
                  outputs=output)
    model.compile(loss=loss,
                  metrics=metrics,
                  optimizer=getattr(optimizers, optimizer)(**optimizer_params)
                  )

    return model

### Preprocess

In [9]:
"""
Preprocessing.
"""

import numpy as np
from tqdm import tqdm
from transformers import PreTrainedTokenizerFast


def preprocess(texts, tokenizer_path, max_len=32):
    """Tokenize texts into fixed-length id and attention-mask arrays.

    Args:
        texts: Iterable of strings to encode.
        tokenizer_path: Path to a tokenizer JSON file (str or path-like).
        max_len: Every sequence is truncated / padded to this many tokens.
            It has to match the input length the model was built with.

    Returns:
        ``[input_ids, attention_masks]``: two numpy arrays with one row per
        text and ``max_len`` columns.
    """
    input_ids, input_masks = [], []

    # str() lets callers pass a pathlib.Path as well as a plain string.
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_path))
    # The raw tokenizer file does not tell the wrapper which tokens are
    # special, so they are declared here.
    tokenizer.mask_token = '[MASK]'
    tokenizer.pad_token = "[PAD]"
    tokenizer.sep_token = "[SEP]"
    tokenizer.cls_token = "[CLS]"
    tokenizer.unk_token = "[UNK]"

    for text in tqdm(texts):
        # padding='max_length' replaces the deprecated pad_to_max_length=True.
        encoded = tokenizer.encode_plus(text,
                                        max_length=max_len,
                                        padding='max_length',
                                        truncation=True)
        input_ids.append(encoded['input_ids'])
        input_masks.append(encoded['attention_mask'])

    return [np.array(input_ids), np.array(input_masks)]


### Prepare Data

#### Get Full Clean Data

In [10]:
# One typology value carries a trailing space; map it onto the clean spelling
# so both variants count as the same class.
train_url_only['typology'] = train_url_only['typology'].replace(
    to_replace='предметы прикладного искусства, быта и этнографии ',
    value='предметы прикладного искусства, быта и этнографии',
)

In [11]:
# Distinct typologies in the image training set, mapped to integer labels in
# sorted order. Missing values are dropped first: a NaN among the strings
# would make sorted() raise, and the url-only mapping also excludes them.
train_labels = train.typology.dropna().unique()
typology_to_label = dict(zip(sorted(train_labels), range(len(train_labels))))

In [12]:
# Same label mapping, built from the url-only table (missing typologies skipped).
train_labels_url = train_url_only.typology.dropna().unique()
typology_to_label_url = {
    typology: label
    for label, typology in enumerate(sorted(train_labels_url))
}

In [13]:
# Keep only the url-only rows whose typology also occurs in the image training set.
known_typology = train_url_only.typology.isin(typology_to_label.keys())
train_url_only_train_labels = train_url_only[known_typology]
print(train_url_only_train_labels.shape[0])

197117


In [14]:
# Stack the image training rows on top of the filtered url-only rows and keep
# only rows that have both a typology and a description.
train['url'] = 1
full_train = (
    pd.concat([train, train_url_only_train_labels], axis=0)
    .dropna(subset=['typology', 'description'])
)
full_train.shape

(201852, 4)

#### Prepare labels

##### Save Item Names

In [None]:
# Unique descriptions, one per output line. Missing descriptions are dropped
# first because NaN + '\n' would raise a TypeError.
item_names = train.description.dropna().drop_duplicates()
item_names = item_names + '\n'

In [None]:
# Write the corpus used to train the tokenizers and the language model.
# The text is Cyrillic, so the encoding is fixed to UTF-8 instead of relying
# on the platform default.
with open('./data/item_name.txt', 'w', encoding='utf-8') as f:
    f.writelines(item_names.tolist())

##### Save Full Train

In [None]:
# Classifier training table: one row per distinct description, empty
# descriptions removed.
save_train = (
    full_train
    .drop_duplicates(subset='description')
    .loc[lambda frame: frame.description != '']
)
print(save_train.shape)

(157449, 4)


In [None]:
# Persist the cleaned table; the "Train All Data" section reads it back.
save_train.to_csv('./data/train_data.csv', index=False)

##### Prepare list of categories

In [None]:
# Sorted list of class names, stored as a Series named 'category'.
unique_typologies = save_train.typology.unique()
categories = pd.Series(sorted(unique_typologies), name='category')

In [None]:
# Persist the class names; the prediction section reads them back to label
# the probability columns.
categories.to_csv('./data/categories.csv', index=False)

### Train Tokenizers

In [15]:
import os
import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer

In [None]:
# Three tokenizer set-ups (WordPiece, BPE, Unigram). Each entry holds exactly
# the keyword arguments TokenizerFabrica expects; they differ only in the
# model, the trainer, the vocabulary size and the output file name.
_SPECIAL_TOKENS = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

tokenizers = {}

tokenizers[1] = dict(
    tokenizer=Tokenizer(WordPiece(unk_token="[UNK]")),
    pre_tokenizer=Sequence([Whitespace(), Digits()]),
    normalizer=Lowercase(),
    trainer=WordPieceTrainer(special_tokens=list(_SPECIAL_TOKENS),
                             vocab_size=70000),
    name='wordpiece_70k.json',
)

tokenizers[2] = dict(
    tokenizer=Tokenizer(BPE(unk_token="[UNK]")),
    pre_tokenizer=Sequence([Whitespace(), Digits()]),
    normalizer=Lowercase(),
    trainer=BpeTrainer(special_tokens=list(_SPECIAL_TOKENS),
                       vocab_size=60000),
    name='bpe_60k.json',
)

tokenizers[3] = dict(
    tokenizer=Tokenizer(Unigram()),
    pre_tokenizer=Sequence([Whitespace(), Digits()]),
    normalizer=Lowercase(),
    trainer=UnigramTrainer(special_tokens=list(_SPECIAL_TOKENS),
                           vocab_size=50000),
    name='unigram_50k.json',
)

In [None]:
class TokenizerFabrica(object):
    """Bundle a tokenizer with its trainer so it can be fitted and saved.

    Args:
        tokenizer: Tokenizer object; its ``pre_tokenizer`` and ``normalizer``
            attributes are overwritten with the values given here.
        pre_tokenizer: Pre-tokenizer to attach to ``tokenizer``.
        normalizer: Normalizer to attach to ``tokenizer``.
        trainer: Trainer passed to ``tokenizer.train_from_iterator``.
        name: File name used when the tokenizer is saved.
    """

    def __init__(self,
                 tokenizer,
                 pre_tokenizer,
                 normalizer,
                 trainer,
                 name):
        self.tokenizer = tokenizer
        self.tokenizer.pre_tokenizer = pre_tokenizer
        self.tokenizer.normalizer = normalizer
        self.trainer = trainer
        self.name = name
        # Flipped by fit(); save_model() refuses to run before that.
        self.fitted = False

    def fit(self, item_names):
        """Train the tokenizer on an iterable of texts."""
        self.tokenizer.train_from_iterator(item_names,
                                           self.trainer)
        self.fitted = True

    def save_model(self, output):
        """Save the fitted tokenizer as ``output / name``.

        Args:
            output: Target directory (str or path-like). It is created,
                including parents, when it does not exist yet.

        Returns:
            The path of the written file, as a string.

        Raises:
            ValueError: If ``fit`` has not been called.
        """
        from pathlib import Path

        # Check the state first so an unfitted tokenizer never touches the disk.
        if not self.fitted:
            raise ValueError('Fit tokenizer before saving')

        output_dir = Path(output)
        output_dir.mkdir(parents=True, exist_ok=True)
        path = str(output_dir / self.name)
        self.tokenizer.save(path)
        return path

In [None]:
# Read the corpus back, one description per line. The text is Cyrillic, so the
# encoding is fixed to UTF-8 instead of relying on the platform default.
with open('./data/item_name.txt', 'r', encoding='utf-8') as f:
    items = f.readlines()

In [None]:
# Output locations for trained artifacts; the tokenizer files are saved into
# TOKENIZERS_PATH by the training loop below.
MODELS_PATH = ROOT_DIR / 'models'
TOKENIZERS_PATH = MODELS_PATH / 'tokenizers'

In [None]:
# Fit every configured tokenizer on the item-name corpus and save it under
# TOKENIZERS_PATH with the file name from its config entry.
# NOTE(review): `config`, `token` and `path` stay in the notebook namespace
# after the loop; `config` is reassigned by later cells.
for config in tokenizers.values():
    token = TokenizerFabrica(**config)
    token.fit(items)
    path = token.save_model(TOKENIZERS_PATH)

### Train Language Model

In [16]:
import argparse
import os

from omegaconf import OmegaConf
from transformers import (
    DataCollatorForLanguageModeling, 
    DistilBertConfig,
    DistilBertForMaskedLM, 
    LineByLineTextDataset, 
    PreTrainedTokenizerFast, 
    Trainer, 
    TrainingArguments
    )

In [17]:
# Turn off the Weights & Biases integration of the Hugging Face Trainer.
# The training logs below report this environment variable as deprecated in
# favour of the `report_to` training argument.
os.environ['WANDB_DISABLED'] = 'true'

In [18]:
class LanguageModel(object):
    """Masked-language-model pre-training of a DistilBERT on item names.

    Everything needed for training is assembled in ``__init__`` from ``config``,
    which must provide: ``tokenizer_path``, ``vocab_size``, ``item_names_path``,
    ``mlm_probability``, ``output_path``, ``num_train_epochs``,
    ``learning_rate`` and ``batch_size``.
    """

    def __init__(self, config):
        self.tokenizer = PreTrainedTokenizerFast(
            tokenizer_file=config.tokenizer_path
            )
        # Declare the special tokens on the wrapper; they are set here rather
        # than read from the tokenizer file.
        self.tokenizer.mask_token = '[MASK]'
        self.tokenizer.pad_token = "[PAD]"
        self.tokenizer.sep_token = "[SEP]"
        self.tokenizer.cls_token = "[CLS]"
        self.tokenizer.unk_token = "[UNK]"
        # vocab_size comes from the config and must agree with the tokenizer
        # that tokenizer_path points to.
        self.distilbert_config = DistilBertConfig(
            vocab_size=config.vocab_size,
            n_heads=8,
            dim=512,
            hidden_dim=2048
            )
        # Fresh, randomly initialised model: this is training from scratch.
        self.model = DistilBertForMaskedLM(self.distilbert_config)
        # One training example per line of the text file.
        self.dataset = LineByLineTextDataset(
            tokenizer=self.tokenizer,
            file_path=config.item_names_path,
            block_size=64
            )
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=True,
            mlm_probability=config.mlm_probability
            )
        # save_steps is far above the step counts in the runs logged below, so
        # no intermediate checkpoint is written there; the final model is
        # stored explicitly by save_model().
        self.training_args = TrainingArguments(
            output_dir=config.output_path,
            overwrite_output_dir=True,
            num_train_epochs=config.num_train_epochs,
            learning_rate=config.learning_rate,
            per_device_train_batch_size=config.batch_size,
            save_steps=300000,
            save_total_limit=1
            )
        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            data_collator=self.data_collator,
            train_dataset=self.dataset,
            eval_dataset=None
            )
        # Flipped by fit(); save_model() refuses to run before that.
        self.fitted = False
        self.config = config

    def fit(self):
        """Run the training loop and mark the model as fitted."""
        self.trainer.train()
        self.fitted = True

    def save_model(self, name='final'):
        """Save the trained model to ``<config.output_path>/<name>``.

        Raises:
            ValueError: If ``fit`` has not been called.
        """
        if not self.fitted:
            # The old text said "tokenizer", copied from the tokenizer class.
            raise ValueError('Fit model before saving')
        self.trainer.save_model(
            os.path.join(self.config.output_path, name)
            )

In [None]:
# Pre-train one language model per tokenizer, driven by
# ./src/configs/train_lm1.yaml .. train_lm3.yaml. Each config is printed, the
# model is trained and then saved under '<output_path>/final'.
# NOTE(review): `config` and `lm` from the last iteration stay in the notebook
# namespace after the loop.
for i in range(1, 4):
    config = OmegaConf.load(f'./src/configs/train_lm{i}.yaml')
    print(OmegaConf.to_yaml(config))
    lm = LanguageModel(config)
    lm.fit()
    lm.save_model(name='final')

tokenizer_path: ./models/tokenizers/wordpiece_70k.json
vocab_size: 70000
mlm_probability: 0.3
output_path: ./models/lm_models/distilbert_lm_wordpiece_70k
num_train_epochs: 8
learning_rate: 5.0e-05
batch_size: 32
item_names_path: ./data/item_name.txt



Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 4402
  Num Epochs = 8
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1104


Step,Training Loss
500,7.5934
1000,6.7432




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./models/lm_models/distilbert_lm_wordpiece_70k/final
Configuration saved in ./models/lm_models/distilbert_lm_wordpiece_70k/final/config.json
Model weights saved in ./models/lm_models/distilbert_lm_wordpiece_70k/final/pytorch_model.bin


tokenizer_path: ./models/tokenizers/bpe_60k.json
vocab_size: 60000
mlm_probability: 0.2
output_path: ./models/lm_models/distilbert_lm_bpe_60k
num_train_epochs: 10
learning_rate: 5.0e-05
batch_size: 32
item_names_path: ./data/item_name.txt



Creating features from dataset file at ./data/item_name.txt
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 4402
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 1380


Step,Training Loss
500,7.6318
1000,6.785




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./models/lm_models/distilbert_lm_bpe_60k/final
Configuration saved in ./models/lm_models/distilbert_lm_bpe_60k/final/config.json
Model weights saved in ./models/lm_models/distilbert_lm_bpe_60k/final/pytorch_model.bin


tokenizer_path: ./models/tokenizers/unigram_50k.json
vocab_size: 50000
mlm_probability: 0.2
output_path: ./models/lm_models/distilbert_lm_unigram_50k
num_train_epochs: 7
learning_rate: 5.0e-05
batch_size: 32
item_names_path: ./data/item_name.txt



Creating features from dataset file at ./data/item_name.txt
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
***** Running training *****
  Num examples = 4402
  Num Epochs = 7
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 966


Step,Training Loss
500,6.8944




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./models/lm_models/distilbert_lm_unigram_50k/final
Configuration saved in ./models/lm_models/distilbert_lm_unigram_50k/final/config.json
Model weights saved in ./models/lm_models/distilbert_lm_unigram_50k/final/pytorch_model.bin


### Train All Data

In [42]:
import argparse
import random
import torch
import pandas as pd
import tensorflow as tf
from omegaconf import OmegaConf

In [34]:
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up
# front. Iterating over the device list makes this a no-op on a CPU-only
# runtime, where indexing the first GPU would raise an IndexError.
for gpu in tf.config.experimental.get_visible_devices('GPU'):
    tf.config.experimental.set_memory_growth(device=gpu, enable=True)

In [35]:
# Reload the cleaned training table written in the "Save Full Train" step.
data = pd.read_csv('./data/train_data.csv')

In [44]:
# Seed every random number generator that is in play.
seed = 42
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
# The classifier below is trained with TensorFlow/Keras, so seed it as well.
tf.random.set_seed(seed)

torch.backends.cudnn.deterministic = True
# cuDNN benchmarking auto-selects kernels and can pick different ones between
# runs, which works against the deterministic setting above.
torch.backends.cudnn.benchmark = False

# NOTE(review): TensorFlow has already queried the GPUs in an earlier cell, so
# this variable is probably read too late to affect it -- confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [57]:
# Load the classifier config explicitly. Without this the cell relies on a
# `config` left behind by another cell, which on a fresh top-to-bottom run is
# the last language-model config and has no `preprocess` section.
config = OmegaConf.load('./src/configs/train1.yaml')

# Token ids / attention masks for every description, and one-hot targets
# (one column per typology).
X = preprocess(data.description, **config.preprocess)
y = pd.get_dummies(data.typology)

100%|██████████| 157449/157449 [00:25<00:00, 6220.54it/s]


In [70]:
# Train the classifier described by ./src/configs/train<i>.yaml on the
# pre-tokenized data (X, y).
for i in range(1, 2):
    config = OmegaConf.load(f'./src/configs/train{i}.yaml')
    print(OmegaConf.to_yaml(config))
    model = distilbert_model(**config.model, metrics=[f1_score])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only adds a stray "None" line.
    model.summary()
    # NOTE(review): Keras takes validation_split from the end of the arrays
    # without shuffling first, and `data` is not shuffled here -- confirm that
    # the last 30% of rows is a representative validation set.
    model.fit(X, y, verbose=1, validation_split=0.3, **config.train)
    # NOTE(review): the target directory is hard-coded and ignores
    # config.output_path; with more than one iteration every model would be
    # written to the same place.
    model.save('./models/model/distilbert_wordpiece_70k_3')

output_path: ./models/model/distilbert_wordpiece_70k_2
preprocess:
  tokenizer_path: ./models/tokenizers/wordpiece_70k.json
  max_len: 40
model:
  input_shape: 40
  transformer_model: ./models/lm_models/distilbert_lm_wordpiece_70k/final/
train:
  epochs: 15
  batch_size: 64



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.
  "The `lr` argument is deprecated, use `learning

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_9 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
input_10 (InputLayer)           [(None, 40)]         0                                            
__________________________________________________________________________________________________
distilbert (TFDistilBertMainLay TFBaseModelOutput(la 55017472    input_9[0][0]                    
                                                                 input_10[0][0]                   
__________________________________________________________________________________________________
global_average_pooling1d_4 (Glo (None, 512)          0           distilbert[0][7]           























INFO:tensorflow:Assets written to: ./models/model/distilbert_wordpiece_70k_3/assets


INFO:tensorflow:Assets written to: ./models/model/distilbert_wordpiece_70k_3/assets


### Prediction

In [None]:
from tensorflow.keras.models import load_model

In [None]:
# Only test rows that have a description can be scored by the text model.
test_ = test[test.description.notna()]
# Class names in the order written to categories.csv.
categories = list(pd.read_csv('./data/categories.csv')['category'])

In [None]:
# Ensemble members: each saved classifier is paired by position with the
# tokenizer file it is used with.
tokenizers = [
    str(TOKENIZERS_PATH / 'wordpiece_70k.json'),
    str(TOKENIZERS_PATH / 'bpe_60k.json'),
]
models = [
    './models/model/distilbert_wordpiece_70k',
    './models/model/distilbert_bpe_60k',
]

In [None]:
# Collect the class probabilities of every ensemble member for the test
# descriptions.
probs = []
item_name = test_.description
for model, tokenizer in zip(models, tokenizers):
    # NOTE(review): preprocess() falls back to max_len=32 here, while the
    # training config printed above uses max_len=40 -- confirm that the saved
    # models expect sequences of length 32.
    # Named `encoded_inputs` so the builtin input() is not shadowed.
    encoded_inputs = preprocess(item_name, tokenizer_path=tokenizer)
    mdl = load_model(model)
    proba = mdl.predict(encoded_inputs, batch_size=256, verbose=True)
    probs.append(proba)

100%|██████████| 676/676 [00:00<00:00, 8638.06it/s]
Model config DistilBertConfig {
  "_name_or_path": "./models/lm_models/distilbert_lm_wordpiece_70k/final/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 512,
  "dropout": 0.1,
  "hidden_dim": 2048,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 8,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "torch_dtype": "float32",
  "transformers_version": "4.9.1",
  "vocab_size": 70000
}





100%|██████████| 676/676 [00:00<00:00, 7958.38it/s]
Model config DistilBertConfig {
  "_name_or_path": "./models/lm_models/distilbert_lm_bpe_60k/final/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 512,
  "dropout": 0.1,
  "hidden_dim": 2048,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 8,
  "n_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "torch_dtype": "float32",
  "transformers_version": "4.9.1",
  "vocab_size": 60000
}





In [None]:
# Sum the probabilities of all ensemble members element-wise. Summing over the
# list works for any number of models, not just two.
pb = np.sum(probs, axis=0)
# NOTE(review): assumes the model output columns follow the sorted category
# order stored in categories.csv -- confirm.
df_pb = pd.DataFrame(pb, columns=categories)

In [None]:
# Predicted class per row: the column name with the highest summed probability.
pred = df_pb.idxmax(axis='columns')

In [None]:
# Number of test rows without a description, i.e. rows the text model cannot score.
len(test) - len(test_)

547

In [None]:
# Start every row at the fallback class, then overwrite the rows that have a
# description with the model's prediction.
# NOTE(review): assumes sample_submission has the same row index as `test`,
# since the rows are addressed through test_.index -- confirm.
sample_submission['typology'] = 'прочие'
sample_submission.loc[test_.index, 'typology'] = pred.values

In [None]:
# NOTE(review): reads the submission file that is only written by the next
# cell, so this fails on a fresh run unless an earlier file already exists;
# `sam` is not used anywhere in the visible notebook.
sam = pd.read_csv('./sub/distilbert.csv')

In [None]:
# Write the final submission file (row index omitted).
sample_submission.to_csv('./sub/distilbert.csv', index=False)