In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install datasets seqeval --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [3]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [4]:
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
from seqeval.metrics import classification_report

In [36]:
reviews = {}
with open('drive/MyDrive/colab/aspect_project/full_train/train_reviews.txt', encoding='utf-8') as f:
  for line in f:
    line = line.rstrip('\r\n').split('\t')
    reviews[line[0]] = line[1]

In [37]:
aspects = defaultdict(list)
with open('drive/MyDrive/colab/aspect_project/full_train/train_aspects.txt', encoding='utf-8') as f:
  for line in f:
    line = line.rstrip('\r\n').split('\t')
    keys = ('category', 'mention', 'start', 'end', 'sentiment')
    aspects[line[0]].append(dict(zip(keys, line[1:])))

In [38]:
tags = ['O']
categories = ['Whole', 'Service', 'Food', 'Interior', 'Price']
for category in categories:
    tags.append('B-'+category)
    tags.append('I-'+category)
tag2id = {tag: ind for ind, tag in enumerate(tags)}
id2tag = {ind: tag for ind, tag in enumerate(tags)}

In [8]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/mdeberta-v3-base', use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



In [47]:
def tokenize_and_align_labels(reviews, aspects):
    all_input_ids = []
    all_labels = []
    for text_id, text in tqdm(reviews.items()):
        labels = []
        tokenized = tokenizer(text, return_offsets_mapping=True)
        tokens = tokenized.tokens()
        offsets = tokenized.offset_mapping
        aspects_num = 0
        for i in range(len(tokens)):
            if aspects_num >= len(aspects[text_id]):
                break
            shift = 1 if tokens[i].startswith('▁') and tokens[i] != '▁' else 0
            aspect = aspects[text_id][aspects_num]
            if offsets[i][0] + shift < int(aspect['end']) and offsets[i][0] + shift > int(aspect['start']):
                if offsets[i][0] + shift == int(aspect['start']):
                    labels.append(tag2id['B-' + aspect['category']])
                else:
                    labels.append(tag2id['I-' + aspect['category']])
                aspects_num += 1
            else:
                labels.append(tag2id['O'])
        all_input_ids.append(tokenized['input_ids'])
        all_labels.append(labels)
    return all_input_ids, all_labels

In [48]:
all_tokens, all_labels = tokenize_and_align_labels(reviews, aspects)

100%|██████████| 284/284 [00:00<00:00, 603.45it/s]


In [49]:
tokens = tokenizer.convert_ids_to_tokens(all_tokens[0])
labels = all_labels[0]
for token, label in zip(tokens, labels):
    print(token, id2tag[label])

[CLS] O
▁Де O
нь O
▁8- O
го O
▁ O
марта O
▁прош O
ёл O
, O
▁ O
можно O
▁ O
и O
▁ O
итог O
и O
▁под O
вести O
. O
▁Ре O
шил O
▁на O
писать O
▁отзыв O
▁ O
о O
▁ресторан O
е I-Whole
▁в O
▁котор O
ом O
▁отме O
тили O
▁прекрасн O
ый O
▁вес O
ений O
▁праздник O
, O
▁прочита O
л O
▁отзыв O
ы O
▁ O
edik O
077 O
▁ O
и O
▁ O
Rules O
77777 O
и O
▁по O
нял O
▁что O
▁ O
либо O
▁мы O
▁был O
и O
▁вра O
зных O
▁ресторан O
ах I-Whole
, O
▁ O
либо O
▁у O
▁реб O
ят O
▁что O
- O
то O
▁неза O
лади O
лось O
. O
▁Но O
▁ O
т O
еперь O
▁ O
о O
▁ресторан O
е I-Whole
. O
▁Сто O
лик I-Service
▁брон O
ировали O
▁зара O
нее O
▁ O
и O
▁сдела O
ли O
▁так O
▁как O
▁предлож O
ил O
▁ O
администратор O
▁ O
т O
. O
е O
. O
▁сдела O
ли O
▁пред O
вар O
ительный O
▁заказ O
, O
▁ O
когда O
▁прид O
я O
▁уви O
дели O
▁пол O
ностью O
▁за O
полне O
ный O
▁ресторан O
▁по O
няли O
▁что O
▁совет O
▁нам O
▁ O
дали O
▁действ O
ительно O
▁правильн O
ый O
, O
▁в O
▁ресторан O
е O
▁ O
было O
▁человек O
▁70 O
-80 O
, O
▁тут O
▁действ O
ит

In [11]:
train_size = round(0.9 * len(all_tokens))

In [12]:
df = pd.DataFrame({'input_ids': all_tokens, 'labels': all_labels})

In [13]:
train_df = df[:train_size]
val_df = df[train_size:]

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [25]:
model = AutoModelForTokenClassification.from_pretrained("microsoft/mdeberta-v3-base", num_labels=len(tags))

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding='longest')

In [28]:
def count_metrics(p):
    preds, labels = p
    preds_flat = np.argmax(preds, axis=-1).flatten()
    labels_flat = labels.flatten()
    indices = np.where(labels_flat != -100)
    preds_flat = preds_flat[indices]
    labels_flat = labels_flat[indices]

    accuracy = accuracy_score(preds_flat, labels_flat)
    f1_mic = f1_score(preds_flat, labels_flat, average='micro')
    f1_mac = f1_score(preds_flat, labels_flat, average='macro')

    return {'accuracy': accuracy, 'f1_micro': f1_mic, 'f1_macro': f1_mac}

In [29]:
training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs = 7,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    weight_decay = 0.00,
    learning_rate = 5e-5,
    warmup_steps = 0,
    evaluation_strategy ='epoch',
    save_strategy = 'no',
    report_to=None)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [30]:
trainer = Trainer(model=model,
                  tokenizer = tokenizer,
                  args = training_args,
                  train_dataset = train_dataset,
                  eval_dataset = val_dataset,
                  data_collator=data_collator,
                  compute_metrics = count_metrics)

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,No log,0.294671,0.923149,0.923149,0.57288
2,No log,0.210637,0.936095,0.936095,0.719564
3,No log,0.209533,0.942921,0.942921,0.762264
4,No log,0.247273,0.938096,0.938096,0.757934
5,No log,0.244774,0.939861,0.939861,0.756751
6,No log,0.283264,0.938096,0.938096,0.741959
7,No log,0.271802,0.939626,0.939626,0.752599


              precision    recall  f1-score   support

        Food       0.75      0.88      0.81       477
    Interior       0.74      0.28      0.41       153
       Price       0.00      0.00      0.00        18
     Service       0.81      0.49      0.61       252
       Whole       0.68      0.62      0.65       111

   micro avg       0.75      0.65      0.70      1011
   macro avg       0.60      0.45      0.50      1011
weighted avg       0.74      0.65      0.67      1011



  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        Food       0.77      0.90      0.83       477
    Interior       0.78      0.64      0.70       153
       Price       0.75      0.17      0.27        18
     Service       0.77      0.63      0.70       252
       Whole       0.68      0.75      0.71       111

   micro avg       0.76      0.77      0.76      1011
   macro avg       0.75      0.62      0.64      1011
weighted avg       0.76      0.77      0.76      1011

              precision    recall  f1-score   support

        Food       0.85      0.86      0.86       477
    Interior       0.86      0.56      0.68       153
       Price       0.88      0.39      0.54        18
     Service       0.84      0.66      0.74       252
       Whole       0.69      0.75      0.72       111

   micro avg       0.83      0.75      0.79      1011
   macro avg       0.82      0.64      0.71      1011
weighted avg       0.83      0.75      0.78      1011

              precisio

TrainOutput(global_step=448, training_loss=0.1433779171534947, metrics={'train_runtime': 223.6899, 'train_samples_per_second': 8.011, 'train_steps_per_second': 2.003, 'total_flos': 338843718147480.0, 'train_loss': 0.1433779171534947, 'epoch': 7.0})

In [21]:
model.save_pretrained('aspect_class_bert_model', from_pt=True)