In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install datasets seqeval --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [13]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [14]:
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
from seqeval.metrics import classification_report

In [15]:
reviews = {}
with open('drive/MyDrive/colab/aspect_project/full_train/train_reviews.txt', encoding='utf-8') as f:
  for line in f:
    line = line.rstrip('\r\n').split('\t')
    reviews[line[0]] = line[1]

In [16]:
aspects = defaultdict(list)
with open('drive/MyDrive/colab/aspect_project/full_train/train_aspects.txt', encoding='utf-8') as f:
  for line in f:
    line = line.rstrip('\r\n').split('\t')
    keys = ('category', 'mention', 'start', 'end', 'sentiment')
    aspects[line[0]].append(dict(zip(keys, line[1:])))

In [17]:
tags = ['O']
categories = ['Whole', 'Service', 'Food', 'Interior', 'Price']
for category in categories:
    tags.append('B-'+category)
    tags.append('I-'+category)
tag2id = {tag: ind for ind, tag in enumerate(tags)}
id2tag = {ind: tag for ind, tag in enumerate(tags)}

In [18]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/mdeberta-v3-base', use_fast=True)



In [19]:
def tokenize_and_align_labels(reviews, aspects):
    all_input_ids = []
    all_labels = []
    for text_id, text in tqdm(reviews.items()):
        labels = []
        tokenized = tokenizer(text, return_offsets_mapping=True)
        tokens = tokenized.tokens()
        offsets = tokenized['offset_mapping']
        for i in range(len(tokens)):
            add = False
            if tokens[i].startswith('▁') and tokens[i] != '▁':
                shift = 1
            else:
                shift = 0
            for mention in aspects[text_id]:
                if offsets[i][0] + shift == int(mention['start']):
                    labels.append(tag2id['B-'+mention['category']])
                    add = True
                    break
                elif offsets[i][0] + shift < int(mention['end']) and offsets[i][0] + shift > int(mention['start']):
                    labels.append(tag2id['I-'+mention['category']])
                    add = True
                    break
            if not add:
                labels.append(tag2id['O'])
        all_input_ids.append(tokenized['input_ids'])
        all_labels.append(labels)
    return all_input_ids, all_labels

In [20]:
all_tokens, all_labels = tokenize_and_align_labels(reviews, aspects)

100%|██████████| 284/284 [00:02<00:00, 120.47it/s]


In [21]:
train_size = round(0.9 * len(all_tokens))

In [22]:
df = pd.DataFrame({'input_ids': all_tokens, 'labels': all_labels})

In [23]:
train_df = df[:train_size]
val_df = df[train_size:]

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [24]:
model = AutoModelForTokenClassification.from_pretrained("microsoft/mdeberta-v3-base", num_labels=len(tags))

pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding='longest')

In [26]:
def count_metrics(p):
    preds, labels = p
    preds_flat = np.argmax(preds, axis=-1).flatten()
    labels_flat = labels.flatten()
    indices = np.where(labels_flat != -100)
    preds_flat = preds_flat[indices]
    labels_flat = labels_flat[indices]

    accuracy = accuracy_score(preds_flat, labels_flat)
    f1_mic = f1_score(preds_flat, labels_flat, average='micro')
    f1_mac = f1_score(preds_flat, labels_flat, average='macro')

    return {'accuracy': accuracy, 'f1_micro': f1_mic, 'f1_macro': f1_mac}

In [27]:
training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs = 7,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    weight_decay = 0.00,
    learning_rate = 5e-5,
    warmup_steps = 0,
    evaluation_strategy ='epoch',
    save_strategy = 'no',
    report_to=None)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [28]:
trainer = Trainer(model=model,
                  tokenizer = tokenizer,
                  args = training_args,
                  train_dataset = train_dataset,
                  eval_dataset = val_dataset,
                  data_collator=data_collator,
                  compute_metrics = count_metrics)

In [29]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,No log,0.296575,0.919854,0.919854,0.576918
2,No log,0.213752,0.933977,0.933977,0.753379
3,No log,0.199928,0.943156,0.943156,0.766029
4,No log,0.235186,0.938802,0.938802,0.779718
5,No log,0.236896,0.941156,0.941156,0.782337
6,No log,0.26302,0.937978,0.937978,0.760469
7,No log,0.261478,0.940097,0.940097,0.768791


TrainOutput(global_step=448, training_loss=0.1450563669204712, metrics={'train_runtime': 220.4076, 'train_samples_per_second': 8.13, 'train_steps_per_second': 2.033, 'total_flos': 338843718147480.0, 'train_loss': 0.1450563669204712, 'epoch': 7.0})

In [21]:
model.save_pretrained('aspect_class_bert_model', from_pt=True)