### Imports and installs

In [None]:
from collections import defaultdict
import re
from tqdm import tqdm
import copy
import numpy as np
import pandas as pd
from nltk import sent_tokenize
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [4]:
# Turn off Weights & Biases logging for the Trainer used later in the notebook.
# NOTE: transformers reports this variable as deprecated (to be removed in v5)
# in favour of the `report_to` training argument.
import os
os.environ['WANDB_DISABLED'] = 'true'

In [5]:
# Fetch the Punkt tokenizer data into the local nltk data directory.
# NOTE(review): presumably needed for `sent_tokenize` imported above; the
# cells visible here split sentences with `custom_tokenize` instead - confirm
# whether this download is still required.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

### Read the data

In [6]:
# Load the aspect annotations, grouped by review id.
# Every line is tab-separated; the columns read here are 0 (review id),
# 2 (aspect mention), 3 (start offset) and 5 (sentiment label).
# `start` is kept as the raw string from the file; consumers convert it.
aspects = defaultdict(list)
with open('data/train_aspects.txt', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # A blank (e.g. trailing) line has no columns to index into.
            continue
        fields = line.rstrip('\r\n').split('\t')
        aspects[fields[0]].append(
            {'mention': fields[2], 'start': fields[3], 'sentiment': fields[5]}
        )

In [7]:
# Load the review texts: column 0 is the review id, column 1 the text.
# The file is decoded explicitly as UTF-8, like the aspects file, so the
# decoding does not depend on the platform's default locale encoding.
reviews = {}
with open('data/train_reviews.txt', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # A blank (e.g. trailing) line has no columns to index into.
            continue
        fields = line.rstrip('\r\n').split('\t')
        reviews[fields[0]] = fields[1]

### Preprocessing

In [8]:
def custom_tokenize(text):
    """Split `text` into sentence-like chunks and return them as a list.

    A boundary is either
      * a run of whitespace that directly follows `.`, `!`, `?` or `)`, or
      * one of those four characters when it is immediately followed by an
        uppercase letter in the ranges А-Я / A-Z (no whitespace in between).

    In the second case the punctuation character itself is consumed by the
    split, so the preceding chunk does not end with it.
    """
    boundary = r'(?<=\.|\!|\?|\))\s+|[.!?)](?=[А-ЯA-Z])'
    return re.split(boundary, text)

In [9]:
# Sentiment class names and their integer ids; `id2label` is the exact
# inverse of `label2id`.
label2id = {
    name: idx
    for idx, name in enumerate(('neutral', 'negative', 'positive', 'both'))
}
id2label = {idx: name for name, idx in label2id.items()}

In [10]:
def generate_pairs(reviews, aspects, label2id):
    """Build (aspect mention, context, label id) training triples.

    Each review is split with `custom_tokenize`. Every aspect is attached to
    the sentence that contains its start offset; the context is that sentence
    joined by a space with the following one (when there is one).

    Args:
        reviews: mapping review id -> review text.
        aspects: mapping review id -> list of dicts with the keys 'mention',
            'start' (character offset into the review text, int or numeric
            string) and 'sentiment' (a key of `label2id`).
        label2id: mapping sentiment name -> integer label.

    Returns:
        List of `(mention, context, label_id)` tuples, ordered by review and,
        within a review, by aspect start offset.

    Raises:
        KeyError: if an aspect's sentiment is not a key of `label2id`.
    """
    all_pairs = []
    for review_id, review_text in reviews.items():
        sentences = custom_tokenize(review_text)
        # `.get` avoids inserting empty entries when `aspects` is a
        # defaultdict. The stable sort lets one forward pass over the
        # sentences consume the aspects even if they were listed out of order.
        pending = sorted(aspects.get(review_id, []),
                         key=lambda aspect: int(aspect['start']))
        next_aspect = 0
        search_from = 0
        for ind, sentence in enumerate(sentences):
            # Search from the end of the previous sentence: a plain
            # `index(sentence)` returns the FIRST occurrence, which gives
            # wrong offsets when the same sentence text appears twice.
            # Assumes the tokenizer returns ordered, non-overlapping
            # substrings of the review text.
            start_of_sentence = review_text.index(sentence, search_from)
            end_of_sentence = start_of_sentence + len(sentence)
            search_from = end_of_sentence
            # Context does not depend on the aspect, so build it once.
            abstract = ' '.join(sentences[ind:ind + 2])
            # Aspects that start before this sentence fall between two
            # sentences; skip them rather than dropping everything after them.
            while (next_aspect < len(pending)
                   and int(pending[next_aspect]['start']) < start_of_sentence):
                next_aspect += 1
            while (next_aspect < len(pending)
                   and int(pending[next_aspect]['start']) < end_of_sentence):
                aspect = pending[next_aspect]
                all_pairs.append(
                    (aspect['mention'], abstract, label2id[aspect['sentiment']])
                )
                next_aspect += 1
    return all_pairs

In [11]:
# Build the (aspect mention, context text, label id) triples for all loaded reviews.
all_pairs = generate_pairs(reviews, aspects, label2id)

In [12]:
# Tabulate the triples: one row per pair, with the tuple positions mapped to
# the columns `aspect`, `text` and `labels` in that order.
df = pd.DataFrame(all_pairs, columns=['aspect', 'text', 'labels'])

In [13]:
def one_hot_labels(label):
    """Encode an integer label as a 3-element list of floats.

    Labels 0, 1 and 2 set the entry at their own index to 1.0. Label 3 sets
    both entry 1 and entry 2, so it is represented as two active classes
    rather than as a class of its own.
    """
    encoded = [0.0, 0.0, 0.0]
    hot_positions = (1, 2) if label == 3 else (label,)
    for position in hot_positions:
        encoded[position] = 1.0
    return encoded

In [14]:
# Replace the integer label ids with their 3-element multi-hot float vectors.
# The cell overwrites its own input column, so values that are already lists
# are passed through unchanged; without the guard a second run of this cell
# would call `one_hot_labels` on a list and raise TypeError.
df['labels'] = df['labels'].apply(
    lambda label: label if isinstance(label, list) else one_hot_labels(label)
)

In [15]:
# Sequential 90/10 split without shuffling: the first `train_size` rows are
# used for training, the remaining rows for validation.
train_size = round(len(all_pairs) * 0.9)
train_df, val_df = df.iloc[:train_size], df.iloc[train_size:]

# Wrap both frames as Hugging Face datasets (train first, then validation).
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

In [16]:
# Load the tokenizer of the same checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('microsoft/mdeberta-v3-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



In [17]:
def tokenize(sample):
    """Encode one example as a sequence pair: aspect first, then its context.

    `sample` must provide the keys 'aspect' and 'text'; whatever the
    module-level `tokenizer` returns for that pair is passed back unchanged.
    """
    aspect, context = sample['aspect'], sample['text']
    return tokenizer(aspect, context)

In [18]:
# Tokenize both splits (train first, then validation); the raw `aspect` and
# `text` string columns are dropped from the mapped datasets.
tok_train_dataset, tok_val_dataset = (
    split.map(tokenize, remove_columns=['aspect', 'text'])
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/4287 [00:00<?, ? examples/s]

Map:   0%|          | 0/476 [00:00<?, ? examples/s]

### Train

In [19]:
# mDeBERTa-v3 encoder with a 3-output sequence-classification head (the head
# weights are newly initialised, as the load message below reports).
# The multi-label problem type matches the multi-hot float label vectors.
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/mdeberta-v3-base",
    num_labels=3,
    problem_type="multi_label_classification",
)

pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Dynamic padding: every batch is padded to the length of its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

In [21]:
def sigmoid(x):
    """Element-wise logistic function, ``1 / (1 + exp(-x))``.

    Computed through ``exp(-|x|)``, which always lies in (0, 1]. Large
    negative inputs therefore cannot overflow ``np.exp`` (the direct formula
    emits a RuntimeWarning there), while the results stay the same.

    Args:
        x: array-like of real numbers.

    Returns:
        numpy array with the same shape as `x`, values in [0, 1].
    """
    x = np.asarray(x)
    decay = np.exp(-np.abs(x))
    # For x >= 0 use 1 / (1 + e^-x); for x < 0 the equivalent e^x / (1 + e^x).
    return np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

In [22]:
def compute_metrics(eval_pred):
    """Compute accuracy and micro/macro F1 for multi-label predictions.

    Args:
        eval_pred: pair `(logits, labels)`; `logits` is passed through
            `sigmoid` and each output is thresholded at 0.5 independently.

    Returns:
        Dict with the keys 'accuracy', 'f1_micro' and 'f1_macro'.
    """
    logits, labels = eval_pred
    predictions = (sigmoid(logits) > 0.5).astype(int)
    # scikit-learn metrics take the ground truth first: (y_true, y_pred).
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1_micro': f1_score(labels, predictions, average='micro'),
        'f1_macro': f1_score(labels, predictions, average='macro'),
    }

In [23]:
# Fine-tuning hyper-parameters. Evaluation runs once per epoch and no
# intermediate checkpoints are written.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.02,
    learning_rate=3e-5,
    warmup_steps=100,
    # The string 'none' disables all logging integrations. Python `None`
    # does not: it falls back to the library default.
    report_to='none',
    evaluation_strategy='epoch',
    save_strategy='no',
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [24]:
# Assemble the Trainer. The padding collator built earlier in the notebook is
# passed explicitly so that batching does not rely on the Trainer's implicit
# default collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_train_dataset,
    eval_dataset=tok_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [25]:
# Run fine-tuning; the table below shows the metrics evaluated after each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Micro,F1 Macro
1,No log,0.341939,0.733193,0.766949,0.679136
2,0.358100,0.292478,0.787815,0.818848,0.709769
3,0.358100,0.294971,0.808824,0.83195,0.751475
4,0.159800,0.374193,0.794118,0.812435,0.703796
5,0.159800,0.381474,0.798319,0.823529,0.733888


TrainOutput(global_step=1340, training_loss=0.215900693010928, metrics={'train_runtime': 801.1076, 'train_samples_per_second': 26.757, 'train_steps_per_second': 1.673, 'total_flos': 1535345201785590.0, 'train_loss': 0.215900693010928, 'epoch': 5.0})

In [26]:
# Persist the fine-tuned weights and config together with the tokenizer
# files, so the directory can be reloaded on its own via `from_pretrained`.
# `from_pt` is a loading option of `from_pretrained`, not a saving option,
# so it is not passed here.
model.save_pretrained('sent_class_deberta_model')
tokenizer.save_pretrained('sent_class_deberta_model')