# Setup

In [1]:
pip install transformers



In [2]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [24]:
from transformers import pipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# 1. Pipelines

[Документация по transformers.pipeline](https://huggingface.co/transformers/main_classes/pipelines.html)

[Model hub](https://huggingface.co/models)

1.1 Среди предобученных моделей найдите модель для перевода текста с русского языка на английский. Протестируйте данную модель на нескольких предложениях, используя `transformers.pipeline`. Выведите результаты работы в следующем виде:

```
sentence1_ru -> sentence1_en
sentence2_ru -> sentence2_en
```


In [4]:
model_name = "facebook/m2m100_418M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

translation_pipeline = pipeline(task="translation", model=model, tokenizer=tokenizer)

tokenizer_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

### ru -> en

In [8]:
sentence = "Привет, как дела?"

src_lang = "ru"
tgt_lang = "en"

translation = translation_pipeline(sentence, src_lang=src_lang, tgt_lang=tgt_lang, max_length=50)

print(f"sentence1_ru: {sentence} -> sentence1_en: {translation[0]['translation_text']}")

sentence1_ru: Привет, как дела? -> sentence1_en: Hello, how is it?


### en -> ru

In [9]:
sentence = "Hello, how is it?"

src_lang = "en"
tgt_lang = "ru"

translation = translation_pipeline(sentence, src_lang=src_lang, tgt_lang=tgt_lang, max_length=50)

print(f"sentence1_en: {sentence} -> sentence1_ru: {translation[0]['translation_text']}")

sentence1_en: Hello, how is it? -> sentence1_ru: Здравствуйте, как это выглядит?


1.2 Среди предобученных моделей найдите модель для поиска ответа в тексте. Протестируйте данную модель на нескольких предложениях, используя `transformers.pipeline`. Выведите на экран результаты в следующем виде:

```
Q: ...
A: ...
Q: ...
A: ...
```

In [10]:
model_name = "distilbert-base-cased-distilled-squad"
qa_pipeline = pipeline(task="question-answering", model=model_name, tokenizer=model_name)

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [16]:
contexts = [
    "The capital of France is Paris.",
    "Mount Everest is the highest mountain in the world.",
]

questions = ["What is the capital of France?", "What is the highest mountain?"]

answers = qa_pipeline(question=questions, context=contexts)

for question, answer in zip(questions, answers):
    print(f"Q: {question}")
    print(f"A: {answer['answer']}")
    print(f"Confidence: {answer['score']:.4f}\n")


Q: What is the capital of France?
A: Paris
Confidence: 0.9863

Q: What is the highest mountain?
A: Mount Everest
Confidence: 0.9472



1.3 Среди предобученных моделей найдите модель для классификации тональности русскоязычного текста (позитивный/негативный или позитивный/негативный/нейтральный). Протестируйте данную модель на нескольких предложениях, используя `transformers.pipeline`. Выведите результаты работы в следующем виде:

```
sentence1 -> class1
sentence2 -> class2
...
```

In [17]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
sentiment_pipeline = pipeline(task="sentiment-analysis", model=model_name, tokenizer=model_name)

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [20]:
sentences = [
    "Это был отличный ресторант!",
    "Не могу поверить, какой ужасный сервис.",
    "В принципе, все было хорошо, вкусно, офицант Алексей странный"
]

results = sentiment_pipeline(sentences)

for sentence, result in zip(sentences, results):
    label = result['label']
    score = result['score']
    print(f"Sentence: {sentence} -> Sentiment: {label} (Score: {score:.4f})\n")


Sentence: Это был отличный ресторант! -> Sentiment: 5 stars (Score: 0.5607)

Sentence: Не могу поверить, какой ужасный сервис. -> Sentiment: 1 star (Score: 0.7184)

Sentence: В принципе, все было хорошо, вкусно, офицант Алексей странный -> Sentiment: 4 stars (Score: 0.3618)



## 2. Токенизаторы и модели

[Auto Classes](https://huggingface.co/transformers/model_doc/auto.html)

[Tokenizer](https://huggingface.co/transformers/main_classes/tokenizer.html?highlight=tokenizer#transformers.PreTrainedTokenizer.__call__)

2.1 Решите задачу 1.2, создав объект токенизатора (`transformers.AutoTokenizer`) и модель (`transformers.AutoModelForQuestionAnswering`).

In [22]:
from transformers import AutoTokenizer
from transformers import AutoModelForQuestionAnswering

In [28]:
model_name = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [31]:
contexts = [
    "The capital of France is Paris.",
    "Mount Everest is the highest mountain in the world.",
]
questions = ["What is the capital of France?", "What is the highest mountain?"]

for context, question in zip(contexts, questions):
    inputs = tokenizer(question, context, return_tensors="pt")

    outputs = model(**inputs)
    start_idx = torch.argmax(outputs.start_logits)
    end_idx = torch.argmax(outputs.end_logits)

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][start_idx:end_idx+1]))

    print(f"Q: {question}")
    print(f"A: {answer}\n")


Q: What is the capital of France?
A: Paris

Q: What is the highest mountain?
A: Mount Everest



2.2 Решите задачу 1.3, создав объект токенизатора (`transformers.AutoTokenizer`) и модель (`transformers.AutoModelForSequenceClassification`).

In [36]:
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizerFast

In [37]:
tokenizer = BertTokenizerFast.from_pretrained('blanchefort/rubert-base-cased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('blanchefort/rubert-base-cased-sentiment', return_dict=True)

tokenizer_config.json:   0%|          | 0.00/499 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/943 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

In [38]:
sentences = [
    "Это лучший фильм, который я когда-либо видел!",
    "Сегодня ужасная погода.",
    "Этот продукт не соответствует моим ожиданиям.",
]

for sentence in sentences:
    inputs = tokenizer(sentence, return_tensors="pt")

    outputs = model(**inputs)
    logits = outputs.logits

    probabilities = torch.nn.functional.softmax(logits, dim=1)

    predicted_class = torch.argmax(probabilities, dim=1).item()

    # Вывод результата
    print(f"Sentence: {sentence}")
    print(f"Predicted Class: {predicted_class}")
    print(f"Class Probabilities: {probabilities.tolist()}\n")


Sentence: Это лучший фильм, который я когда-либо видел!
Predicted Class: 1
Class Probabilities: [[0.018038975074887276, 0.9775922894477844, 0.004368809517472982]]

Sentence: Сегодня ужасная погода.
Predicted Class: 2
Class Probabilities: [[0.18062251806259155, 0.067805714905262, 0.751571774482727]]

Sentence: Этот продукт не соответствует моим ожиданиям.
Predicted Class: 0
Class Probabilities: [[0.6578338146209717, 0.18658271431922913, 0.155583456158638]]



# 3. Fine tuning

3.1 Дообучите классификатор отзывов на основе модели `distilbert-base-uncased`.

Датасет: https://yadi.sk/d/mRXgc2aJSCncdw

* считайте данные, разбейте на обучающее и тестовое множество;
* создайте токенизатор `AutoTokenizer` для модели `distilbert-base-uncased` и преобразуйте с его помощью текстовые данные. Не забудьте выровнять длину всех последовательностей при помощи параметра `padding`;
* опишите класс `ReviewDataset`:
  * в данном случае удобнее, чтобы метод `__getitem__` возвращал словарь, а не кортеж (см. класс `MyDataset` ниже). Этот словарь должен содержать все данные, полученные после работы токенизатора плюс по ключу `label` должен находиться правильный ответ;
* создайте модель `AutoModelForSequenceClassification` с предобученными весами на основе `distilbert-base-uncased`;
  * при создании модели укажите параметр `num_labels=2`
* дообучите модель:
  * удобная особенность моделей из `transformers`: в метод `__call__` модели можно передать параметр `labels`, содержащий правильные ответы для обучения; тогда в словаре, который вернет метод `__call__` будет ключ `loss`, содержащий тензор со значением функции потерь, у которого можно вызвать метод `backward` и т.д. Таким образом, в данном случае функцию потерь объявлять не нужно;
  * для обучения используйте оптимизатор `transformers.AdamW` вместо `torch.optim.Adam`;
* измерьте значение accuracy на тестовом множестве.

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from transformers import AutoModelForSequenceClassification, AdamW, AutoTokenizer, get_linear_schedule_with_warmup

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import build_vocab_from_iterator
import torch.nn.functional as F


from tqdm.auto import tqdm

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


### Model and tokenizer

In [3]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
df1 = pd.read_csv('/content/drive/MyDrive/DL_FU/positive_reviews.txt', delimiter='\t',  names =  ['review', 'rating'] )
df1['rating'] = 0
df2 = pd.read_csv('/content/drive/MyDrive/DL_FU/negative_reviews.txt', delimiter='\t', names =  ['review', "rating"] )
df2['rating'] = 1


df_reviews = pd.concat([df1, df2], axis=0, ignore_index=True)
df_reviews

Unnamed: 0,review,rating
0,"simplistic , silly and tedious .",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
10657,both exuberantly romantic and serenely melanch...,1
10658,mazel tov to a film about a family's joyous li...,1
10659,standing in the shadows of motown is the best ...,1
10660,it's nice to see piscopo again after all these...,1


In [5]:
df_reviews['rating'].value_counts()

0    5331
1    5331
Name: rating, dtype: int64

### Train and Test

In [6]:
df_train, df_test = train_test_split(df_reviews, test_size=0.25, random_state=42)

### class Dataset

In [7]:
class MyDataset(Dataset):

  def __init__(self, reviews, labels, tokenizer, max_len):
    self.reviews = reviews
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.reviews)

  def __getitem__(self, item):
    review = str(self.reviews[item])
    labels = self.labels[item]

    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
      truncation=True
    )

    return {
      'review_text': review,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'labels': torch.tensor(labels, dtype=torch.long)
    }

### class Dataloader

In [8]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = MyDataset(
    reviews=df['review'].to_numpy(),
    labels=df['rating'].to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=True,
  )

In [9]:
BATCH_SIZE = 64
MAX_LEN = 64

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

### Train

In [10]:
EPOCHS = 5

optimizer = AdamW(model.parameters(),
                  lr = 5e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)



In [11]:
loss_train = []
accuracy_train = []


for epoch in range(EPOCHS):


    # Put the model into training mode. Don't be mislead--the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)

    model.train()
    for batch in tqdm(train_data_loader, desc=f"Training epoch {epoch + 1}/{EPOCHS}"):
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["labels"].to(device)



      # Always clear any previously calculated gradients before performing a
      # backward pass. PyTorch doesn't do this automatically because
      # accumulating the gradients is "convenient while training RNNs".
      # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
      model.zero_grad()


      # Perform a forward pass (evaluate the model on this training batch).
      # The documentation for this `model` function is here:
      # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
      # It returns different numbers of parameters depending on what arguments
      # are given and what flags are set. For our usage here, it returns
      # the loss (because we provided labels) and the "logits"--the model
      # outputs prior to activation.
      outputs = model(input_ids,
                             attention_mask=attention_mask,
                             labels=labels)
      loss = outputs.loss
      loss.backward()

      # Clip the norm of the gradients to 1.0.
      # This is to help prevent the "exploding gradients" problem.
      nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
      optimizer.step()

      # Update the learning rate.
      scheduler.step()
      optimizer.zero_grad()

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(test_data_loader, desc=f"Testing epoch {epoch + 1}/{EPOCHS}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids,
                             attention_mask=attention_mask
                                 )

            logits = outputs.logits
            predicted = torch.argmax(logits, dim=1)


            total += labels.size(0)
            correct += (predicted == labels).sum().item()



            # # Accumulate the validation loss.
            # total_eval_loss += loss.item()

            # # Move logits and labels to CPU
            # logits = logits.detach().cpu().numpy()
            # label_ids = b_labels.to('cpu').numpy()

            # # Calculate the accuracy for this batch of test sentences, and
            # # accumulate it over all batches.
            # total_eval_accuracy += flat_accuracy(logits, label_ids)



    test_accuracy = correct / total
    accuracy_train.append(test_accuracy)
    print('Epoch [{}/{}], Loss: {:.4f}, Test Accuracy: {:.2f}%'.format(epoch + 1, EPOCHS, loss.item(), test_accuracy * 100))
    loss_train.append(loss.item())

Training epoch 1/5:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 1/5:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch [1/5], Loss: 0.3400, Test Accuracy: 84.47%


Training epoch 2/5:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 2/5:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch [2/5], Loss: 0.2690, Test Accuracy: 83.98%


Training epoch 3/5:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 3/5:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch [3/5], Loss: 0.0559, Test Accuracy: 84.81%


Training epoch 4/5:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 4/5:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch [4/5], Loss: 0.0710, Test Accuracy: 84.55%


Training epoch 5/5:   0%|          | 0/125 [00:00<?, ?it/s]

Testing epoch 5/5:   0%|          | 0/42 [00:00<?, ?it/s]

Epoch [5/5], Loss: 0.0028, Test Accuracy: 85.03%
