In [2]:
%cd ../..

/home/pristalovya/Документы/nlp-coursework


In [6]:
from datasets_ import DatasetLoader

from datasets import load_dataset

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.utils import resample

import matplotlib.pyplot as plt
%matplotlib notebook

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import AdamW

from tqdm import tqdm
import numpy as np
import pandas as pd

import sys
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from nltk import WhitespaceTokenizer

from IPython.display import clear_output

from transformers import (
    pipeline,                       
    AutoModelForSequenceClassification,                       
    BertForSequenceClassification,                       
    AutoTokenizer,
    AdamW,
)

In [3]:
train, test = DatasetLoader.load_reviews_Review_Label_dataset(train_test_split=True,
                                                              classnames_to_int=True,
                                                              remove_neutral_class=True,
                                                              show_path=True,)
train.label[train['label'] == 2] = 1
test.label[test['label'] == 2] = 1

print(train.label.value_counts())
train = pd.concat([train, resample(train[train.label == 0], n_samples=41608, random_state=42)])
print(train.label.value_counts())

print(train.shape, test.shape)

/home/pristalovya/Документы/nlp-coursework/data/reviews_Review_Label/reviews_Review_Label.csv
1    48477
0     6869
Name: label, dtype: int64
0    48477
1    48477
Name: label, dtype: int64
(96954, 2) (23721, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.label[train['label'] == 2] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.label[test['label'] == 2] = 1


In [4]:
test

Unnamed: 0,review,label
25749,Большое количество фильмов советского кинемато...,1
44489,"Тяжело ответить на вопрос, что же такое Догвил...",1
53162,"В наше время такие героини, как скажем наприме...",0
25843,В 2001 году нам довелось познакомиться с новой...,1
44609,"«Это фильм?», «У них не хватило денег на декор...",1
...,...,...
14104,- Через столько лет?\r\n- Всегда\r\n\r\nБезусл...,1
22232,"После просмотра трейлера, я был под большим вп...",1
73314,"Многие не верят, но я легко подключаюсь к прои...",1
47848,"Как часто нам нужна поддержка? Да, пожалуй, оч...",1


In [248]:
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels, tokenizer, max_model_input_length=512):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_model_input_length = max_model_input_length
        
    def __len__(self):
        return len(self.labels)
        
    def __getitem__(self, idx):
        review = self.reviews.iloc[idx]
        label = self.labels.iloc[idx]
        review_tokenized = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.max_model_input_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        input_ids = review_tokenized['input_ids'].flatten()
        attn_mask = review_tokenized['attention_mask'].flatten()
        
        return {
            'review': review,
            'input_ids': input_ids,
            'attention_mask': attn_mask,
            'label': label,
        }
        

In [275]:
class BertLogRegClassifier:
    def __init__(self, checkpoint, n_classes=2):
        
        self.model = BertForSequenceClassification.from_pretrained(checkpoint)
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        
        self.max_len = 512
        self.out_features = self.model.bert.pooler.dense.out_features
        self.model.dropout = torch.nn.Sequential()
        self.model.classifier = torch.nn.Sequential()
        
        self.classifier = LogisticRegression(max_iter=1000, n_jobs=-1, random_state=42)        
                
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)    
        

    def fit(self, train_dataloader: torch.utils.data.DataLoader):
        self.model.eval()
        
        self.all_input_embeddings = np.array([])
        
        t = tqdm(train_dataloader, file=sys.stdout, ncols=100)

        for data in t:
            
            with torch.no_grad():
                input_ids = data['input_ids'].to(self.device)
                attention_mask = data['attention_mask'].to(self.device).to(float)
                labels = data['label'].to(self.device)


                embeddings = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                ).logits

                self.all_input_embeddings = np.append(self.all_input_embeddings, embeddings.cpu().numpy())
    
        self.all_input_embeddings = self.all_input_embeddings.reshape(-1, self.out_features)
    
        self.classifier.fit(self.all_input_embeddings, train_dataloader.dataset.labels)            

            
    def predict(self, test_input):
        self.model.eval()
        
        self.all_output_embeddings = np.array([])
        
        t = tqdm(test_input, file=sys.stdout, ncols=100)

        for data in t:
            
            with torch.no_grad():
                input_ids = data['input_ids'].to(self.device)
                attention_mask = data['attention_mask'].to(self.device).to(float)
                labels = data['label'].to(self.device)


                embeddings = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                ).logits

                self.all_output_embeddings = np.append(self.all_output_embeddings, embeddings.cpu().numpy())
    
        
        self.all_output_embeddings = self.all_output_embeddings.reshape(-1, self.out_features)

        return self.classifier.predict(self.all_output_embeddings)   


In [281]:
clf = BertLogRegClassifier('blanchefort/rubert-base-cased-sentiment-rusentiment')

train_dataset = ReviewDataset(train.review, train.label, clf.tokenizer)
test_dataset = ReviewDataset(test.review, test.label, clf.tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [282]:
for param in clf.model.bert.parameters():
    param.requires_grad = False
print('All parameters:', sum(p.numel() for p in clf.model.parameters()))
print('Trainable parameters:', sum(p.numel() for p in clf.model.parameters() if p.requires_grad))

All parameters: 177853440
Trainable parameters: 0


In [283]:
clf.fit(train_dataloader)

100%|█████████████████████████████████████████████████████████| 1515/1515 [1:13:14<00:00,  2.90s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just g

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [284]:
preds = clf.predict(test_dataloader)
print(classification_report(test.label, preds))

100%|█████████████████████████████████████████████████████████████| 371/371 [17:39<00:00,  2.85s/it]
              precision    recall  f1-score   support

           0       0.13      0.52      0.21      2979
           1       0.88      0.51      0.65     20742

    accuracy                           0.51     23721
   macro avg       0.51      0.52      0.43     23721
weighted avg       0.79      0.51      0.59     23721

