In [51]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from dataclasses import dataclass
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss, BCELoss
from tqdm import trange, tqdm
from dataclasses import asdict
from sklearn.model_selection import train_test_split


In [52]:
data = pd.read_csv('datasets.csv')

In [53]:

encoder = AutoModel.from_pretrained('DeepPavlov/rubert-base-cased')
tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [54]:
class Classifier(torch.nn.Module):
    def __init__(self, encoder, emb_size, num_classes):
        super().__init__()
        self.encoder = encoder
        for param in self.encoder.parameters():
            param.requires_grad = True
        self.classification_head = torch.nn.Linear(emb_size, num_classes)
        for param in self.classification_head.parameters():
            param.requires_grad = True
        
    def forward(self, input_ids, attention_mask, **kwargs):
        embs = self.encoder(input_ids, attention_mask).last_hidden_state[:, 0, :]
        print(embs)
        return self.classification_head(embs)

In [55]:
model = Classifier(encoder, 768, 3)

In [7]:
one_hot = pd.get_dummies(data['sentiment'])
one_hot = one_hot.astype(int)
data = pd.concat([data, one_hot], axis=1)
data

Unnamed: 0.1,Unnamed: 0,text,sentiment,0,1,2
0,43956,Развода на деньги нет\nНаблюдаюсь в Лайфклиник...,1,0,1,0
1,17755,Отель выбрали потому что рядом со стадионом. О...,0,1,0,0
2,20269,"Вылечили\nГноился с рождения глазик, в поликли...",1,0,1,0
3,16648,Хорошее расположение.С вокзала дошли пешком.Но...,0,1,0,0
4,27879,"Отличное месторасположение,прекрасный вид,особ...",1,0,1,0
...,...,...,...,...,...,...
210984,22100,"Мой юбилей я отмечал в ресторане "" Астория "" ....",2,0,0,1
210985,2326,"Отлично встретили, разместили в роскошном номе...",1,0,1,0
210986,10478,Была в Васаби на ст. метро Сенная . Во первых...,0,1,0,0
210987,4028,Ребята не стоит смотреть этот фильм. Вы молоды...,0,1,0,0


In [8]:
def replace_int_with_list(value):
    m = [0, 0, 0]
    m[value] = 1
    return m

data['sentiment'] = data['sentiment'].apply(replace_int_with_list)


In [56]:
data

Unnamed: 0.1,Unnamed: 0,text,sentiment
0,43956,Развода на деньги нет\nНаблюдаюсь в Лайфклиник...,1
1,17755,Отель выбрали потому что рядом со стадионом. О...,0
2,20269,"Вылечили\nГноился с рождения глазик, в поликли...",1
3,16648,Хорошее расположение.С вокзала дошли пешком.Но...,0
4,27879,"Отличное месторасположение,прекрасный вид,особ...",1
...,...,...,...
210984,22100,"Мой юбилей я отмечал в ресторане "" Астория "" ....",2
210985,2326,"Отлично встретили, разместили в роскошном номе...",1
210986,10478,Была в Васаби на ст. метро Сенная . Во первых...,0
210987,4028,Ребята не стоит смотреть этот фильм. Вы молоды...,0


In [57]:
train_size = int(0.9 * len(data))
val_size  = len(data) - train_size

train, val = train_test_split(data, test_size=0.1)


In [58]:

print(type(train))

<class 'pandas.core.frame.DataFrame'>


In [60]:
@dataclass
class SentimentElement:
    comment: str
    label: int


class SentimentDataset(Dataset):
    def __init__(self, pd_dataframe: pd.DataFrame):
        self.df = pd_dataframe
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx) -> SentimentElement:
        elem = self.df.iloc[idx]
        return SentimentElement(
            comment=elem['text'],
            label=int(elem['sentiment'])
        )

In [61]:
train_dataset = SentimentDataset(train)
test_dataset = SentimentDataset(val)

In [70]:
@dataclass
class SentimentBatch:
    input_ids: torch.Tensor
    attention_mask: torch.Tensor
    label: torch.Tensor

        
class SentimentCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        
    def __call__(self, elements):
        tokenized = tokenizer(
            [elem.comment for elem in elements], 
            padding=True, 
            truncation=True, 
            max_length=512, 
            return_tensors='pt',
        )
        
        return SentimentBatch(
            input_ids=tokenized['input_ids'],
            attention_mask=tokenized['attention_mask'],
            label=torch.tensor([elem.label for elem in elements]),
        )
    
    
    
collator = SentimentCollator(tokenizer)
train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=12, shuffle=False, collate_fn=collator)

In [71]:
train_loader.batch_size

12

In [72]:
optimizer = AdamW(model.parameters(), lr=1e-4)

In [73]:
criterion = CrossEntropyLoss()

In [89]:
mps_device = torch.device("mps")

model.to(mps_device)
criterion.to(mps_device)
#tokenizer.to(mps_device)

mps_device

device(type='mps')

In [None]:
prediction = model(**asdict(batch)).view(prediction.size(0),-1).to('mps')
loss = criterion(prediction, batch.label.to('mps'))

In [46]:
import torch.nn.functional as F


In [50]:
prediction

tensor([[[ 0.2074,  0.2733,  0.0872],
         [-0.2867, -0.1120,  0.2061],
         [-0.1144,  0.1539,  0.0694],
         [ 0.3233, -0.0146, -0.0299],
         [ 0.3446,  0.2790,  0.4677],
         [ 0.1298,  0.5777,  0.0628],
         [-0.0438, -0.6091,  0.4338],
         [ 0.3650,  0.0191,  0.2037],
         [ 0.4465, -0.0883,  0.2572],
         [-0.0535, -0.1609, -0.1810],
         [-0.0766, -0.0496, -0.0999],
         [ 0.4525, -0.5843,  0.4394],
         [-0.2169,  0.4367,  0.2655],
         [-1.2268,  0.5912,  0.6448],
         [-0.5805, -0.1736, -0.1385],
         [-0.0702,  0.0591, -0.1352],
         [-0.5544,  0.3907,  0.1145]]], grad_fn=<ViewBackward0>)

In [90]:

epochs = 100

for epoch in range(epochs):
    for batch in tqdm(train_loader, total=len(train_loader)):
        optimizer.zero_grad()
        prediction = model(**asdict(batch))
        loss = criterion(prediction, batch.label)
        loss.backward()
        optimizer.step()

  0%|          | 0/15825 [00:00<?, ?it/s]


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [70]:
target = torch.rand(3, 2, requires_grad=False)
target

tensor([[0.5658, 0.5769],
        [0.5943, 0.8205],
        [0.3587, 0.7495]])