In [200]:
#hugging
import datasets
from transformers import AutoTokenizer

#pytorch
import torch
from torch import nn
import torch.optim as optim
#from torchsummary import summary

import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

GPU: NVIDIA GeForce RTX 3050 is available.


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Processing Dataset

In [4]:
ds = datasets.load_dataset('google-research-datasets/go_emotions', 'raw')
CLASSES = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']

In [5]:
def get_X_y_from_ds(ds: datasets.dataset_dict, CLASSES: list[str], split: str ='train') -> tuple[list[str], list[int]]:
    y = []
    for c in CLASSES:
        y.append(ds[split][c])
    return ds[split]['text'], np.argmax(y, axis=0)

In [6]:
X, y = get_X_y_from_ds(ds, CLASSES)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True)

# Preparing model

In [7]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token



In [517]:
class RNN(nn.Module):
    
    def __init__(self, embedding_dim, hidden_dim, vocab_size, num_layers, classes):
        super(RNN, self).__init__()

        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        #Хочет LxBxE_dims
        #При batch_first=True хочет BxLxE_dims
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers)

        # The linear layer that maps from hidden state space to tag space
        self.FC = nn.Linear(hidden_dim, classes)

                        #n_layers x B x H_out
    
    def forward(self, text):
        # text B x len
        embeddings = self.embeddings(text) # B x len x H(embedding_dim)
        inp = embeddings.view(-1, len(text), self.embedding_dim) # L x B x H_in так нужно сделать, чтобы в LSTM кинуть
        #self.hidden = self.init_hidden_state(text) #h_0 и c_0 n_layers x B x H_out

        lstm_out, (h, c) = self.lstm(inp) #out: L x B x Hout h_t: n_l x B x H_out
        out = lstm_out[-1]

        
        # In each timestep of an LSTM the input goes through a simple neural network and the output gets passed to the next timestep. The output out of function
        # out, (ht, ct) = self.lstm_nets(X)
        # contains a list of ALL outputs (i.e the output of the neural networks of every timestep). Yet, in classification, you mostly only really care about the LAST output. You can get it like this:
        # out = out[:, -1]
        # https://stackoverflow.com/questions/72667646/how-to-connect-a-lstm-layer-to-a-linear-layer-in-pytorch
        
        out = self.FC(out) 
        return out
        
        

In [524]:
#E = 32
#H = 128
#len(tokenz) = 64
rnn = RNN(embedding_dim=32, 
          hidden_dim=32, 
          vocab_size=502576, 
          num_layers=5, 
          classes=28).to('cuda')

text = tokenizer(X[0:2], return_tensors='pt', padding='max_length', max_length=32, truncation=True)
x = rnn(text['input_ids'].to('cuda'))

In [525]:
x

tensor([[ 0.1848,  0.2165, -0.1298, -0.1484,  0.1609,  0.0748, -0.1140, -0.1392,
          0.0302, -0.0084,  0.1534,  0.0493, -0.1330, -0.0298,  0.0810, -0.0614,
          0.1113,  0.0812, -0.2749,  0.0076, -0.1517, -0.0683, -0.1889, -0.1362,
         -0.1175, -0.1181,  0.0526,  0.0017],
        [ 0.1850,  0.2166, -0.1299, -0.1480,  0.1608,  0.0748, -0.1138, -0.1389,
          0.0305, -0.0088,  0.1536,  0.0497, -0.1330, -0.0295,  0.0813, -0.0617,
          0.1114,  0.0808, -0.2746,  0.0078, -0.1518, -0.0681, -0.1892, -0.1360,
         -0.1173, -0.1175,  0.0528,  0.0016]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

# Train

In [526]:
#embedding_dim, hidden_dim, vocab_size, num_layers, classes
model = RNN(embedding_dim=64, 
            hidden_dim=64, 
            vocab_size=502576, 
            num_layers=5, 
            classes=28)
model.to('cuda')
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

In [494]:
# epochs = tqdm.trange(2, leave=False)
# for _ in epochs:
#     epoch = tqdm.trange(0, len(X_train), 100, leave=False)
#     for i in epoch:
#         model.zero_grad() #очистить градиенты

#         X, y = X_train[i: i+100], y_train[i: i+100]
#         X = tokenizer(X, return_tensors='pt', padding='max_length', max_length=64, truncation=True)['input_ids']
#         y_pred = model(X.to('cuda'))
#         y = torch.tensor(y).to('cuda')
#         loss = loss_f(y_pred, y)
#         loss.backward()
#         optimizer.step

#         #epoch.set_description(f'loss: {torch.mean(loss):.3f}')
#     #print(evaluate(X_test, y_test, model, 100))
#     acc = 0
#     for i in range(0, len(X_test), 100):
#         X, y = X_test[i: i+100], y_test[i: i+100]
#         X = tokenizer(X, return_tensors='pt', padding='max_length', max_length=64, truncation=True)['input_ids']
#         y_pred = torch.argmax(model(X.to('cuda')), dim=1)
#         y = torch.tensor(y).to('cuda')
#         acc += torch.sum(y_pred == y) / len(y_pred)
#     acc /= len(range(0, len(ds), 100))
#     print(acc)

In [531]:
X_i = tokenizer(X_train[50: 70].tolist(), return_tensors='pt', padding='max_length', max_length=64, truncation=True)['input_ids']
torch.argmax(model(X_i.to('cuda')), dim=1)

tensor([27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
        27, 27], device='cuda:0')

In [527]:
train_acc = []
val_acc = []
n_epochs = 10
rng = np.random.default_rng()
X_train = np.array(X_train)
y_train = np.array(y_train)

for epoch in range(n_epochs):  # loop over the dataset multiple times

    running_loss = 0.0
    for i in range(0, len(X_train), 500):
        idx = rng.choice(len(X_train), size=500, replace=False)
        X_i, y_i = X_train[idx], y_train[idx]
        X_i = tokenizer(X_i.tolist(), return_tensors='pt', padding='max_length', max_length=64, truncation=True)['input_ids']
        y_i = torch.Tensor(y_i).type(torch.LongTensor).to('cuda')
                
        optimizer.zero_grad()

        # forward + backward + optimize
        y_pred = model(X_i.to('cuda'))
        loss = criterion(y_pred, y_i)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 20_000 == 0:    # print every 10000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 500}')
            running_loss = 0.0

    train_acc.append(get_acc(X_train.tolist(), y_train, 500, tokenizer, model))
    val_acc.append(get_acc(X_test, y_test, 500, tokenizer, model))
    print('-'*40)
    print(f'Train accuracy {train_acc[-1]}%')
    print(f'Test accuracy {val_acc[-1]}%')
    print('-'*40)
print('Finished Training')

plt.plot(range(n_epochs), train_acc, c='r')
plt.plot(range(n_epochs), val_acc, c='b')

[1,     1] loss: 0.006708536148071289
[1, 20001] loss: 0.227436439037323
[1, 40001] loss: 0.22370239734649658
[1, 60001] loss: 0.22422976636886596
[1, 80001] loss: 0.2235818691253662
[1, 100001] loss: 0.22385192584991456
[1, 120001] loss: 0.2246911940574646
[1, 140001] loss: 0.22390100193023682
----------------------------------------
Train accuracy 26%
Test accuracy 25%
----------------------------------------
[2,     1] loss: 0.0056212182044982914
[2, 20001] loss: 0.22303049039840697
[2, 40001] loss: 0.22495844221115113
[2, 60001] loss: 0.2229218020439148
[2, 80001] loss: 0.22483359813690185
[2, 100001] loss: 0.2242973985671997
[2, 120001] loss: 0.22355601835250855
[2, 140001] loss: 0.22485375928878784
----------------------------------------
Train accuracy 26%
Test accuracy 25%
----------------------------------------
[3,     1] loss: 0.005762024879455567
[3, 20001] loss: 0.22305150318145753
[3, 40001] loss: 0.22294979572296142
[3, 60001] loss: 0.22356168508529664
[3, 80001] loss: 0

KeyboardInterrupt: 

In [217]:
print(model)

RNN(
  (word_embeddings): Embedding(502576, 128)
  (lstm): LSTM(128, 256, num_layers=3)
  (FC): Linear(in_features=256, out_features=28, bias=True)
)


In [245]:
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for i in range(0, len(X_train), 500):
        X_i, y_i = X_train[i: i+500], y_train[i: i+500]
        X_i = tokenizer(X_i, return_tensors='pt', padding='max_length', max_length=64, truncation=True)['input_ids']
        y_i = torch.Tensor(y_i).to('cuda')
        
        y_pred = torch.argmax(model(X_i.to('cuda')), dim=1)
        total += y_i.size(0)
        correct += (y_pred == y_i).sum().item()

print(f'Accuracy of the network on the {len(X_train)} sentences: {100 * correct // total} %')

Accuracy of the network on the 147857 sentences: 16 %


In [247]:
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for i in range(0, len(X_test), 500):
        X_i, y_i = X_test[i: i+500], y_test[i: i+500]
        X_i = tokenizer(X_i, return_tensors='pt', padding='max_length', max_length=64, truncation=True)['input_ids']
        y_i = torch.Tensor(y_i).to('cuda')
        
        y_pred = torch.argmax(model(X_i.to('cuda')), dim=1)
        total += y_i.size(0)
        correct += (y_pred == y_i).sum().item()
print(correct)
print(total)
print(f'Accuracy of the network on the {len(X_test)} sentences: {100 * correct // total} %')

9943
63368
Accuracy of the network on the 63368 sentences: 15 %


In [249]:
(y_pred == y_i)#.sum().item()

tensor([False, False, False, False, False,  True, False, False, False,  True,
         True, False, False, False, False, False, False, False, False, False,
        False, False,  True, False,  True, False,  True,  True, False, False,
        False, False, False, False, False,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False,  True, False, False, False, False, False, False, False,
         True, False, False, False, False, False, False, False,  True,  True,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False,  True, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, 

In [166]:
def get_acc(X, y, step, tokenizer, model):
    correct = 0
    total = 0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for i in range(0, len(X), step):
            X_i, y_i = X[i: i+step], y[i: i+step]
            X_i = tokenizer(X_i, return_tensors='pt', padding='max_length', max_length=64, truncation=True)['input_ids']
            y_i = torch.Tensor(y_i).to('cuda')
            
            y_pred = torch.argmax(model(X_i.to('cuda')), dim=1)
            total += y_i.size(0)
            correct += (y_pred == y_i).sum().item()
    
    #print(f'Accuracy of the network on the {len(X_test)} sentences: {100 * correct // total} %')
    
    return 100 * correct // total

In [None]:
len(y_pred[0])