### Imports and setting CONSTANTS


In [3]:
import torch.nn as nn
from torch.autograd import Variable
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

device = (
    "cuda"
    if torch.cuda.is_available()
    else "cpu"
)

torch.cuda.empty_cache()

NUM_TOKENS = 10000
EMBEDDING_SIZE = 300
HIDDEN_SIZE = 150
TARGET_SIZE = 2
BATCH_SIZE = 64
MODEL_ID = "projecte-aina/FLOR-6.3B"
NUM_LAYERS = 107
MAX_LENGTH = 107
# NUM_TOKENS = 10000
# EMBEDDING_SIZE = 300
# HIDDEN_SIZE = 300
# TARGET_SIZE = 2
# BATCH_SIZE = 20
# MODEL_ID = "projecte-aina/FLOR-6.3B"
# NUM_LAYERS = 107
# MAX_LENGTH = 107

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

### Custom LSTM Neural network

In [4]:
class CustomLSTM(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, target_size, num_layers, batch_size):
        super(CustomLSTM, self).__init__()
        
        self.embeddings = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        # self.lstm = nn.LSTM(input_size=embedding_size,hidden_size=hidden_size, num_layers=num_layers, bidirectional=True)
        self.lstm = nn.LSTM(input_size=embedding_size,hidden_size=hidden_size, num_layers=num_layers, bidirectional=False)
        # self.lastlayer = nn.Linear(hidden_size*2, target_size)
        self.lastlayer = nn.Linear(hidden_size, target_size)
      #   self.lastlayer = nn.Linear(600, 107)
       


  def forward(self, sentence):
      x = self.embeddings(sentence)
      yt, ht = self.lstm(x)
      # print("yt_shape:",yt.shape)
      yt = yt[:,-1,:]
      # print("yt_shape:",yt.shape)
      logits = self.lastlayer(yt)
      
      return logits, ht
    

### Custom Dataset

In [5]:
class MoviesDataset(Dataset):
    def __init__(self, x, y, model_max_length):
        self.x = x
        self.y = y
        self.model_max_length = model_max_length

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        text = self.x[idx]
        text_tokenized = tokenizer.encode(text, padding='max_length', max_length=self.model_max_length , truncation=True, return_tensors='pt', )
        
        return text_tokenized[0], self.y[idx]

### Training

In [6]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)

    for batch, (X, y) in enumerate(dataloader):
        # X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred, _ = model(X)


        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 10 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
    return loss

### Prediction

In [7]:
def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            # X, y = X.to(device), y.to(device)
            
            pred, _ = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

### Main method starting point

In [8]:
def main(batch_size, input_size, embedding_size, hidden_size, target_size, num_layers, epochs=None):
    dataset = pd.read_csv(
        "../datasets/IMDB-Dataset.csv", usecols=["review", "sentiment"]
    )
    dataset["sentiment"] = dataset["sentiment"].map({"positive": 1, "negative": 0})
    train, test = train_test_split(dataset, test_size=0.2)
    learning_rate = 0.003

    model = CustomLSTM(input_size, embedding_size, hidden_size, target_size, num_layers, batch_size)
    

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    train_dataset = MoviesDataset(train['review'].values, train['sentiment'].values, model_max_length=num_layers)
    test_dataset = MoviesDataset(test['review'].values, test['sentiment'].values, model_max_length=num_layers)

    
    train_dataloader = DataLoader(train_dataset, batch_size , shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size, shuffle=True)

    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")

        train_loop(train_dataloader, model, loss_fn, optimizer)
        test_loop(test_dataloader, model, loss_fn)


main(
    batch_size=BATCH_SIZE,
    input_size=50257,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    target_size=TARGET_SIZE,
    num_layers=NUM_LAYERS,
    epochs=10,
)

0
loss: 0.695429  [    0/40000]
1
2
3
4
5
6
7
8
9
10
loss: 0.696505  [  640/40000]
11
12
13
14
15
16
17
18
19
20
loss: 0.677874  [ 1280/40000]
21
22
23
24
25
26
27
28
29
30
loss: 0.700346  [ 1920/40000]
31
32
33
34
35
36
37
38
39
40
loss: 0.692694  [ 2560/40000]
41
42
43
44
45
46
47
48
49
50
loss: 0.692242  [ 3200/40000]
51
52
53
54
55
56
57
58
59
60
loss: 0.695620  [ 3840/40000]
61
62
63
64
65
66
67
68
69
70
loss: 0.692510  [ 4480/40000]
71
72
73
74
75
76
77
78
79
80
loss: 0.691266  [ 5120/40000]
81
82
83
84
85
86
87
88
89
90
loss: 0.693606  [ 5760/40000]
91
92
93
94
95
96
97
98
99
100
loss: 0.690974  [ 6400/40000]
101
102
103
104
105
106
107
108
109
110
loss: 0.692873  [ 7040/40000]
111
112
113
114
115
116
117
118
119
120
loss: 0.692388  [ 7680/40000]
121
122
123
124
125
126
127
128
129
130
loss: 0.694931  [ 8320/40000]
131
132
133
134
135
136
137
138
139
140
loss: 0.692915  [ 8960/40000]
141
142
143
144
145
146
147
148
149
150
loss: 0.694522  [ 9600/40000]
151
152
153
154
155
156
15