In [56]:
! pip install unidecode



In [57]:
import torch
import torch.nn as nn

seed = 1
torch.manual_seed(seed)

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
import unidecode

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
from google.colab import drive

while not os.path.exists('/content/drive/MyDrive/datasets/dataset'):
  drive.mount('/content/drive')
  %cd /content/drive/MyDrive/datasets/dataset



In [59]:
dataset_path = 'all-data.csv'
headers = ['sentiment', 'content']
df = pd.read_csv(
    dataset_path,
    names=headers,
    encoding='ISO-8859-1'
)

In [60]:
classes = {
    class_name : idx for idx, class_name in enumerate(df['sentiment'].unique().tolist())
}
df['sentiment'] = df['sentiment'].apply(lambda x: classes[x])

In [61]:
english_stop_words = stopwords.words('english')
stemmer = PorterStemmer()

def text_normalize(text):
  text = text.lower()
  text = unidecode.unidecode(text)
  text = text.strip()
  text = re.sub(r'[^\w\s]', '', text)
  text = ' '.join([word for word in text.split(' ') if word not in english_stop_words])
  text = ' '.join([stemmer.stem(word) for word in text.split(' ')])
  return text

df['content'] = df['content'].apply(lambda x: text_normalize(x))

In [62]:
vocab = []
for sentence in df['content'].tolist():
    tokens = sentence.split()
    for token in tokens:
        vocab.append(token)
vocab.append('UNK')
vocab.append('PAD')
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
vocal_size = len(vocab)

In [63]:
def transform(text, word_to_idx, max_seq_len):
  tokens = []
  for w in text.split():
    try:
      w_ids = word_to_idx[w]
    except:
      w_ids = word_to_idx['UNK']
    tokens.append(w_ids)

  if len(tokens) < max_seq_len:
    tokens += [word_to_idx['PAD']] * (max_seq_len - len(tokens))

  elif len(tokens) > max_seq_len:
    tokens = tokens[:max_seq_len]

  return tokens

In [64]:
val_size = 0.2
test_size = 0.125
is_shuffle = True
texts = df['content'].tolist()
labels = df['sentiment'].tolist()

X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=val_size,
    random_state=seed,
    shuffle=is_shuffle
)

X_val, X_test, y_val, y_test = train_test_split(
    X_val,
    y_val,
    test_size=val_size,
    random_state=seed,
    shuffle=is_shuffle
)

In [65]:
class FinacialNews(Dataset):
  def __init__(self, X, y, word_to_idx, max_seq_len, transform=None):
    self.texts = X
    self.labels = y
    self.word_to_idx = word_to_idx
    self.max_seq_len = max_seq_len
    self.transform = transform

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, idx):
    text = self.texts[idx]
    label = self.labels[idx]

    if self.transform:
      text = self.transform(
          text,
          self.word_to_idx,
          self.max_seq_len
          )
    text = torch.tensor(text)

    return text, label

In [66]:
max_seq_len = 32

train_dataset = FinacialNews(
    X_train, y_train,
    word_to_idx = word_to_idx,
    max_seq_len = max_seq_len,
    transform=transform
)

val_dataset = FinacialNews(
    X_val,
    y_val,
    word_to_idx = word_to_idx,
    max_seq_len = max_seq_len,
    transform=transform
)

test_dataset = FinacialNews(
    X_test,
    y_test,
    word_to_idx = word_to_idx,
    max_seq_len = max_seq_len,
    transform=transform
)

train_batch_size = 128
test_batch_size = 8

train_loader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=test_batch_size,
    shuffle=False
)

test_loader = DataLoader(
    test_dataset,
    batch_size=test_batch_size,
    shuffle=False
)

In [67]:
class SentimentClassifier(nn.Module):
  def __init__(
      self,
      vocab_size, embedding_dim, hidden_size, n_layers, n_classes, dropout_prob
  ):
    super(SentimentClassifier, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.bilstm = nn.LSTM(embedding_dim, hidden_size, n_layers, batch_first = True, bidirectional = True)
    self.norm = nn.LayerNorm(hidden_size*2)
    self.dropout = nn.Dropout(dropout_prob)
    self.fc1 = nn.Linear(hidden_size*2, 16)
    self.relu = nn.ReLU()
    self.fc2 = nn.Linear(16, n_classes)

  def forward(self, x):
    x = self.embedding(x)
    x, hn = self.bilstm(x)
    x = x[:, -1, :]
    x = self.norm(x)
    x = self.dropout(x)
    x = self.fc1(x)
    x = self.relu(x)
    x = self.fc2(x)
    return x


In [68]:
n_classes = len(list(classes.keys()))
embedding_dim = 64
hidden_size = 64
n_layers = 2
dropout_prob = 0.2
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SentimentClassifier(
    vocal_size,
    embedding_dim,
    hidden_size,
    n_layers,
    n_classes,
    dropout_prob
).to(device)

In [69]:
lr = 1e-4
epochs = 50

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [70]:
def fit(model, train_loader, val_loader, criterion, optimizer, device, epochs):
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        batch_train_losses = []

        model.train()
        for idx, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            batch_train_losses.append(loss.item())

        train_loss = sum(batch_train_losses) / len(batch_train_losses)
        train_losses.append(train_loss)

        val_loss, val_acc = evaluate(model, val_loader, criterion, device)
        val_losses.append(val_loss)

        print(f'Epoch {epoch + 1}/{epochs}, train loss: {train_loss}, val loss: {val_loss}')
    return train_losses, val_losses

def evaluate(model, dataloader, criterion, device):
    model.eval()
    correct = 0
    total = 0
    losses = []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            losses.append(loss.item())
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    loss = sum(losses) / len(losses)
    acc = correct / total

    return loss, acc

In [71]:
train_losses, val_losses = fit(model, train_loader, val_loader, criterion, optimizer, device, epochs)

Epoch 1/50, train loss: 1.0717612101185707, val loss: 0.9923506796974497
Epoch 2/50, train loss: 0.9522027661723476, val loss: 0.9260871348921785
Epoch 3/50, train loss: 0.9284435741363033, val loss: 0.924053405978016
Epoch 4/50, train loss: 0.9257329894650367, val loss: 0.9236249512003869
Epoch 5/50, train loss: 0.9256860402322584, val loss: 0.9234840703993729
Epoch 6/50, train loss: 0.9278841922360082, val loss: 0.9231370023845398
Epoch 7/50, train loss: 0.9254538609135535, val loss: 0.9226802089779648
Epoch 8/50, train loss: 0.9307247361829204, val loss: 0.9222519108929585
Epoch 9/50, train loss: 0.9297684604121793, val loss: 0.9220819252053487
Epoch 10/50, train loss: 0.9212089892356626, val loss: 0.9219576058928499
Epoch 11/50, train loss: 0.9231208620532867, val loss: 0.9214286902516159
Epoch 12/50, train loss: 0.9229273853763458, val loss: 0.9216717859518897
Epoch 13/50, train loss: 0.9241092358866045, val loss: 0.9211549279616051
Epoch 14/50, train loss: 0.9196426368528797, val

In [72]:
val_loss, val_acc = evaluate(model, val_loader, criterion, device)
test_loss, test_acc = evaluate(model, test_loader, criterion, device)

print(f'Val loss: {val_loss}, Val acc: {val_acc}')
print(f'Test loss: {test_loss}, Test acc: {test_acc}')

Val loss: 0.8868538445418643, Val acc: 0.6288659793814433
Test loss: 0.8647020983695984, Test acc: 0.6443298969072165
