In [0]:
import torchtext
from torchtext import data
import spacy
import re

In [0]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torchtext import vocab

In [0]:
nlp = spacy.load('en', disable=['parser', 'tagger', 'ner'])

In [0]:
def tokenizer(s): 
    return [w.text.lower() for w in nlp(tweet_clean(s))]

def tweet_clean(text):
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric character
    text = re.sub(r'https?:/\/\S+', ' ', text) # remove links
    return text.strip()

In [0]:
txt_field = data.Field(tokenize=tokenizer, include_lengths=True)
label_field = data.Field(sequential=False,use_vocab=False, pad_token=None,unk_token=None)

In [0]:
train_val_fields = [
    ('ItemID', None), # we dont need this, so no processing
    ('Sentiment', label_field), # process it as label
    ('SentimentSource', None), # we dont need this, so no processing
    ('SentimentText', txt_field) # process it as text
]

In [43]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#tds = data.TabularDataset(path="/content/drive/My Drive/data/Sentiment Analysis Dataset.csv", format="csv", fields=train_val_fields,skip_header=True)
trainds, valds = data.TabularDataset.splits(path="/content/drive/My Drive/data", format="csv", train="Sentiment Analysis Dataset.csv", validation="Sentiment Analysis Dataset.csv", fields=train_val_fields,skip_header=True)

In [0]:
class TextSentiment(nn.Module):
  def __init__(self, vocab_size, embed_dim, num_class):
    super().__init__()
    self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
    self.fc = nn.Linear(embed_dim, num_class)
    self.init_weights()

  def init_weights(self):
    initrange = 0.5
    self.embedding.weight.data.uniform_(-initrange, initrange)
    self.fc.weight.data.uniform_(-initrange, initrange)
    self.fc.bias.data.zero_()

  def forward(self, text, offsets):
    embedded = self.embedding(text, offsets)
    return self.fc(embedded)

In [0]:
txt_field.build_vocab(trainds, max_size=100000)

In [0]:
VOCAB_SIZE = len(txt_field.vocab)
EMBED_DIM = 32
NUN_CLASS = 2
model = TextSentiment(VOCAB_SIZE, EMBED_DIM, NUN_CLASS)

In [0]:
def generate_batch(batch):
  label = torch.tensor([int(entry.Sentiment) for entry in batch], dtype=torch.long)
  text = []
  for entry in batch:
    temp = torch.tensor([txt_field.vocab.stoi[w] for w in entry.SentimentText], dtype=torch.long)
    text.append(temp)

  offsets = [0] + [len(entry.SentimentText) for entry in batch]
  
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  text = torch.cat(text)
  return text, offsets, label

In [0]:
def train_func(sub_train_):
  train_loss = 0
  train_acc = 0

  data = DataLoader(trainds, batch_size=32, shuffle=True,
                      collate_fn=generate_batch)

  for i, (text, offsets, cls) in enumerate(data):
    optimizer.zero_grad()
    text, offsets, cls = text, offsets, cls
    output = model(text, offsets)
    loss = criterion(output, cls)
    train_loss += loss.item()
    loss.backward()
    optimizer.step()
    train_acc += (output.argmax(1) == cls).sum().item()
  return train_loss / len(sub_train_), train_acc / len(sub_train_)


In [0]:
import time
import torch.optim as optim

N_EPOCHS = 5
min_valid_loss = float('inf')

#criterion = nn.CrossEntropyLoss()
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=.001)

In [94]:
for epoch in range(N_EPOCHS):

  start_time = time.time()
  train_loss, train_acc = train_func(trainds)

  secs = int(time.time() - start_time)
  mins = secs / 60
  secs = secs % 60

  print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
  print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')

Epoch: 1  | time in 1 minutes, 55 seconds
	Loss: 0.0216(train)	|	Acc: 52.6%(train)
Epoch: 2  | time in 1 minutes, 45 seconds
	Loss: 0.0211(train)	|	Acc: 58.3%(train)
Epoch: 3  | time in 1 minutes, 45 seconds
	Loss: 0.0207(train)	|	Acc: 60.6%(train)
Epoch: 4  | time in 1 minutes, 40 seconds
	Loss: 0.0204(train)	|	Acc: 62.1%(train)
Epoch: 5  | time in 1 minutes, 39 seconds
	Loss: 0.0201(train)	|	Acc: 63.4%(train)
