### Software Requirements
- Python (>=3.6)
- PyTorch (>=1.2.0) 
- Jupyter (latest)

In [1]:
# all the necessary imports
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim
import torchtext
from torchtext.data import Field, LabelField
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# set the seed
manual_seed = 572
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


### Project Description

The Dataset: [CL-Aff shared task](https://sites.google.com/view/affcon2019/cl-aff-shared-task?authuser=0). HappyDB is a dataset of about 100,000 `happy moments` crowd-sourced via Amazon’s Mechanical Turk where each worker was asked to describe in a complete sentence `what made them happy in the past 24 hours`. Each user was asked to describe three such moments. 

The Task: sociality classification. (Here we only use labelled dataset which include 10,562 labelled samples)

The dataset has already been preprocessed (tokenization, removing URLs, mentions, hashtags and so on) and placed it under ``data/happy_db`` folder in three files as ``train.tsv``, ``dev.tsv`` and ``test.tsv``.

#### 1. Preprocessing

In [4]:
def whitespace_tokenize(text):
    return text.strip().split()

In [5]:
### Define TorchText's Fields for tweet text and label respectively to handle how data should be processed
TEXT = Field(sequential=True, tokenize=whitespace_tokenize, lower=True)
LABEL = Field(sequential=False, unk_token=None)

In [6]:
### Process tsv files using TabularDataset class and Fields
train, val, test = TabularDataset.splits(
               path="data/happy_db/", # the root directory where the data lies
               train='train.tsv', validation="dev.tsv", test="test.tsv",
               format='tsv',
               skip_header=True,
               fields=[('tweet', TEXT), ('label', LABEL)])

In [7]:
### Build vocabulary to map words and labels to integers
TEXT.build_vocab(train, max_size=5000)  # The maximum size of vocabulary is 5000
LABEL.build_vocab(train)

In [8]:
### Check the sizes of two vocabularies individually
TEXT_VOCAB_SIZE = len(TEXT.vocab.stoi)
LABEL_VOCAB_SIZE = len(LABEL.vocab.stoi)
print("The size of TEXT vocabularies:", TEXT_VOCAB_SIZE)  # the extra 2 vocabularies are padding and unknown
print("The size of LABEL vocabularies:", LABEL_VOCAB_SIZE)

The size of TEXT vocabularies: 5002
The size of LABEL vocabularies: 2


In [9]:
### Construct the Iterators to get the train, dev, and test splits
train_iter, val_iter, test_iter = BucketIterator.splits(
    (train, val, test), # we pass in the datasets we want the iterator to draw data from
    batch_sizes=(32,32,32), #batch size for Train, dev and Test, respectively.
    sort_key=lambda x: len(x.tweet),  # Samples are sorted by length.
    sort=True, # sorting examples in order to batch together examples with similar lengths and minimize padding. 
    sort_within_batch=True  # sorts the data within each minibatch in decreasing order according to the sort_key.
)

In [29]:
class LSTMmodel(nn.Module):
    '''LSTM classifier of the task'''
    def __init__(self, embedding_size, vocab_size, output_size, hidden_size, num_layers, dropout):
        super(LSTMmodel, self).__init__()
        
        # word embedding lookup table
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.embedding.weight.data.normal_(0, 0.05)  # The parameters of this embedding layer are randomly initialized from a normal distribution (mean 0 and variance 0.05)
        
        # core LSTM RNN module (uni-directional)
        self.lstm_rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers) 
        
        # activation function
        self.activation_fn = nn.Tanh()
        
        # classification related modules
        self.dropout = nn.Dropout(p=dropout)
        self.linear_layer = nn.Linear(in_features=hidden_size, out_features=output_size, bias=True) 
        self.softmax_layer = nn.LogSoftmax(dim=-1)  # normalize along batches
        self.debug = False

    def forward(self, x):
        '''define the forward propagation logic with debug mode'''
        if self.debug:
            print("input word indices shape = ", x.size())
        out = self.embedding(x)
        if self.debug:
            print("word embeddings shape = ", out.size())
        out, _ = self.lstm_rnn(out)
        # LSTM output size: (seq_len, batch, num_directions * hidden_size)
        if self.debug:
            print("LSTM RNN output (features from last layer of RNN for all timesteps) shape = ", out.size())
        out = out[-1]  # the last hidden of last layer in shape (batch, num_directions * hidden_size)
        if self.debug:
            print("Tweet embeddings or RNN output (features from last layer of RNN for the last timestep only) shape = ", out.size())
        out = self.activation_fn(out)
        out = self.dropout(out)
        if self.debug:
            print("Activation function output shape = ", out.size())
        out = self.linear_layer(out)
        if self.debug:
            print("linear layer output shape = ", out.size())
        out = self.softmax_layer(out)
        if self.debug:
            print("softmax layer output shape = ", out.size())
        return out

In [58]:
torch.manual_seed(manual_seed)
torch.cuda.manual_seed(manual_seed)

# hyperparameters
WORD_VEC_SIZE = 300  # represent each token in a 300-dimentional vector
HIDDEN_SIZE = 500  
MAX_EPOCHS = 10 # number of passes over the training data
NUM_LAYERS = 2
LEARNING_RATE = 0.001
DROPOUT = 0.1

# Instantiate the model with two uni-directional LSTM layers, each layer having 500 hidden units
model = LSTMmodel(embedding_size=WORD_VEC_SIZE, vocab_size=TEXT_VOCAB_SIZE, output_size=LABEL_VOCAB_SIZE, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, dropout=DROPOUT)
model.to(device)
print(model)

LSTMmodel(
  (embedding): Embedding(5002, 300)
  (lstm_rnn): LSTM(300, 500, num_layers=2)
  (activation_fn): Tanh()
  (dropout): Dropout(p=0.1, inplace=False)
  (linear_layer): Linear(in_features=500, out_features=2, bias=True)
  (softmax_layer): LogSoftmax(dim=-1)
)


In [59]:
### Create an SGD optimizer for training
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [60]:
### Training and Evaluation
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def train(loader, model, criterion, optimizer, device):
    total_loss = 0.0
    # iterate throught the data loader
    num_sample = 0
    for batch in loader:
        # load the current batch
        batch_input = batch.tweet
        batch_output = batch.label
        
        batch_input = batch_input.to(device)
        batch_output = batch_output.to(device)
        # forward propagation
        model_outputs = model(batch_input)
        # compute the loss
        cur_loss = criterion(model_outputs, batch_output)
        total_loss += cur_loss.item()

        # backward propagation
        # clear the buffer
        optimizer.zero_grad()
        # compute the gradients
        cur_loss.backward()
        # update the weights
        optimizer.step()

        num_sample += batch_output.shape[0]
    return total_loss/num_sample

# evaluation logic based on classification accuracy
def evaluate(loader, model, criterion, device):
    all_pred=[]
    all_label = []
    with torch.no_grad(): # impacts the autograd engine and deactivate it. reduces memory usage and speeds up computation
        for batch in loader:
             # load the current batch
            batch_input = batch.tweet
            batch_output = batch.label

            batch_input = batch_input.to(device)
            # forward propagation
            model_outputs = model(batch_input)
            # identify the predicted class for each example in the batch
            probabilities, predicted = torch.max(model_outputs.cpu().data, 1)
            # put all the true labels and predictions to two lists
            all_pred.extend(predicted)
            all_label.extend(batch_output)
            
    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro') 
    return accuracy,f1score

In [61]:
### start the training
best_f1 = 0
best_epoch = 0
for epoch in range(MAX_EPOCHS):
    # train the model for one pass over the data
    train_loss = train(train_iter,model,criterion,optimizer,device)  
    # compute the training accuracy
    train_acc, train_f1 = evaluate(train_iter,model,criterion,device)
    # compute the validation accuracy
    val_acc, val_f1 = evaluate(val_iter,model,criterion,device)
    
    if val_f1 > best_f1:
        best_f1 = val_f1
        best_epoch = epoch+1

        # save model, optimizer, and number of epoch to a dictionary
        model_save = {
            'epoch': epoch,  # number of epoch
            'model_state_dict': model.state_dict(),  # model parameters 
            'optimizer_state_dict': optimizer.state_dict(),  # save optimizer 
            'loss': train_loss  # training loss
        }

        # save model 
        torch.save(model_save, "./ckpt/best_model.pt")
    # print the loss for every epoch
    print('Epoch [{}/{}], Loss: {:.4f}, Training Accuracy: {:.4f}, Training F1: {:.4f}, Validation Accuracy: {:.4f}, Validation F1: {:.4f}'.format(epoch+1, MAX_EPOCHS, train_loss, train_acc, train_f1, val_acc, val_f1))
print("Best validation score for iterations #{}: {}".format(best_epoch,best_f1))

Epoch [1/10], Loss: 0.0114, Training Accuracy: 0.9259, Training F1: 0.9258, Validation Accuracy: 0.8873, Validation F1: 0.8869
Epoch [2/10], Loss: 0.0072, Training Accuracy: 0.9457, Training F1: 0.9456, Validation Accuracy: 0.8949, Validation F1: 0.8943
Epoch [3/10], Loss: 0.0051, Training Accuracy: 0.9618, Training F1: 0.9617, Validation Accuracy: 0.8873, Validation F1: 0.8869
Epoch [4/10], Loss: 0.0039, Training Accuracy: 0.9657, Training F1: 0.9656, Validation Accuracy: 0.8930, Validation F1: 0.8922
Epoch [5/10], Loss: 0.0035, Training Accuracy: 0.9667, Training F1: 0.9667, Validation Accuracy: 0.8788, Validation F1: 0.8785
Epoch [6/10], Loss: 0.0028, Training Accuracy: 0.9740, Training F1: 0.9739, Validation Accuracy: 0.8797, Validation F1: 0.8795
Epoch [7/10], Loss: 0.0023, Training Accuracy: 0.9587, Training F1: 0.9587, Validation Accuracy: 0.8627, Validation F1: 0.8626
Epoch [8/10], Loss: 0.0021, Training Accuracy: 0.9827, Training F1: 0.9827, Validation Accuracy: 0.8797, Valida

In [75]:
### Evaluate the best trained model on test set
model2 = LSTMmodel(embedding_size=WORD_VEC_SIZE, vocab_size=TEXT_VOCAB_SIZE, output_size=LABEL_VOCAB_SIZE, hidden_size=HIDDEN_SIZE, num_layers=NUM_LAYERS, dropout=DROPOUT)
model2.to(device)
# load checkpoint 
checkpoint = torch.load("./ckpt/best_model.pt")

# assign the parameters of checkpoint to this new model
model2.load_state_dict(checkpoint['model_state_dict'])
model2.to(device)

LSTMmodel(
  (embedding): Embedding(5002, 300)
  (lstm_rnn): LSTM(300, 500, num_layers=2)
  (activation_fn): Tanh()
  (dropout): Dropout(p=0.1, inplace=False)
  (linear_layer): Linear(in_features=500, out_features=2, bias=True)
  (softmax_layer): LogSoftmax(dim=-1)
)

In [86]:
# evaluate 
test_acc, test_f1 = evaluate(test_iter,model2,criterion,device)
print('Test Accuracy: {:.3f}%, Test F1-score: {:.3f}%'.format(test_acc*100, test_f1*100))

Test Accuracy: 88.352%, Test F1-score: 88.269%
