<a href="https://colab.research.google.com/github/andrea-gasparini/nlp-aspect-based-sentiment-analysis/blob/master/hw2/stud/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aspect-Based Sentiment Analysis (ABSA)

Second homework of the Natural Language Processing course 2021 @ Sapienza University of Rome.

Prof. Roberto Navigli

MSc in Computer Science

**Author**: Andrea Gasparini - 1813486

## Setup the environment

Note that this notebook can be directly executed in Colab by clicking the button above.

Otherwise, place it in the `hw2` directory and run it from there.

In [1]:
import os
import sys

# if running on colab
if 'google.colab' in sys.modules:

	# set up GitHub
	GITHUB_TOKEN = ""
	GITHUB_USER = "andrea-gasparini"
	assert GITHUB_TOKEN != "" and GITHUB_USER != ""

	# clone the repository from GitHub
	! git clone https://{GITHUB_USER}:{GITHUB_TOKEN}@github.com/andrea-gasparini/nlp-aspect-based-sentiment-analysis
	! mv nlp-aspect-based-sentiment-analysis/hw2/* .
	! mv nlp-aspect-based-sentiment-analysis/requirements.txt .
	! rm -rf nlp-aspect-based-sentiment-analysis
	! pip install -r requirements.txt --quiet --no-cache-dir

	# mount drive directories
	from google.colab import drive
	drive.mount('/content/drive', force_remount=True)

	ROOT_DIR = '/content/drive/My Drive/data/nlp/hw2/'
	DATA_DIR = f'{ROOT_DIR}data/'
	MODELS_DIR = f'{ROOT_DIR}models/'
	EMBEDDINGS_DIR = f'{ROOT_DIR}embeddings/'

else:

	relative_cwd_last_two = os.path.sep.join(os.getcwd().split(os.path.sep)[-2:])
	assert os.path.basename(relative_cwd_last_two) == "hw2",\
		f"This notebook is supposed to be runned only from \"hw2/\" or Google Colab, not from {relative_cwd_last_two}/"

	ROOT_DIR = '../'
	DATA_DIR = f'{ROOT_DIR}data/'
	MODELS_DIR = f'{ROOT_DIR}model/'
	EMBEDDINGS_DIR = f'{MODELS_DIR}embeddings/'

assert os.path.isdir(ROOT_DIR), f"{ROOT_DIR} is not a valid directory"

# create the working directories (no-op when they already exist); done in pure Python
# so that paths containing spaces (e.g. "My Drive") need no shell escaping and the
# `dir` builtin is not shadowed by the loop variable
for directory in (DATA_DIR, MODELS_DIR, EMBEDDINGS_DIR):
	os.makedirs(directory, exist_ok=True)

# NOTE(review): presumably silences the HuggingFace tokenizers fork warning raised when
# DataLoader workers are spawned — confirm against the project code
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Location of the pre-trained static (GloVe) embeddings
GLOVE_FILENAME = "glove.6B.300d.txt"
GLOVE_DIR = f"{EMBEDDINGS_DIR}glove/"

# Train / development splits of the two domains (laptops and restaurants)
LAPTOP_TRAIN_JSON = os.path.join(DATA_DIR, "laptops_train.json")
LAPTOP_DEV_JSON = os.path.join(DATA_DIR, "laptops_dev.json")
RESTAURANT_TRAIN_JSON = os.path.join(DATA_DIR, "restaurants_train.json")
RESTAURANT_DEV_JSON = os.path.join(DATA_DIR, "restaurants_dev.json")

# Fail early, with a readable message, when one of the dataset files is missing
assert os.path.isfile(LAPTOP_TRAIN_JSON), f"{LAPTOP_TRAIN_JSON} does not contain a valid train dataset"
assert os.path.isfile(LAPTOP_DEV_JSON), f"{LAPTOP_DEV_JSON} does not contain a valid development dataset"
assert os.path.isfile(RESTAURANT_TRAIN_JSON), f"{RESTAURANT_TRAIN_JSON} does not contain a valid train dataset"
assert os.path.isfile(RESTAURANT_DEV_JSON), f"{RESTAURANT_DEV_JSON} does not contain a valid development dataset"

### Imports

In [2]:
import os
import wandb
import torch
import torchtext
import pytorch_lightning as pl

from evaluate import read_dataset

from nltk import TreebankWordTokenizer
from pytorch_lightning.loggers import WandbLogger

from stud.dataset import ABSADataModule
from stud.pl_models import PlAspectClassifier
from stud import utils, constants as const

### Reproducibility

In [3]:
# Global seed handed to pl.seed_everything; the trailing "@param" comment is a Colab
# form annotation and has to stay on the same line as the assignment.
SEED = 42 # @param {type:"integer"}

# prints "Global seed set to 42" (see the cell output)
pl.seed_everything(SEED)
# NOTE(review): this flag only constrains cuDNN kernels; other PyTorch ops may still be
# non-deterministic (see torch.use_deterministic_algorithms for a stricter setting)
torch.backends.cudnn.deterministic = True  # cuDNN will use only deterministic algorithms

Global seed set to 42


### Wandb logging

In [4]:
# Weights & Biases project names; presumably one for the aspect tasks ("ab" mode)
# and one for the category tasks ("cd" mode) — TODO confirm
WANDB_PROJECT_AB = 'nlp_hw2-AB'
WANDB_PROJECT_CD = 'nlp_hw2-CD'
# API key passed to wandb.login; left empty here.
# NOTE(review): never commit a real key in this cell — prefer the WANDB_API_KEY
# environment variable or an interactive login.
WANDB_KEY=""
# last expression of the cell: its return value (True in the recorded output) is displayed
wandb.login(key=WANDB_KEY)

[34m[1mwandb[0m: Currently logged in as: [33mandreagasparini[0m (use `wandb login --relogin` to force relogin)


True

## Data preparation

In [5]:
# fetch the NLTK resources used by the project (the recorded output shows the
# "averaged_perceptron_tagger" package being checked)
utils.nltk_downloads()

# the laptop and restaurant domains are merged into single train / dev sample lists
train_samples = read_dataset(LAPTOP_TRAIN_JSON) + read_dataset(RESTAURANT_TRAIN_JSON)
dev_samples = read_dataset(LAPTOP_DEV_JSON) + read_dataset(RESTAURANT_DEV_JSON)

data_module = ABSADataModule(
    train_samples,
    dev_samples,
    tokenizer=TreebankWordTokenizer(),
    batch_size=8,
    num_workers=4,
    has_category=False,
)
data_module.setup()

# download the GloVe vectors only when the text file is not already in the cache directory
glove_path = f"{GLOVE_DIR}{GLOVE_FILENAME}"
if not os.path.isfile(glove_path):
    torchtext.vocab.GloVe(name="6B", dim=300, cache=GLOVE_DIR)

# embeddings loaded for the "text" vocabulary built by the data module
text_vocab = data_module.vocabs["text"]
glove_embeddings = utils.load_pretrained_embeddings(GLOVE_FILENAME, GLOVE_DIR, text_vocab)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/andrea/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


INFO: Loading vectors from ../model/embeddings/glove/glove.6B.300d.txt.pt
INFO: Loading vectors from ../model/embeddings/glove/glove.6B.300d.txt.pt


initialised 7671 embeddings
randomly initialised 851 embeddings


## Model parameters

In [6]:
# Task mode: selects the label vocabulary below and is stored in the hyper-parameters
MODE = "ab"

# use the local copy of the BERT weights when it exists, otherwise the plain model name
# (presumably resolved through the HuggingFace hub — TODO confirm)
local_bert_dir = f"{MODELS_DIR}bert-base-cased"
BERT_PRETRAINED_PATH = local_bert_dir if os.path.isdir(local_bert_dir) else "bert-base-cased"

label_key = utils.get_label_key(MODE)
label_vocab = data_module.vocabs[label_key]

# hyper-parameters handed to the model; sizes are derived from the vocabularies and
# from the loaded GloVe matrix so that they always match the prepared data
hparams = {
    "vocab_size": len(data_module.vocabs["text"]),
    "hidden_dim": 128,
    "embedding_dim": glove_embeddings.shape[1],
    "pos_embedding": False,
    "pos_embedding_dim": 120,
    "pos_vocab_size": len(data_module.vocabs["pos"]),
    "bert_embedding": True,
    "bert_finetuning": False,
    "bert_layers_to_merge": [-1, -2, -3, -4],
    "bert_layer_pooling_strategy": "mean",
    "bert_wordpiece_pooling_strategy": "mean",
    "bert_model_name_or_path": BERT_PRETRAINED_PATH,
    "pack_lstm_input": True,
    "label_vocab": label_vocab,
    "num_classes": len(label_vocab),
    "bidirectional": True,
    "num_layers": 2,
    "dropout": 0.5,
    "max_epochs": 150,
    "attention": False,
    "attention_heads": 12,
    "attention_dropout": 0.2,
    "attention_concat": False,
    "attention_simple": False,
    "mode": MODE,
    "batch_size": data_module.batch_size,
}

# human-readable run name assembled from the enabled components,
# e.g. "ab_BiLSTM + GloVe + BERT_mean" for the configuration above
name_parts = ["BiLSTM" if hparams["bidirectional"] else "LSTM", " + GloVe"]
if hparams["bert_embedding"]:
    name_parts.append(f" + BERT_{hparams['bert_layer_pooling_strategy']}")
    if hparams["bert_finetuning"]:
        name_parts.append(" finetuned")
if hparams["pos_embedding"]:
    name_parts.append(" + POS")
if hparams["attention"]:
    name_parts.append(" + attention")
    if not hparams["attention_simple"]:
        name_parts.append(" transformer")
    if hparams["attention_concat"]:
        name_parts.append(" + concat")

MODEL_NAME = "".join(name_parts)
MODEL_NAME = f"{MODE}_{MODEL_NAME}"

print(MODEL_NAME)

## Training

In [None]:
# Both callbacks follow the same validation metric (higher is better).
# NOTE(review): the metric name is aspect-specific; a "cd" run may need a different
# monitored metric — confirm against the metrics logged by the model.
early_stopping = pl.callbacks.EarlyStopping(monitor='valid_aspect_polarity_classification_f1',
                                            patience=10,
                                            verbose=True,
                                            mode='max')

# keep the two best checkpoints; the file name embeds the epoch and the validation scores
check_point_callback = pl.callbacks.ModelCheckpoint(monitor='valid_aspect_polarity_classification_f1',
                                                    verbose=True,
                                                    save_top_k=2,
                                                    save_last=False,
                                                    mode='max',
                                                    dirpath=MODELS_DIR,
                                                    filename=MODEL_NAME + '-{epoch}-{valid_loss:.4f}-{valid_aspect_identification_f1:.3f}-{valid_aspect_polarity_classification_f1:.3f}')

# log to the W&B project matching the task mode, instead of always using the AB one
wandb_project = WANDB_PROJECT_CD if MODE == "cd" else WANDB_PROJECT_AB
wandb_logger = WandbLogger(offline=False, project=wandb_project, name=MODEL_NAME)

# train on one GPU when available, otherwise on CPU; validation runs once per epoch
trainer = pl.Trainer(gpus=1 if torch.cuda.is_available() else 0,
                     logger=wandb_logger,
                     val_check_interval=1.0,
                     max_epochs=hparams["max_epochs"],
                     callbacks=[early_stopping, check_point_callback])

model = PlAspectClassifier(hparams,
                           embeddings=glove_embeddings,
                           ignore_index=const.PAD_INDEX)

# always close the W&B run, even when training raises, so that no run is left
# open in the kernel and picked up by a later cell
try:
    trainer.fit(model, datamodule=data_module)
finally:
    wandb.finish()

## Testing

In [8]:
import evaluate

# freeze the trained model before running it on the validation data
model.freeze()

# flatten the per-batch predictions into a single list aligned with the validation samples
predictions = [prediction
               for batch in data_module.val_dataloader()
               for prediction in model.predict(batch)]

SEPARATOR = "_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n"
val_samples = data_module.val_samples
mode = model.hparams.mode

# the evaluation functions print their own report; which ones are called depends on the mode
if mode == "ab":
    print('MODEL: ASPECT SENTIMENT + ASPECT EXTRACTION\n')
    evaluate.evaluate_extraction(val_samples, predictions)
    print(SEPARATOR)
    evaluate.evaluate_sentiment(val_samples, predictions)
    print('-------------------------------------------------------\n')
elif mode == "cd":
    print('MODEL: CATEGORY SENTIMENT + CATEGORY EXTRACTION\n')
    evaluate.evaluate_sentiment(val_samples, predictions, 'Category Extraction')
    print(SEPARATOR)
    evaluate.evaluate_sentiment(val_samples, predictions, 'Category Sentiment')

  f"Attribute {k!r} is an instance of `nn.Module` and is already saved during checkpointing."


{'attention': False,
 'attention_concat': False,
 'attention_dropout': 0.2,
 'attention_heads': 12,
 'attention_simple': True,
 'augment_train': False,
 'batch_size': 8,
 'bert_embedding': True,
 'bert_finetuning': False,
 'bert_layer_pooling_strategy': 'second_to_last',
 'bert_layers_to_merge': [-1, -2, -3, -4],
 'bert_model_name_or_path': '../model/bert-base-cased',
 'bert_wordpiece_pooling_strategy': 'mean',
 'bidirectional': True,
 'dropout': 0.5,
 'embedding_dim': 300,
 'hidden_dim': 128,
 'label_vocab': Vocab(),
 'max_epochs': 150,
 'mode': 'ab',
 'num_classes': 10,
 'num_layers': 2,
 'pack_lstm_input': True,
 'polarity_vocab': None,
 'pos_embedding': False,
 'pos_embedding_dim': 120,
 'pos_vocab_size': 45,
 'vocab_size': 8522}
MODEL: ASPECT SENTIMENT + ASPECT EXTRACTION

Aspect Extraction Evaluation
	Aspects	 TP: 906;	FP: 230;	FN: 176
		precision: 79.75;	recall: 83.73;	f1: 81.70
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

Aspect Sentiment Evaluation

	ALL	 TP: 661;	