<a href="https://colab.research.google.com/github/andrea-gasparini/nlp-aspect-based-sentiment-analysis/blob/master/hw2/stud/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aspect-Based Sentiment Analysis (ABSA)

Second homework of the Natural Language Processing course 2021 @ Sapienza University of Rome.

Prof. Roberto Navigli

MSc in Computer Science

**Author**: Andrea Gasparini - 1813486

## Setup the environment

Note that this notebook can be directly executed in Colab by clicking the button above.

Otherwise, place the notebook in the `nlp2021-hw2/hw2` directory and run it from there.

In [1]:
import os
import sys

# if running on colab
if 'google.colab' in sys.modules:

	# set up GitHub
	GITHUB_TOKEN = ""
	GITHUB_USER = "andrea-gasparini"
	assert GITHUB_TOKEN != "" and GITHUB_USER != ""

	# clone the repository from GitHub
	! git clone https://{GITHUB_USER}:{GITHUB_TOKEN}@github.com/andrea-gasparini/nlp-aspect-based-sentiment-analysis
	! mv nlp-aspect-based-sentiment-analysis/hw2/* .
	! mv nlp-aspect-based-sentiment-analysis/requirements.txt .
	! rm -rf nlp-aspect-based-sentiment-analysis
	! pip install -r requirements.txt --quiet --no-cache-dir

	# mount drive directories
	from google.colab import drive
	drive.mount('/content/drive', force_remount=True)

	ROOT_DIR = '/content/drive/My Drive/data/nlp/hw2/'
	DATA_DIR = f'{ROOT_DIR}data/'
	MODELS_DIR = f'{ROOT_DIR}models/'
	EMBEDDINGS_DIR = f'{ROOT_DIR}embeddings/'

else:

	VALID_RELATIVE_CWD = os.path.join("nlp2021-hw2", "hw2")
	relative_cwd_last_two = os.path.sep.join(os.getcwd().split(os.path.sep)[-2:])
	assert relative_cwd_last_two == VALID_RELATIVE_CWD,\
		f"This notebook is supposed to be runned only from {VALID_RELATIVE_CWD} or Google Colab"

	ROOT_DIR = '../'
	DATA_DIR = f'{ROOT_DIR}data/'
	MODELS_DIR = f'{ROOT_DIR}model/'
	EMBEDDINGS_DIR = f'{MODELS_DIR}embeddings/'

assert os.path.isdir(ROOT_DIR), f"{ROOT_DIR} is not a valid directory"

# Create the working directories if they are missing. os.makedirs copes with
# spaces in the path (e.g. "My Drive") without any shell escaping, works on
# every platform and raises if a directory cannot be created.
for directory in (DATA_DIR, MODELS_DIR, EMBEDDINGS_DIR):
    os.makedirs(directory, exist_ok=True)

## Laptop datasets: train/dev JSON files expected inside DATA_DIR
LAPTOP_TRAIN_JSON, LAPTOP_DEV_JSON = (
    os.path.join(DATA_DIR, json_name)
    for json_name in ("laptops_train.json", "laptops_dev.json")
)
assert os.path.isfile(LAPTOP_TRAIN_JSON), (
    f"{LAPTOP_TRAIN_JSON} does not contain a valid train dataset"
)
assert os.path.isfile(LAPTOP_DEV_JSON), (
    f"{LAPTOP_DEV_JSON} does not contain a valid development dataset"
)

## Restaurant datasets: train/dev JSON files expected inside DATA_DIR
RESTAURANT_TRAIN_JSON, RESTAURANT_DEV_JSON = (
    os.path.join(DATA_DIR, json_name)
    for json_name in ("restaurants_train.json", "restaurants_dev.json")
)
assert os.path.isfile(RESTAURANT_TRAIN_JSON), (
    f"{RESTAURANT_TRAIN_JSON} does not contain a valid train dataset"
)
assert os.path.isfile(RESTAURANT_DEV_JSON), (
    f"{RESTAURANT_DEV_JSON} does not contain a valid development dataset"
)

### Imports

In [2]:
import os
import wandb
import torch
import pytorch_lightning as pl

from evaluate import read_dataset

from nltk import TreebankWordTokenizer
from pytorch_lightning.loggers import WandbLogger

from stud.dataset import ABSADataModule
from stud.pl_models import PlAspectTermsClassifier
from stud import utils

### Reproducibility stuff

In [3]:
SEED = 42 # @param {type:"integer"}

# Seed the random number generators through PyTorch Lightning.
pl.seed_everything(SEED)
# Make cuDNN select deterministic algorithms and disable its benchmark
# auto-tuner, which could otherwise pick different kernels from run to run.
# Note: this only affects cuDNN, other CUDA operations may still be non-deterministic.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

Global seed set to 42


## Wandb logging

In [4]:
# Weights & Biases projects, one per pair of tasks.
WANDB_PROJECT_AB = 'nlp_hw2-AB'
WANDB_PROJECT_CD = 'nlp_hw2-CD'
# The API key is read from the environment instead of being written in the
# notebook, so that it cannot be committed by mistake.
WANDB_KEY = os.environ.get("WANDB_API_KEY", "")
# An empty key is passed as None to let wandb fall back to its own lookup
# (already stored credentials or an interactive prompt).
wandb.login(key=WANDB_KEY or None)

[34m[1mwandb[0m: Currently logged in as: [33mandreagasparini[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/andrea/.netrc


True

## Data preparation

In [None]:
AUGMENT_TRAIN = False

# NLTK set-up is delegated to the project's utilities.
utils.nltk_downloads()

# Laptop and restaurant samples are concatenated into a single train
# collection and a single development collection.
train_samples = read_dataset(LAPTOP_TRAIN_JSON) + read_dataset(RESTAURANT_TRAIN_JSON)
dev_samples = read_dataset(LAPTOP_DEV_JSON) + read_dataset(RESTAURANT_DEV_JSON)

data_module = ABSADataModule(
    train_samples,
    dev_samples,
    tokenizer=TreebankWordTokenizer(),
    augment_train=AUGMENT_TRAIN,
)
data_module.setup()

# Pre-trained GloVe vectors, loaded against the text vocabulary built by the data module.
glove_embeddings = utils.load_pretrained_embeddings(
    "glove.6B.300d.txt",
    f"{EMBEDDINGS_DIR}glove/",
    data_module.vocabs["text"],
)

## Model parameters

In [None]:
MODE = "ab"

# Use the BERT weights stored under MODELS_DIR when that directory exists,
# otherwise fall back to the plain model name.
local_bert_dir = f"{MODELS_DIR}bert-base-cased"
if os.path.isdir(local_bert_dir):
    BERT_PRETRAINED_PATH = local_bert_dir
else:
    BERT_PRETRAINED_PATH = "bert-base-cased"

label_key = utils.get_label_key(MODE)

# Hyper-parameters handed to the model and reused below to build the run name.
hparams = dict(
    vocab_size=len(data_module.vocabs["text"]),
    hidden_dim=128,
    embedding_dim=glove_embeddings.shape[1],
    augment_train=AUGMENT_TRAIN,
    pos_embedding=False,

    bert_embedding=True,
    bert_finetuning=False,
    bert_layer_pooling_strategy="concat",
    bert_wordpiece_pooling_strategy="mean",
    bert_model_name_or_path=BERT_PRETRAINED_PATH,
    pack_lstm_input=True,
    label_vocab=data_module.vocabs[label_key],
    num_classes=len(data_module.vocabs[label_key]),
    bidirectional=True,
    num_layers=2,
    dropout=0.5,
    max_epochs=150,
    attention=False,
    attention_heads=2,
    attention_dropout=0.5,
    mode=MODE,
)

# The run name encodes the main architectural choices.
if hparams["bert_embedding"]:
    MODEL_NAME = f"BERT_{hparams['bert_layer_pooling_strategy']}"
    MODEL_NAME += "_finetuned" if hparams["bert_finetuning"] else ""
else:
    MODEL_NAME = "Baseline"
MODEL_NAME += "_attention" if hparams["attention"] else ""
MODEL_NAME += "_augmented" if hparams["augment_train"] else ""

# The checkpoint path is built before the "_batch_first" suffix is appended.
PATH = f"{MODELS_DIR}{MODEL_NAME}.ckpt"

MODEL_NAME += "_batch_first"
print(MODEL_NAME)

## Training

In [None]:
# Early stopping and checkpointing both track the same validation metric,
# which has to be maximised.
monitored_metric = 'valid_aspect_sentiment_evaluation_f1'

early_stopping = pl.callbacks.EarlyStopping(
    monitor=monitored_metric,
    patience=10,
    verbose=True,
    mode='max',
)

# Checkpoint file name template: the run name followed by placeholders
# for the epoch and the logged validation metrics.
checkpoint_filename = (MODEL_NAME
                       + '-{epoch}-{valid_loss:.4f}'
                       + '-{valid_aspect_sentiment_extraction_f1:.3f}'
                       + '-{valid_aspect_sentiment_evaluation_f1:.3f}')

check_point_callback = pl.callbacks.ModelCheckpoint(
    monitor=monitored_metric,
    verbose=True,
    save_top_k=2,
    save_last=False,
    mode='max',
    dirpath=MODELS_DIR,
    filename=checkpoint_filename,
)

wandb_logger = WandbLogger(offline=False, project=WANDB_PROJECT_AB, name=MODEL_NAME)

# One GPU when CUDA is available, CPU otherwise.
trainer = pl.Trainer(
    gpus=int(torch.cuda.is_available()),
    logger=wandb_logger,
    val_check_interval=1.0,
    max_epochs=hparams["max_epochs"],
    callbacks=[early_stopping, check_point_callback],
)

model = PlAspectTermsClassifier(
    hparams,
    embeddings=glove_embeddings,
    ignore_index=data_module.vocabs[label_key]["[PAD]"],
)
trainer.fit(model, datamodule=data_module)