# Reference

*   Original Tutorial: http://mlexplained.com/2019/01/30/an-in-depth-tutorial-to-allennlp-from-basics-to-elmo-and-bert/
*   Original Colab: https://colab.research.google.com/github/dudeperf3ct/DL_notebooks/blob/master/tl_nlp/tl_nlp_allennlp.ipynb#scrollTo=pDBy4oPBOftc
*   Modified ELMO Colab for text classification: https://github.com/keitakurita/Practical_NLP_in_PyTorch/blob/master/allennlp/elmo_text_classification.ipynb

#### Other references:

*   http://www.realworldnlpbook.com/blog/improving-sentiment-analyzer-using-elmo.html
*   https://dudeperf3ct.github.io/nlp/transfer/learning/2019/02/22/Power-of-Transfer-Learning-in-NLP/#elmo
*   https://allennlp.org/tutorials


In [None]:
#!pip install -q allennlp==0.8.2

import allennlp
allennlp.__version__

'0.8.2'

In [None]:
import spacy
spacy.__version__

'2.0.18'

In [None]:
import torch
torch.__version__

'1.6.0+cu101'

In [None]:
!nvidia-smi

Mon Aug 31 11:28:40 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   67C    P8    11W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Select the first GPU only when CUDA is actually present, so the notebook
# also runs on a CPU-only runtime (later cells fall back to cuda_device=-1).
if torch.cuda.is_available():
    torch.cuda.set_device(0)

In [None]:
# https://spacy.io/models/pt

# Fetch the Portuguese spaCy model under both its shortcut name and its full name.
! python -m spacy download pt
! python -m spacy download pt_core_news_sm

# spaCy model name handed to the word splitter used by the tokenizer further down.
cur_language = 'pt_core_news_sm'


[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/pt_core_news_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/pt

    You can now load the model via spacy.load('pt')


[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/pt_core_news_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/pt_core_news_sm

    You can now load the model via spacy.load('pt_core_news_sm')



## e-SIC Dataset

Code adapted from: [Practical NLP in PyTorch](https://github.com/keitakurita/Practical_NLP_in_PyTorch)

ELMo paper: [Deep contextualized word representations](https://arxiv.org/pdf/1802.05365.pdf)


In [None]:
from pathlib import Path
from typing import *
import os
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
from functools import partial
from overrides import overrides

from allennlp.data import Instance
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.nn import util as nn_util
from allennlp.common.checks import ConfigurationError

from datetime import datetime

# True when PyTorch can see a CUDA device; later cells use it to pick the device.
USE_GPU = torch.cuda.is_available()
print(USE_GPU)

True


In [None]:
import pandas as pd

#Variables
# Experiment settings gathered in one place; later cells read them via params.get(...).
params = dict(
    exp='Clareza-Balanced-Multiclass-COH-METRIX',  # experiment name, also used to build the dataset path
    data='resp-text',                              # name of the text column
    label='Clareza',                               # name of the target column
    BATCH_SIZE=64,
    MAX_LEN=128,
    lr=3e-4,
    epochs=10,
    hidden_sz=64,
    max_vocab_size=10000,
)

In [None]:
from google.colab import drive

# Mount Google Drive; the dataset and output paths used below live under /content/gdrive.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load Data
# The experiment name doubles as the dataset folder name and encodes the task type.
exp = params.get('exp')
binary = 'Binary' in exp

base_path = '/content/gdrive/My Drive/Colab Notebooks/Simple/Datasets/' + exp
save_path = '/content/gdrive/My Drive/Colab Notebooks/Simple/' + exp + '/output/'

# Column names of the text and of the target inside the CSV files.
data, label = params.get('data'), params.get('label')

x_train_file, y_train_file = 'X_train.csv', 'y_train.csv'
x_test_file, y_test_file = 'X_test.csv', 'y_test.csv'

# All four splits share the same CSV dialect (semicolon separated, UTF-8).
_read_split = partial(pd.read_csv, sep=';', encoding='utf-8')

X_train, y_train, X_test, y_test = (
    _read_split(os.path.join(base_path, file_name))
    for file_name in (x_train_file, y_train_file, x_test_file, y_test_file)
)

In [None]:
def _with_one_hot_labels(features: pd.DataFrame, targets: pd.DataFrame) -> pd.DataFrame:
    """Join the id/text columns with the target and add one-hot class columns.

    Args:
        features: frame holding at least the ``pid`` column and the text column
            named by the module-level ``data``.
        targets: frame holding the target column named by the module-level ``label``.

    Returns:
        A new frame with columns ``pid``, text, target, ``positive``,
        ``negative`` and ``neutral`` (in that order). The three indicator
        columns are 1 where the target equals 2, 0 and 1 respectively, else 0.
    """
    frame = pd.concat([features.loc[:, ['pid', data]], targets[[label]]], axis=1)
    # Vectorised comparison replaces the row-by-row ``apply`` and also copes
    # with an empty frame.
    for column, class_id in (('positive', 2), ('negative', 0), ('neutral', 1)):
        frame[column] = (frame[label] == class_id).astype('int64')
    return frame


train_df = _with_one_hot_labels(X_train, y_train)
test_df = _with_one_hot_labels(X_test, y_test)

train_df.head(10)

Unnamed: 0,pid,resp-text,Clareza,positive,negative,neutral
0,545719,Esclarecemos que a Ebserh disponibiliza inform...,1,0,0,1
1,558682,"Prezado a Senhor a , Segue anexa , resposta ao...",1,0,0,1
2,555475,"Prezado senhor , não há , no momento , nenhuma...",2,1,0,0
3,550699,"Prezado Welington , O Instituto Federal agrade...",1,0,0,1
4,359635,"Prezada Sra . Vivian , O DEPARTAMENTO DE ASSI...",1,0,0,1
5,551514,"Prezado senhor , sobre o assunto , deve-se obs...",0,0,1,0
6,528654,"Prezado senhor Allan de Oliveira Barros , O úl...",0,0,1,0
7,506126,"Prezado a Senhor a Deuseni , informamos que nã...",0,0,1,0
8,421000,"Prezada Senhora , Em atendimento à solicitação...",2,1,0,0
9,293338,"Prezada Sra, Em atenção à solicitação de V.Sª....",2,1,0,0


In [None]:
# Preview the first rows of the test frame, including the one-hot class columns.
test_df.head(10)

Unnamed: 0,pid,resp-text,Clareza,positive,negative,neutral
0,312962,Prezado (a) Senhor (a) 1. Em atenção ao pedido...,2,1,0,0
1,438696,O Serviço de Informações ao Cidadão SIC da Agê...,0,0,1,0
2,465158,"Prezado a Cidadão ã , 1 . Conforme solicitação...",1,0,0,1
3,568962,"Prezado , O Serviço de Informações ao Cidadão...",1,0,0,1
4,466115,"Prezado a Senhor a , Em atendimento ao pedido ...",1,0,0,1
5,326588,"Prezado Senhor, A sua manifestação não encontr...",0,0,1,0
6,440392,"Prezado Cidadão , Temos a esclarecer que receb...",0,0,1,0
7,395222,"Senhor João , O Serviço de Informações ao Cida...",2,1,0,0
8,506342,"Bom dia , Maiara ! A Universidade Federal de V...",0,0,1,0
9,366116,"Prezado a Senhor a , Esclarecemos que por meio...",2,1,0,0


In [None]:
# Create the local staging directory if needed. ``exist_ok`` tolerates re-runs
# while still surfacing real failures (e.g. permissions) instead of hiding
# them behind a bare ``except``.
os.makedirs('data/', exist_ok=True)

# Persist both frames as CSV so the dataset reader can load them from disk.
train_df.to_csv('data/train_df.csv', index=False)
test_df.to_csv('data/test_df.csv', index=False)

In [None]:
class Config(dict):
    """Dictionary whose entries are also exposed as instance attributes.

    Entries given to the constructor or added through :meth:`set` can be read
    either as ``cfg["name"]`` or as ``cfg.name``.
    """

    def __init__(self, **kwargs):
        dict.__init__(self, **kwargs)
        # Mirror every initial entry as an attribute.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def set(self, key, val):
        """Store ``val`` under ``key`` as both a dict item and an attribute."""
        self[key] = val
        setattr(self, key, val)
        
# Config field name -> key in ``params`` that supplies its value.
_PARAM_SOURCES = (
    ('batch_size', 'BATCH_SIZE'),
    ('lr', 'lr'),
    ('epochs', 'epochs'),
    ('hidden_sz', 'hidden_sz'),
    ('max_seq_len', 'MAX_LEN'),  # required to limit memory usage
    ('max_vocab_size', 'max_vocab_size'),
)

# Run settings: two fixed flags plus everything tunable pulled from ``params``.
config = Config(
    testing=False,
    seed=1,
    **{field: params.get(source) for field, source in _PARAM_SOURCES},
)

In [None]:
import random

# Seed every random number generator the run may draw from, not only
# PyTorch's. NOTE(review): batch shuffling in the data iterators is expected
# to use Python's ``random`` -- confirm against the installed AllenNLP version.
random.seed(config.seed)
np.random.seed(config.seed)
torch.manual_seed(config.seed)

# Directory holding the CSV files written by the earlier cell.
DATA_ROOT = Path("data")

### Prepare Dataset

In [None]:
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.dataset_readers import DatasetReader

In [None]:
# One-hot label columns, ordered so that the position in this list equals the
# class id of the target column (0 = negative, 1 = neutral, 2 = positive).
label_cols = ["negative", "neutral", "positive"]

In [None]:
from allennlp.data.fields import TextField, MetadataField, ArrayField
# Needed for the default ``token_indexers`` below; previously missing, so
# relying on the default raised NameError.
from allennlp.data.token_indexers import SingleIdTokenIndexer

class SentimentDatasetReader(DatasetReader):
    """Reads a CSV file and yields one ``Instance`` per row.

    The text is taken from the column named by the module-level ``data`` and
    the one-hot target from the columns listed in ``label_cols``.

    Args:
        tokenizer: callable turning a raw string into a list of token strings.
            Defaults to whitespace splitting.
        token_indexers: indexers attached to the text field. Defaults to a
            single ``SingleIdTokenIndexer`` under the key ``"tokens"``.
        max_seq_len: maximum number of tokens kept per text when reading a
            file; ``None`` disables truncation.
    """

    def __init__(self, tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_seq_len: Optional[int]=config.max_seq_len) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens: List[Token], id: str=None, labels: np.ndarray=None) -> Instance:
        """Build an ``Instance`` with ``tokens``, ``id`` and ``label`` fields.

        Args:
            tokens: already tokenized text.
            id: optional identifier stored as metadata.
            labels: one value per entry of ``label_cols``; all zeros when omitted.
        """
        fields = {
            "tokens": TextField(tokens, self.token_indexers),
            "id": MetadataField(id),
        }

        if labels is None:
            labels = np.zeros(len(label_cols))
        # A slice of a mixed-type pandas row arrives with dtype=object;
        # store a plain numeric array instead.
        fields["label"] = ArrayField(array=np.asarray(labels, dtype=np.float32))

        return Instance(fields)

    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield one instance per CSV row (first 1000 rows when ``config.testing``)."""
        df = pd.read_csv(file_path)
        if config.testing:
            df = df.head(1000)
        # Select the label columns once so every row yields a numeric array.
        label_matrix = df[label_cols].to_numpy()
        for text, labels in zip(df[data], label_matrix):
            # Honour ``max_seq_len``; slicing with None keeps all tokens.
            pieces = self.tokenizer(text)[:self.max_seq_len]
            yield self.text_to_instance([Token(piece) for piece in pieces], None, labels)

## ELMo

In [None]:
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper, ELMoTokenCharactersIndexer

# the token indexer is responsible for mapping tokens to integers
token_indexer = ELMoTokenCharactersIndexer()

# Build the spaCy-backed splitter once instead of once per text.
_word_splitter = SpacyWordSplitter(language=cur_language, pos_tags=False)

def tokenizer(x: str) -> List[str]:
    """Split ``x`` into word strings, keeping at most ``config.max_seq_len`` of them."""
    return [w.text for w in _word_splitter.split_words(x)[:config.max_seq_len]]

In [None]:
# Dataset reader wired to the spaCy tokenizer and the ELMo character indexer.
reader = SentimentDatasetReader(
    tokenizer=tokenizer,
    token_indexers={"tokens": token_indexer}
)

In [None]:
# Read the training CSV into instances and report how many there are.
train_ds = reader.read(DATA_ROOT / "train_df.csv")
print(len(train_ds))

4832it [00:15, 312.27it/s]

4832





In [None]:
# Read the test CSV into instances and report how many there are.
test_ds = reader.read(DATA_ROOT / "test_df.csv")
print(len(test_ds))

2071it [00:05, 362.56it/s]

2071





### Checking Tokens & Labels

In [None]:
# Inspect the token field of the first training instance.
vars(train_ds[0].fields["tokens"])

{'_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None,
 '_token_indexers': {'tokens': <allennlp.data.token_indexers.elmo_indexer.ELMoTokenCharactersIndexer at 0x7f8b88c4fcc0>},
 'tokens': [Esclarecemos,
  que,
  a,
  Ebserh,
  disponibiliza,
  informações,
  em,
  conformidade,
  com,
  o,
  disposto,
  naPortaria,
  Conjunta,
  número,
  5,
  da,
  Secretaria,
  de,
  Orçamento,
  Federal,
  -,
  SOF,
  e,
  da,
  Secretaria,
  de,
  Gestão,
  Pública,
  -,
  SEGEP,
  ,,
  de,
  5,
  de,
  agosto,
  de,
  2015,
  ,,
  por,
  meio,
  da,
  publicação,
  ,,
  desde,
  outubro,
  de,
  2015,
  ,,
  no,
  endereço,
  eletrônico,
  http,
  :,
  www,
  .,
  ebserh,
  .,
  governo,
  .,
  br,
  web,
  portal,
  -,
  ebserh,
  empregados,
  ,,
  das,
  seguintes,
  Tabelas,
  :,
  I,
  -,
  Quantitativo,
  Físico,
  de,
  Empregados,
  Cargos,
  Efetivos,
  2,
  -,
  Remuneração,
  de,
  Cargos,
  Efetivos,
  3,
  -,
  Quantitativo,
  Físico,
  de,
  Cargos,
  em,
  Comissão,
  e,

In [None]:
# Inspect the label field of the first training instance.
vars(train_ds[0].fields["label"])

{'array': array([0, 1, 0], dtype=object), 'padding_value': 0}

### Prepare Vocabulary

In [None]:
# Empty vocabulary: nothing in this notebook populates it from the instances.
# NOTE(review): presumably sufficient because the ELMo indexer works on
# characters rather than vocabulary ids -- confirm.
vocab = Vocabulary()

### Prepare Iterator

In [None]:
from allennlp.data.iterators import BucketIterator

# Training iterator: batches of config.batch_size, sorted on the number of tokens in the "tokens" field.
iterator = BucketIterator(batch_size=config.batch_size, sorting_keys=[("tokens", "num_tokens")],)
iterator.index_with(vocab)

### Prepare Model

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.nn.util import get_text_field_mask
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder



In [None]:
class BaselineModel(Model):
    """Embed -> encode to a single vector -> linear projection to class logits.

    Args:
        word_embeddings: embedder applied to the ``tokens`` dictionary.
        encoder: sequence-to-vector encoder run on the embeddings.
        out_sz: number of output logits; defaults to ``len(label_cols)``.

    The model is registered with the module-level ``vocab``.
    """

    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 out_sz: int=len(label_cols)):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
        # NOTE(review): BCE-with-logits scores each class independently, while
        # predictions are later taken with argmax over the classes -- confirm
        # that this loss is the intended objective for a single-label task.
        self.loss = nn.BCEWithLogitsLoss()

    def forward(self, tokens: Dict[str, torch.Tensor],
                id: Any = None, label: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """Compute class logits and, when ``label`` is given, the loss.

        Args:
            tokens: indexed text field as produced by the data iterator.
            id: per-instance metadata; accepted but not used.
            label: target tensor matching the logits; optional so the model
                can also be called without targets.

        Returns:
            A dict with ``"class_logits"`` and, if ``label`` was supplied, ``"loss"``.
        """
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        state = self.encoder(embeddings, mask)
        class_logits = self.projection(state)

        output = {"class_logits": class_logits}
        if label is not None:
            output["loss"] = self.loss(class_logits, label)

        return output

### Prepare Embeddings

In [None]:
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import ElmoTokenEmbedder

# https://allennlp.org/elmo - PORTUGUESE BRWAC
# Remote locations of the ELMo options file and weights file passed to the embedder.
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/brwac/options.json'
weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/brwac/elmo_pt_weights_dgx1.hdf5'

# The embedder is registered under "tokens", the same key the reader uses for its token indexer.
elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

In [None]:
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper

# Create Encoder
# Bidirectional LSTM over the embeddings, wrapped so it returns one vector per sequence.
_lstm = nn.LSTM(
    word_embeddings.get_output_dim(),
    config.hidden_sz,
    bidirectional=True,
    batch_first=True,
)
encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(_lstm)

In [None]:
model = BaselineModel(word_embeddings, encoder)

# Move the model to the GPU when one is available; nothing to do on CPU.
if USE_GPU:
    model.cuda()

### Train

In [None]:
# Adam over all model parameters with the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=config.lr)

In [None]:
from allennlp.training.trainer import Trainer

# Trainer for config.epochs epochs on the training set; no validation dataset is passed.
trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_ds,
    cuda_device=0 if USE_GPU else -1,  # -1 selects the CPU
    num_epochs=config.epochs,
)

In [None]:
# Run training and keep whatever the trainer returns.
metrics = trainer.train()

loss: 0.6448 ||: 100%|██████████| 76/76 [01:00<00:00,  1.27it/s]
loss: 0.6339 ||: 100%|██████████| 76/76 [01:02<00:00,  1.21it/s]
loss: 0.6305 ||: 100%|██████████| 76/76 [01:07<00:00,  1.12it/s]
loss: 0.6246 ||: 100%|██████████| 76/76 [01:07<00:00,  1.12it/s]
loss: 0.6220 ||: 100%|██████████| 76/76 [01:07<00:00,  1.12it/s]
loss: 0.6180 ||: 100%|██████████| 76/76 [01:07<00:00,  1.12it/s]
loss: 0.6123 ||: 100%|██████████| 76/76 [01:07<00:00,  1.12it/s]
loss: 0.6080 ||: 100%|██████████| 76/76 [01:07<00:00,  1.12it/s]
loss: 0.6013 ||: 100%|██████████| 76/76 [01:08<00:00,  1.12it/s]
loss: 0.5969 ||: 100%|██████████| 76/76 [01:07<00:00,  1.12it/s]


### Predictions

In [None]:
from allennlp.data.iterators import DataIterator
from tqdm import tqdm
from scipy.special import expit # the sigmoid function

def tonp(tsr):
    """Return ``tsr`` as a NumPy array, detached from the graph and moved to the CPU."""
    detached = tsr.detach()
    return detached.cpu().numpy()

class Predictor:
    """Runs a model over a dataset and collects per-class sigmoid scores."""

    def __init__(self, model: Model, iterator: DataIterator,
                 cuda_device: int=-1) -> None:
        self.model = model
        self.iterator = iterator
        self.cuda_device = cuda_device

    def _extract_data(self, batch) -> np.ndarray:
        """Forward one batch and return the sigmoid of its class logits."""
        logits = self.model(**batch)["class_logits"]
        return expit(tonp(logits))

    def predict(self, ds: Iterable[Instance]) -> np.ndarray:
        """Score every instance of ``ds`` in a single unshuffled pass.

        Returns:
            The per-batch score arrays concatenated along axis 0.
        """
        batches = self.iterator(ds, num_epochs=1, shuffle=False)
        self.model.eval()
        progress = tqdm(batches, total=self.iterator.get_num_batches(ds))
        # No gradients are needed while scoring.
        with torch.no_grad():
            per_batch = [
                self._extract_data(nn_util.move_to_device(batch, self.cuda_device))
                for batch in progress
            ]
        return np.concatenate(per_batch, axis=0)

In [None]:
from allennlp.data.iterators import BasicIterator

# iterate over the dataset without changing its order
# Reuse the configured batch size instead of repeating the literal 64.
seq_iterator = BasicIterator(batch_size=config.batch_size)
seq_iterator.index_with(vocab)

In [None]:
# Score the test set on the same device choice used for training (-1 means CPU).
predictor = Predictor(model, seq_iterator, cuda_device=0 if USE_GPU else -1)
test_preds = predictor.predict(test_ds)

100%|██████████| 33/33 [00:34<00:00,  1.06s/it]


In [None]:
# Convert to predictions
# Despite the name, this holds the index of the highest-scoring class per row, not booleans.
y_pred_bool = np.argmax(test_preds, axis=1)
y_pred_bool[0:10]

array([0, 1, 0, 1, 1, 0, 0, 2, 2, 1])

In [None]:
from sklearn.metrics import f1_score, classification_report

# Ground-truth class ids taken from the target column of the test frame.
test_y = test_df[label]

# Weighted F1 plus the per-class report.
f1 = f1_score(test_y, y_pred_bool, average='weighted')
print(f"Best Test F1-Score: {f1:.3f}")    
print(classification_report(test_y, y_pred_bool))

Best Test F1-Score: 0.397
              precision    recall  f1-score   support

           0       0.39      0.41      0.40       673
           1       0.37      0.34      0.35       678
           2       0.43      0.44      0.43       720

    accuracy                           0.40      2071
   macro avg       0.40      0.40      0.40      2071
weighted avg       0.40      0.40      0.40      2071



In [None]:
from json import dumps

# Timestamped output folder so repeated runs do not overwrite each other.
now = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = '/content/gdrive/My Drive/Colab Notebooks/outputs/elmo/' + params.get('exp') + '/' + now

# The path is several levels deep: create missing parents as well. A failure
# here should be visible rather than swallowed, otherwise the writes below
# fail with a less helpful error.
os.makedirs(output_dir, exist_ok=True)

# Record the settings used for this run next to its artefacts.
with open(output_dir + '/params.json', 'w') as f:
  f.write(dumps(params))

In [None]:
# Save the trained weights (state dict only) into the run's output folder.
with open(output_dir + "/model.th", 'wb') as f:
    torch.save(model.state_dict(), f)

# Save the vocabulary next to the weights.
vocab.save_to_files(output_dir + "/vocabulary")

In [None]:
# Write the per-class report for the test set into the run's output folder.
with open(output_dir + '/classification_report.txt', 'w') as f:
  f.write(classification_report(test_y, y_pred_bool))