## Import Modules

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import torch
import re
import datasets
import evaluate
import accelerate

from datasets import load_dataset
from tensorflow import keras

from keras.layers import Embedding, LSTM, Dense, Bidirectional, SpatialDropout1D, BatchNormalization, Dropout
from keras.models import Sequential
from keras.optimizers.legacy import Adam
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.utils import to_categorical, pad_sequences

from nltk.corpus import stopwords
from gensim.models import Word2Vec

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


## Import the datasets

In [2]:
dataset = load_dataset("climatebert/climate_sentiment")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 320
    })
})

## Attention-Based for Text Classification

### Data pre-processing

In [4]:
text_train = dataset["train"]["text"]
label_train = dataset["train"]["label"]

text_test = dataset["test"]["text"]
label_test = dataset["test"]["label"]

In [5]:
TEST_SIZE = 0.5

_, train_set = dataset["train"].train_test_split(test_size=TEST_SIZE).values()
_, test_set = dataset["test"].train_test_split(test_size=TEST_SIZE).values()

mini_ds = datasets.DatasetDict(
    {
        "train": train_set,
        "test" : test_set
    }
)

In [6]:
mini_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 500
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 160
    })
})

### Prepare the datasets for RNN-Based Models (to compare with BERT-Based)

In [7]:
# define max words for the vocabulary
MAX_WORDS = 10000
tokenizer_train = Tokenizer(num_words=MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer_test = Tokenizer(num_words=MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)

# fit dataset to tokenizer
tokenizer_train.fit_on_texts(text_train)
tokenizer_test.fit_on_texts(text_test)

# convert dataset to sequence of integer
seq_train = tokenizer_train.texts_to_sequences(text_train)
seq_test = tokenizer_test.texts_to_sequences(text_test)

# pad the sequence to fixed_length, will adjust later
MAX_SEQ = 250
X_train = pad_sequences(sequences=seq_train, maxlen=MAX_SEQ)
X_test = pad_sequences(sequences=seq_test, maxlen=MAX_SEQ)

# turn the lables into categorical
y_train = to_categorical(label_train, 3)
y_test = to_categorical(label_test, 3)

### Tokenize dataset for the BERT-Based model

In [8]:
# create tokenizer from BERT
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(ds):
    return tokenizer(ds["text"], padding="max_length", truncation=True)

# tokenize the dataset
tokenized_ds = mini_ds.map(tokenize, batched=True)

Map: 100%|██████████| 500/500 [00:00<00:00, 3681.76 examples/s]
Map: 100%|██████████| 160/160 [00:00<00:00, 3218.84 examples/s]


### Create RNN-Based Model

In [9]:
EMBEDDING_DIM = 128
w2v = Word2Vec(sentences=text_train, vector_size=EMBEDDING_DIM, window=5, min_count=1, sg=0)

In [10]:
embedding_matrix = np.zeros((MAX_WORDS, EMBEDDING_DIM))
for word, i in tokenizer_train.word_index.items():
    if i < MAX_WORDS:
        if word in w2v.wv:
            embedding_matrix[i] = w2v.wv[word]

In [11]:
embedding_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [12]:
# create sequential model to stack layers
rnn_w2v = Sequential()

# embedding layer to convert integer tokens into dense vectors
# change the weight to embedding_matrix from Word2Vec
rnn_w2v.add(Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIM, input_length=X_train.shape[1], weights=[embedding_matrix], trainable=True))

# performs variational dropout in NLP models
rnn_w2v.add(SpatialDropout1D(rate=0.2))

# Bidirectional LSTM layers with dropout and recurrent dropout
rnn_w2v.add(Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)))
rnn_w2v.add(Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2)))

# Batch normalization layer
rnn_w2v.add(BatchNormalization())

# Dense layer with ReLU activation
rnn_w2v.add(Dense(64, activation="relu"))

# Dropout layer
rnn_w2v.add(Dropout(0.5))

# add dense layer, with 3 output and softmax activation (used for multiclass)
rnn_w2v.add(Dense(3, activation="softmax"))

# Learning rate reduction on plateau
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.001)

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Compile the RNN model with custom optimizer (Adam) and loss function (categorical_crossentropy)
optimizer = Adam(learning_rate=0.001)
rnn_w2v.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [13]:
# train the model
rnn_w2v_history = rnn_w2v.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [14]:
# evaluate the models
loss, accuracy = rnn_w2v.evaluate(X_test, y_test)
print(f"Loss:\t{loss:.4f}")
print(f"Accuracy:\t{accuracy:.4f}")

Loss:	2.5619
Accuracy:	0.3063


### Create a BERT-Based model

In [15]:
bert_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    pred = np.argmax(logits, axis=-1)
    return metric.compute(predictions=pred, references=labels)

In [17]:
bert_training_args = TrainingArguments(output_dir="trainer", evaluation_strategy="epoch")

bert_trainer = Trainer(
    model= bert_model,
    args= bert_training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=compute_metrics,
)

In [18]:
bert_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.70822,0.6625
2,No log,0.704951,0.7375
3,No log,0.755466,0.75625


TrainOutput(global_step=189, training_loss=0.4463630151496362, metrics={'train_runtime': 1743.4434, 'train_samples_per_second': 0.86, 'train_steps_per_second': 0.108, 'total_flos': 198704641536000.0, 'train_loss': 0.4463630151496362, 'epoch': 3.0})

In [19]:
bert_result = bert_trainer.evaluate()

In [20]:
bert_result

{'eval_loss': 0.7554655075073242,
 'eval_accuracy': 0.75625,
 'eval_runtime': 74.139,
 'eval_samples_per_second': 2.158,
 'eval_steps_per_second': 0.27,
 'epoch': 3.0}

### Comparing RNN-Based to BERT-Based

In [21]:
PREDICT_SIZE = 10

In [22]:
# using Word2Vec embedding
predictions = rnn_w2v.predict(X_test[:PREDICT_SIZE])
score = 0

for text, prediction, groundtruth in zip(tokenizer_test.sequences_to_texts(X_test), predictions, y_test[:5]):
    pred = prediction.tolist()
    pred = pred.index(max(pred))

    groundtruth = groundtruth.tolist()
    groundtruth = groundtruth.index(max(groundtruth))

    if (groundtruth == pred):
        score += 1

    print(f"Text: {text}\Groundtruth: {groundtruth}\nPredicted: {pred}\n")

print(f"Score: {score}/{PREDICT_SIZE}")

Text: sustainable strategy ‘red lines’ for our sustainable strategy range we incorporate a series of proprietary ‘red lines’ in order to ensure the poorest performing companies from an esg perspective are not eligible for investment\Groundtruth: 0
Predicted: 0

Text: verizon’s environmental health and safety management system provides a framework for identifying controlling and reducing the risks associated with the environments in which we operate besides regular management system assessments internal and third party compliance audits and inspections are performed annually at hundreds of facilities worldwide the goal of these assessments is to identify and correct site specific issues and to educate and empower facility managers and supervisors to implement corrective actions verizon’s environment health and safety efforts are directed and supported by experienced experts around the world that support our operations and facilities\Groundtruth: 1
Predicted: 0

Text: in 2019 the company

In [23]:
result = bert_trainer.predict(tokenized_ds["test"])
score = 0

for i in range(PREDICT_SIZE):
    groundtruth = np.argmax(result[0][i])
    pred = tokenized_ds['test']['label'][i]

    if (groundtruth == pred):
        score += 1

    print(f"Text: {tokenized_ds['test']['text'][i]}")
    print(f"Groundtruth: {groundtruth}")
    print(f"Prediction: {pred}\n")

print(f"Score: {score}/{PREDICT_SIZE}")

Text: BB DTVM, by strategic direction, performs extensive asset screening considering socioenvironmental and corporate governance aspects. By means of its own ESG methodology, which uses a combination of positive and negative screening, it ended the period with R$ 648.85 billion in assets subject to this methodology, representing 55.12% of the total assets under management. The asset manager has been managing and distributing 10 investment funds with socioenvironmental characteristics to the different investor segments, that, in December 2020, totaled R$ 3.15 billion in shareholders’ equity.
Groundtruth: 1
Prediction: 1

Text: The 2019 Integrated Report is the eighth Atlantia’s annual integrated report, prepared based on the International Framework set out by the International Integrated Reporting Council (www.theiirc.org/international-ir-framework/) and drafted in accordance with the GRI Sustainability Reporting Standards published in 2016 by GRI – Global Reporting Initiative, accordi