## Import Modules

In [108]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from tensorflow import keras

from keras.layers import Embedding, LSTM, Dense, Bidirectional, SpatialDropout1D
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from nltk.corpus import stopwords
from gensim.models import Word2Vec

## Import the datasets

In [109]:
dataset = load_dataset("climatebert/climate_sentiment")

In [110]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 320
    })
})

In [111]:
pd_train = pd.DataFrame.from_dict(dataset["train"])
pd_test = pd.DataFrame.from_dict(dataset["test"])

In [112]:
pd_train

Unnamed: 0,text,label
0,− Scope 3: Optional scope that includes indire...,1
1,The Group is not aware of any noise pollution ...,0
2,Global climate change could exacerbate certain...,0
3,Setting an investment horizon is part and parc...,0
4,Climate change the physical impacts of climate...,0
...,...,...
995,Greenhouse gas Mitigation Measures Our five ye...,1
996,We have updated our external sector statements...,1
997,STOREBRAND'S USE Task Force on Climate-related...,0
998,Estimations of nanced emissions indicate the i...,1


## RNN

### Data preprocessing

Cleaning the text, replacing symbol into space, remove symbols, and using stopwords.

In [173]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    
    return text

pd_train['text'] = pd_train['text'].apply(clean_text)
pd_train['text'] = pd_train['text'].str.replace('\d+', '')

pd_test['text'] = pd_test['text'].apply(clean_text)
pd_test['text'] = pd_test['text'].str.replace('\d+', '')

In [133]:
text_train = pd_train["text"].values
label_train = pd_train["label"].values

text_test = pd_test["text"].values
label_test = pd_test["label"].values

In [134]:
# define max words for the vocabulary
MAX_WORDS = 50000
tokenizer_train = Tokenizer(num_words=MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer_test = Tokenizer(num_words=MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)

# fit dataset to tokenizer
tokenizer_train.fit_on_texts(text_train)
tokenizer_test.fit_on_texts(text_test)

# convert dataset to sequence of integer
seq_train = tokenizer_train.texts_to_sequences(text_train)
seq_test = tokenizer_test.texts_to_sequences(text_test)

In [135]:
# pad the sequence to fixed_length, will adjust later
MAX_SEQ = 100
X_train = pad_sequences(sequences=seq_train, maxlen=MAX_SEQ)
X_test = pad_sequences(sequences=seq_test, maxlen=MAX_SEQ)

### Split the data

In [174]:
# turn the lables into categorical
y_train = to_categorical(label_train, 3)
y_test = to_categorical(label_test, 3)

Datasets already split, so will use validation.

In [137]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42069)

### Create the RNN Model using Randomly Initialized Embedding

In [142]:
# create sequential model to stack layers
rnn = Sequential()

# embedding layer to convert integer tokens into dense vectors
# weights not assigned, will use randmoly initialized
rnn.add(Embedding(input_dim=MAX_WORDS, output_dim=100, input_length=X_train.shape[1]))

# performs variational dropout in NLP models
rnn.add(SpatialDropout1D(rate=0.2))

# bidirectional with 100 unit
# process sequence in both direction, it's said to capture context efficiently
rnn.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))

# add final layer of 50 unit
rnn.add(Bidirectional(LSTM(50)))

# add dense layer, with 3 output and softmax activation (used for multiclass)
rnn.add(Dense(3, activation="softmax"))

# compile the RNN model
rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [164]:
# train the model
rnn_history = rnn.fit(X_train, y_train, epochs=8, batch_size=16, validation_data=(X_val, y_val))

Epoch 1/8


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [175]:
# evaluate the models
loss, accuracy = rnn.evaluate(X_val, y_val)
print(f"Loss:\t{loss:.4f}")
print(f"Accuracy:\t{accuracy:.4f}")

Loss:	1.3703
Accuracy:	0.7000


In [176]:
# evaluate the models
loss, accuracy = rnn.evaluate(X_test, y_test)
print(f"Loss:\t{loss:.4f}")
print(f"Accuracy:\t{accuracy:.4f}")

Loss:	3.4570
Accuracy:	0.3625


The accuracy is "meh", when using test data. Already tried to adjust the layers used, but still couldn't find models that produce better accuracy.

### Create the RNN models using Word2Vec embedding

Create the embedding matrix using Word2Vec

In [156]:
w2v = Word2Vec(sentences=text_train, vector_size=100, window=5, min_count=1, sg=0)
w2v.save("word2vec.model")

In [157]:
embedding_matrix = np.zeros((MAX_WORDS, 100))
for word, i in tokenizer_train.word_index.items():
    if i < MAX_WORDS:
        if word in w2v.wv:
            embedding_matrix[i] = w2v.wv[word]

In [158]:
embedding_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [159]:
# create sequential model to stack layers
rnn_w2v = Sequential()

# embedding layer to convert integer tokens into dense vectors
# change the weight to embedding_matrix from Word2Vec
rnn_w2v.add(Embedding(input_dim=MAX_WORDS, output_dim=100, input_length=X_train.shape[1], weights=[embedding_matrix], trainable=True))

# performs variational dropout in NLP models
rnn_w2v.add(SpatialDropout1D(rate=0.2))

# bidirectional with 100 unit
# process sequence in both direction, it's said to capture context efficiently
rnn_w2v.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))

# add final layer of 50 unit
rnn_w2v.add(Bidirectional(LSTM(50)))

# add dense layer, with 3 output and softmax activation (used for multiclass)
rnn_w2v.add(Dense(3, activation="softmax"))

# compile the RNN model
rnn_w2v.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [162]:
# train the model
rnn_w2v_history = rnn_w2v.fit(X_train, y_train, epochs=8, batch_size=16, validation_data=(X_val, y_val))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [177]:
# evaluate the models
loss, accuracy = rnn.evaluate(X_val, y_val)
print(f"Loss:\t{loss:.4f}")
print(f"Accuracy:\t{accuracy:.4f}")

Loss:	1.3703
Accuracy:	0.7000


In [178]:
# evaluate the models
loss, accuracy = rnn_w2v.evaluate(X_test, y_test)
print(f"Loss:\t{loss:.4f}")
print(f"Accuracy:\t{accuracy:.4f}")

Loss:	3.5895
Accuracy:	0.4406


The accuracy is a bit better compared to randomly initialized, but still not good enough.

### Predict the models using two embedding

In [171]:
# using randomly initialized embedding
predictions = rnn.predict(X_test[:5])

for text, prediction, groundtruth in zip(tokenizer_test.sequences_to_texts(X_test), predictions, y_test[:5]):
    pred = prediction.tolist()
    groundtruth = groundtruth.tolist()
    print(f"Text: {text}\nPredicted: {pred.index(max(pred))}\nGroundtruth: {groundtruth.index(max(groundtruth))}\n")

Text: sustainable strategy red lines sustainable strategy range incorporate series proprietary red lines order ensure poorest performing companies esg perspective eligible investment
Predicted: 1
Groundtruth: 0

Text: verizons environmental health safety management system provides framework identifying controlling reducing risks associated environments operate besides regular management system assessments internal thirdparty compliance audits inspections performed annually hundreds facilities worldwide goal assessments identify correct sitespecific issues educate empower facility managers supervisors implement corrective actions verizons environment health safety efforts directed supported eperienced eperts around world support operations facilities
Predicted: 1
Groundtruth: 1

Text: 2019 company closed series transactions related sale canadian fossil fuelbased electricity generation business transaction heartland generation ltd affiliate energy capital partners included sale 10 partly

In [172]:
# using Word2Vec embedding
predictions = rnn_w2v.predict(X_test[:5])

for text, prediction, groundtruth in zip(tokenizer_test.sequences_to_texts(X_test), predictions, y_test[:5]):
    pred = prediction.tolist()
    groundtruth = groundtruth.tolist()
    print(f"Text: {text}\nPredicted: {pred.index(max(pred))}\nGroundtruth: {groundtruth.index(max(groundtruth))}\n")

Text: sustainable strategy red lines sustainable strategy range incorporate series proprietary red lines order ensure poorest performing companies esg perspective eligible investment
Predicted: 1
Groundtruth: 0

Text: verizons environmental health safety management system provides framework identifying controlling reducing risks associated environments operate besides regular management system assessments internal thirdparty compliance audits inspections performed annually hundreds facilities worldwide goal assessments identify correct sitespecific issues educate empower facility managers supervisors implement corrective actions verizons environment health safety efforts directed supported eperienced eperts around world support operations facilities
Predicted: 1
Groundtruth: 1

Text: 2019 company closed series transactions related sale canadian fossil fuelbased electricity generation business transaction heartland generation ltd affiliate energy capital partners included sale 10 partly

## Summary

While randomly initialized embedding produce worst result than Word2Vector, both of the embedding couldn't produce a good enough results.

### Randomly Initalized
Accuracy: 36.25%

### Word2Vec
Accuracy: 44.06%


_*both accuracy using different datasets than training._