## Import Modules

In [108]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from tensorflow import keras

from keras.layers import Embedding, LSTM, Dense, Bidirectional, SpatialDropout1D
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from nltk.corpus import stopwords
from gensim.models import Word2Vec

## Import the datasets

In [109]:
dataset = load_dataset("climatebert/climate_sentiment")

In [110]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 320
    })
})

In [111]:
pd_train = pd.DataFrame.from_dict(dataset["train"])
pd_test = pd.DataFrame.from_dict(dataset["test"])

In [112]:
pd_train

Unnamed: 0,text,label
0,− Scope 3: Optional scope that includes indire...,1
1,The Group is not aware of any noise pollution ...,0
2,Global climate change could exacerbate certain...,0
3,Setting an investment horizon is part and parc...,0
4,Climate change the physical impacts of climate...,0
...,...,...
995,Greenhouse gas Mitigation Measures Our five ye...,1
996,We have updated our external sector statements...,1
997,STOREBRAND'S USE Task Force on Climate-related...,0
998,Estimations of nanced emissions indicate the i...,1


## RNN

### Data preprocessing

In [132]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    
    return text

pd_train['text'] = pd_train['text'].apply(clean_text)
pd_train['text'] = pd_train['text'].str.replace('\d+', '')

pd_test['text'] = pd_test['text'].apply(clean_text)
pd_test['text'] = pd_test['text'].str.replace('\d+', '')

In [133]:
text_train = pd_train["text"].values
label_train = pd_train["label"].values

text_test = pd_test["text"].values
label_test = pd_test["label"].values

In [134]:
# define max words for the vocabulary
MAX_WORDS = 50000
tokenizer_train = Tokenizer(num_words=MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer_test = Tokenizer(num_words=MAX_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)

# fit dataset to tokenizer
tokenizer_train.fit_on_texts(text_train)
tokenizer_test.fit_on_texts(text_test)

# convert dataset to sequence of integer
seq_train = tokenizer_train.texts_to_sequences(text_train)
seq_test = tokenizer_test.texts_to_sequences(text_test)

In [135]:
# pad the sequence to fixed_length, will adjust later
MAX_SEQ = 100
X_train = pad_sequences(sequences=seq_train, maxlen=MAX_SEQ)
X_test = pad_sequences(sequences=seq_test, maxlen=MAX_SEQ)

### Split the data

In [136]:
y_train = to_categorical(label_train, 3)
y_test = to_categorical(label_test, 3)
# y_train = label_train
# y_test = label_test

In [137]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42069)

### Create the RNN Model using Randomly Initialized Embedding

In [142]:
# create sequential model to stack layers
rnn = Sequential()

# embedding layer to convert integer tokens into dense vectors
rnn.add(Embedding(input_dim=MAX_WORDS, output_dim=100, input_length=X_train.shape[1]))

# performs variational dropout in NLP models
rnn.add(SpatialDropout1D(rate=0.2))

# bidirectional with 100 unit
# process sequence in both direction, it's said to capture context efficiently
rnn.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))

# add final layer of 50 unit
rnn.add(Bidirectional(LSTM(50)))

# add dense layer, with 3 output and softmax activation
rnn.add(Dense(3, activation="softmax"))

# compile the RNN model
rnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [149]:
# train the model
rnn_history = rnn.fit(X_train, y_train, epochs=8, batch_size=100, validation_data=(X_val, y_val))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [150]:
# evaluate the models
loss, accuracy = rnn.evaluate(X_test, y_test)
print(f"Loss:\t{loss:.4f}")
print(f"Accuracy:\t{accuracy:.4f}")

Loss:	3.0449
Accuracy:	0.3781


### Create the RNN models using Word2Vec embedding

In [122]:
w2v = Word2Vec(sentences=text_train, vector_size=128, window=5, min_count=1, sg=0)
w2v.save("word2vec.model")

In [123]:
embedding_matrix = np.zeros((MAX_WORDS, 128))
for word, i in tokenizer_train.word_index.items():
    if i < MAX_WORDS:
        if word in w2v.wv:
            embedding_matrix[i] = w2v.wv[word]

In [124]:
embedding_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [125]:
# create sequential model to stack layers
rnn_w2v = Sequential()

# embedding layer to convert integer tokens into dense vectors
# change the weight to embedding_matrix from Word2Vec
rnn_w2v.add(Embedding(input_dim=MAX_WORDS, output_dim=100, input_length=X_train.shape[1], weights=[embedding_matrix], trainable=True))

# performs variational dropout in NLP models
rnn_w2v.add(SpatialDropout1D(rate=0.2))

# bidirectional with 100 unit
# process sequence in both direction, it's said to capture context efficiently
rnn_w2v.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))

# add final layer of 50 unit
rnn_w2v.add(Bidirectional(LSTM(50)))

# add dense layer, with 3 output and softmax activation
rnn_w2v.add(Dense(3, activation="softmax"))

# compile the RNN model
rnn_w2v.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [126]:
# train the model
rnn_w2v_history = rnn_w2v.fit(X_train, y_train, epochs=8, batch_size=16, validation_data=(X_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [128]:
# evaluate the models
loss, accuracy = rnn_w2v.evaluate(X_test, y_test)
print(f"Loss:\t{loss:.4f}")
print(f"Accuracy:\t{accuracy:.4f}")

Loss:	3.3066
Accuracy:	0.3531
