## Import Modules

In [79]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from tensorflow import keras

from keras.layers import Embedding, LSTM, Dense, Bidirectional
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

## Import the datasets

In [2]:
# Load the "climatebert/climate_sentiment" dataset through the Hugging Face
# `datasets` library; it provides a train split and a test split.
dataset = load_dataset("climatebert/climate_sentiment")

Downloading readme:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/273k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/101k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/320 [00:00<?, ? examples/s]

In [9]:
# Display the loaded object to inspect its splits, feature names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 320
    })
})

In [57]:
# Convert each split to a pandas DataFrame with the converter that the
# `datasets` library itself provides. DataFrame.from_dict is documented for
# dict input, not for Dataset objects, so it is avoided here.
pd_train = dataset["train"].to_pandas()
pd_test = dataset["test"].to_pandas()

In [58]:
# Pull the sentences and their class labels out of each frame as NumPy arrays.
text_train, label_train = pd_train["text"].to_numpy(), pd_train["label"].to_numpy()
text_test, label_test = pd_test["text"].to_numpy(), pd_test["label"].to_numpy()

## RNN

### Data preprocessing

In [70]:
# Vocabulary cap: only the most frequent words (up to MAX_WORDS indices) are kept.
MAX_WORDS = 10000

# Fit a single tokenizer on the TRAINING text only.
# Fitting a separate tokenizer on the test text would give the same word a
# different integer id in X_test than in X_train, so the embedding learned
# during training would be looked up with unrelated ids at evaluation time
# (and the test vocabulary would leak into preprocessing).
# `oov_token` gives words that were not seen while fitting a dedicated id
# instead of silently dropping them from the sequence.
tokenizer_train = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer_train.fit_on_texts(text_train)

# Alias kept so that any later cell referring to `tokenizer_test` keeps working
# and automatically shares the training vocabulary.
tokenizer_test = tokenizer_train

# Convert both splits to sequences of integer word ids using the same word index.
seq_train = tokenizer_train.texts_to_sequences(text_train)
seq_test = tokenizer_test.texts_to_sequences(text_test)

In [71]:
# Fixed sequence length fed to the network (provisional value, to be tuned later).
MAX_SEQ = 500

# Bring every sequence in both splits to exactly MAX_SEQ entries, relying on
# pad_sequences' default padding/truncation settings.
X_train, X_test = (
    pad_sequences(sequences=seqs, maxlen=MAX_SEQ)
    for seqs in (seq_train, seq_test)
)

### One-hot encode the labels

In [80]:
# One-hot encode the class labels of both splits into 3 columns each,
# matching the 3-way softmax output of the classifier.
y_train, y_test = (
    to_categorical(labels, 3)
    for labels in (label_train, label_test)
)

In [81]:
# Sanity check: expect (number of training rows, 3) after one-hot encoding.
y_train.shape

(1000, 3)

### Create the RNN Model

In [82]:
# Fix TensorFlow's global random seed before any layer is created so that
# weight initialisation (and therefore the training curve) is repeatable
# between runs. Some GPU kernels may still introduce small nondeterminism.
tf.random.set_seed(42)

# Sequential container: layers are applied in the order they are added.
rnn = Sequential()

# Embedding layer: maps each integer word id (< MAX_WORDS) to a 128-dimensional
# dense vector; input length is the padded sequence length.
rnn.add(Embedding(input_dim=MAX_WORDS, output_dim=128, input_length=X_train.shape[1]))

# First bidirectional LSTM: 128 units per direction. The sequence is read both
# forwards and backwards; return_sequences=True passes the full per-timestep
# output on to the next recurrent layer.
rnn.add(Bidirectional(LSTM(128, return_sequences=True)))

# Second bidirectional LSTM: 64 units per direction, returning only the final
# output for the whole sequence.
rnn.add(Bidirectional(LSTM(64)))

# Classification head: 3 outputs with softmax, one probability per class.
rnn.add(Dense(3, activation="softmax"))

In [83]:
# Configure training: Adam optimiser, categorical cross-entropy loss (the
# targets are one-hot vectors) and accuracy reported as the metric.
rnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [84]:
# Train for 10 epochs in mini-batches of 32 and keep the returned History
# object; metrics on (X_test, y_test) are computed after every epoch.
# NOTE(review): the held-out test split doubles as validation data here. If
# these curves are used for tuning, consider carving a validation set out of
# the training data instead.
rnn_history = rnn.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10