# Modeling Tweet classification with RNNs for Binary & Multiclass cases
**Wendy Matta**

In [None]:
# Dependencies used by this notebook; uncomment to install them into the kernel's environment.
# %pip install numpy
# %pip install scipy
# %pip install tensorflow
# NOTE(review): sklearn.metrics is imported further down, so scikit-learn is required as well.
# %pip install scikit-learn

In [None]:
# Mount Google Drive when running on Colab; otherwise read the data files
# from the current working directory.
USE_COLLAB = True
MOUNT_POINT = '/content/drive'

if USE_COLLAB:
  from google.colab import drive
  drive.mount(MOUNT_POINT)
  # Absolute path, so loading the data does not depend on the kernel's
  # current working directory being /content.
  path_dir = MOUNT_POINT + '/MyDrive/'
else:
  path_dir = ''

Mounted at /content/drive


## Binary Classification: Mental Health related vs. Normal

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, LSTM
from scipy.sparse import load_npz
from tensorflow.keras.callbacks import EarlyStopping
import gc
from sklearn.metrics import accuracy_score, f1_score as f1
from tensorflow.keras.utils import to_categorical

# Seed every random number generator used here so that runs are repeatable.
# `random` is imported in this block because the import list above does not
# include it; without the import, random.seed raises NameError.
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:

# Feature matrices: sparse .npz files, cast to float32 right after loading
X_train = load_npz(path_dir + 'X_train_bow.npz').astype(np.float32)
X_val = load_npz(path_dir + 'X_val_bow.npz').astype(np.float32)
X_test = load_npz(path_dir + 'X_test_bow.npz').astype(np.float32)

# Label arrays: plain NumPy .npy files
y_train = np.load(path_dir + 'y_train.npy')
y_val = np.load(path_dir + 'y_val.npy')
y_test = np.load(path_dir + 'y_test.npy')

In [None]:
# sanity check
# All splits must share the same feature dimension.
assert X_train.shape[1] == X_val.shape[1] == X_test.shape[1], 'feature dimension differs between splits'
# The training split is expected to be the largest one.
assert X_train.shape[0] > X_val.shape[0], 'training split is not larger than validation split'
assert X_train.shape[0] > X_test.shape[0], 'training split is not larger than test split'
# Every split needs exactly one label per row (val/test were previously unchecked).
assert X_train.shape[0] == y_train.shape[0], 'train features/labels row mismatch'
assert X_val.shape[0] == y_val.shape[0], 'validation features/labels row mismatch'
assert X_test.shape[0] == y_test.shape[0], 'test features/labels row mismatch'

### ETL

We begin by turning our matrices into dense versions of themselves.

In [None]:
# TensorFlow has limited support for modeling on sparse inputs, so the
# sparse matrices are converted to dense arrays before being fed to a model.
def transform_matrix(x): # For vanilla RNN
    """Densify a sparse matrix and append a trailing axis of length 1.

    Args:
        x: object exposing ``toarray()`` that yields a 2-D array.

    Returns:
        The dense data reshaped to ``(rows, cols, 1)``.
    """
    as_dense = x.toarray()
    dims = as_dense.shape
    return as_dense.reshape(dims[0], dims[1], 1)

def reshape_for_lstm(x): # for LSTM
    """Densify a sparse matrix as float32 and insert a middle axis of length 1.

    Args:
        x: object exposing ``toarray()`` that yields a 2-D array.

    Returns:
        A float32 array of shape ``(rows, 1, cols)``.
    """
    as_float32 = x.toarray().astype(np.float32)
    dims = as_float32.shape
    return as_float32.reshape(dims[0], 1, dims[1])

In [None]:
# Dense float32 tensors of shape (samples, 1, features) for the LSTM
X_train_lstm = reshape_for_lstm(X_train)
X_val_lstm = reshape_for_lstm(X_val)
X_test_lstm = reshape_for_lstm(X_test)

# Labels become column vectors, which work better with tf
y_train = y_train.reshape((-1, 1))
y_val = y_val.reshape((-1, 1))
y_test = y_test.reshape((-1, 1))
print(y_train.shape, y_val.shape, y_test.shape, sep='\n')

(31608, 1)
(10536, 1)
(10537, 1)


### Changing token sizes

We begin by reducing each feature matrix to its first 500 token columns.

In [None]:
max_length = 500
# Keep only the first `max_length` token columns of each sparse feature matrix.
X_train = X_train[:, :max_length]
X_val = X_val[:, :max_length]
X_test = X_test[:, :max_length]

# The LSTM tensors were built from the untruncated matrices earlier in the
# notebook, so rebuild them here; otherwise the truncation never reaches the
# models trained below.
X_train_lstm = reshape_for_lstm(X_train)
X_val_lstm = reshape_for_lstm(X_val)
X_test_lstm = reshape_for_lstm(X_test)

Because we intend to train different RNN variants, we prepare the dataset in different ways, such as converting the sparse matrices to dense ones and reshaping them to be 3-dimensional, as in the case of an LSTM:

However, our labels / response variables don't need specialized transformation:

## LSTM Model

In [None]:
def build_lstm(X_train, y_train, X_val, y_val, truncate_to: int = 40000,
               units: int = 50, epochs: int = 5, batch_size: int = 16,
               patience: int = 2):
  """Build, train and evaluate a single-layer LSTM binary classifier.

  Args:
    X_train: 3-D training array; axes 1 and 2 define the LSTM input shape.
    y_train: binary training labels.
    X_val: validation array with the same trailing shape as ``X_train``.
    y_val: binary validation labels.
    truncate_to: unused; kept only so existing callers keep working.
    units: number of LSTM units.
    epochs: maximum number of training epochs.
    batch_size: number of samples processed per gradient step.
    patience: epochs without ``val_loss`` improvement before early stopping.

  Returns:
    dict with keys ``'predictions'`` (model outputs on ``X_val``),
    ``'model'`` (the trained Keras model), ``'loss'`` and ``'accuracy'``
    (both measured on the validation data).
  """
  # Reset Keras global state and collect garbage so repeated calls start clean
  tf.keras.backend.clear_session()
  gc.collect()

  model = Sequential()

  # Declare the input explicitly instead of the legacy layer-level
  # `input_shape` argument.
  model.add(tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])))
  model.add(LSTM(units=units, activation='relu'))

  # One sigmoid output node for binary classification
  model.add(Dense(1, activation='sigmoid'))

  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  # Stop when validation loss stops improving and keep the best weights seen
  early_stopping = EarlyStopping(
      monitor='val_loss', patience=patience, restore_best_weights=True
  )

  model.fit(
      X_train,
      y_train,
      epochs=epochs,
      batch_size=batch_size,
      validation_data=(X_val, y_val),
      callbacks=[early_stopping]
  )

  # Evaluate on the validation split
  loss, accuracy = model.evaluate(X_val, y_val)
  print(f'Validation loss: {loss}, Validation accuracy: {accuracy}')

  # Predicted probabilities for the validation split
  predictions = model.predict(X_val)
  print(predictions)

  return {
      'predictions': predictions,
      'model': model,
      'loss': loss,
      'accuracy': accuracy
  }

In [None]:
# Train the binary LSTM; the result is a dict holding the model and its validation metrics
model = build_lstm(
    X_train=X_train_lstm,
    y_train=y_train,
    X_val=X_val_lstm,
    y_val=y_val,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Validation loss: 0.13945114612579346, Validation accuracy: 0.9501708149909973
[[0.99999946]
 [0.3109116 ]
 [0.9802329 ]
 ...
 [0.8580782 ]
 [0.99998486]
 [0.04035855]]


In [None]:
# Run the binary model against the test data and report accuracy and F1 score.
# build_lstm returns a dict; unwrap it only while `model` is still that dict,
# so this cell can be re-run without failing on `model['model']`.
if isinstance(model, dict):
  model = model['model']
predictions = model.predict(X_test_lstm)

# Round the sigmoid outputs to hard 0/1 labels once and reuse them for both metrics
predicted_labels = predictions.round()
accuracy = accuracy_score(y_test, predicted_labels)
f1_score = f1(y_test, predicted_labels)

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1_score}')

Accuracy: 0.9474233652842365
F1 Score: 0.9616024397005822


Our model generalizes well to the test data, with an accuracy of ~95% and an F1 score of ~96%.

# Multiclass Modeling for Tweets

First, we read in our multiclass labels:

In [None]:
# Number of target classes in the multiclass task
NUM_CLASSES = 7

# Multiclass label arrays, one .npy file per split
y_train = np.load(f'{path_dir}y_train_multi.npy')
y_val = np.load(f'{path_dir}y_val_multi.npy')
y_test = np.load(f'{path_dir}y_test_multi.npy')

In [None]:
# Sanity check: show the label shapes of all three splits
print(y_train.shape, y_val.shape, y_test.shape, sep='\n')

# Each split must contain exactly NUM_CLASSES distinct label values
assert np.unique(y_train).size == NUM_CLASSES
assert np.unique(y_val).size == NUM_CLASSES
assert np.unique(y_test).size == NUM_CLASSES

(31608,)
(10536,)
(10537,)


In [None]:
# convert to one-hot encoding
def _one_hot(labels):
  """Return `labels` one-hot encoded with NUM_CLASSES columns.

  Arrays that already have shape (n, NUM_CLASSES) are returned unchanged, so
  re-running this cell does not encode the labels a second time (which would
  produce a 3-D array).
  """
  if labels.ndim == 2 and labels.shape[1] == NUM_CLASSES:
    return labels
  return to_categorical(labels, num_classes=NUM_CLASSES)

y_train = _one_hot(y_train)
y_val = _one_hot(y_val)
y_test = _one_hot(y_test)

Next, we create a multiclass LSTM model with one output node per class:

In [None]:
def build_lstm_multi(X_train, y_train, X_val, y_val, truncate_to: int = 40000,
                     units: int = 50, epochs: int = 5, batch_size: int = 16,
                     patience: int = 2, num_classes=None):
  """Build, train and evaluate a single-layer LSTM multiclass classifier.

  Args:
    X_train: 3-D training array; axes 1 and 2 define the LSTM input shape.
    y_train: one-hot encoded training labels, one column per class.
    X_val: validation array with the same trailing shape as ``X_train``.
    y_val: one-hot encoded validation labels.
    truncate_to: unused; kept only so existing callers keep working.
    units: number of LSTM units.
    epochs: maximum number of training epochs.
    batch_size: number of samples processed per gradient step.
    patience: epochs without ``val_loss`` improvement before early stopping.
    num_classes: number of output nodes; when None it is taken from the
      width of ``y_train`` instead of relying on a notebook-level global.

  Returns:
    dict with keys ``'predictions'`` (class probabilities on ``X_val``),
    ``'model'`` (the trained Keras model), ``'loss'`` and ``'accuracy'``
    (both measured on the validation data).
  """
  if num_classes is None:
    # One-hot labels carry one column per class
    num_classes = y_train.shape[1]

  # Reset Keras global state and collect garbage so repeated calls start clean
  tf.keras.backend.clear_session()
  gc.collect()

  model = Sequential()

  # Declare the input explicitly instead of the legacy layer-level
  # `input_shape` argument.
  model.add(tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])))
  model.add(LSTM(units=units, activation='relu'))

  # Softmax output layer with one node per class
  model.add(Dense(num_classes, activation='softmax'))

  model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

  # Stop when validation loss stops improving and keep the best weights seen
  early_stopping = EarlyStopping(
      monitor='val_loss', patience=patience, restore_best_weights=True
  )

  model.fit(
      X_train,
      y_train,
      epochs=epochs,
      batch_size=batch_size,
      validation_data=(X_val, y_val),
      callbacks=[early_stopping]
  )

  # Evaluate on the validation split
  loss, accuracy = model.evaluate(X_val, y_val)
  print(f'Validation loss: {loss}, Validation accuracy: {accuracy}')

  # Predicted class probabilities for the validation split
  predictions = model.predict(X_val)
  print(predictions)

  return {
      'predictions': predictions,
      'model': model,
      'loss': loss,
      'accuracy': accuracy
  }

In [None]:
# Train the multiclass LSTM; the result is a dict holding the model and its validation metrics
model = build_lstm_multi(
    X_train=X_train_lstm,
    y_train=y_train,
    X_val=X_val_lstm,
    y_val=y_val,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Validation loss: 0.658083438873291, Validation accuracy: 0.7705011367797852
[[3.24647466e-04 4.29613516e-03 7.29089677e-01 ... 4.57335413e-02
  1.14877336e-03 2.19407141e-01]
 [1.48954997e-02 3.95857496e-03 1.06858590e-03 ... 1.49605144e-02
  4.10000294e-01 1.99537426e-05]
 [3.56217526e-04 6.69010682e-04 4.92762960e-03 ... 6.79569726e-04
  9.91446078e-01 1.52989035e-03]
 ...
 [2.29061581e-02 1.80740021e-02 1.80418700e-01 ... 4.00814414e-02
  3.64792831e-02 5.42077482e-01]
 [9.24810651e-04 3.49389715e-03 1.36335582e-01 ... 2.28603673e-03
  6.75614574e-04 8.56282949e-01]
 [2.35881377e-03 6.92092581e-04 1.09584145e-02 ... 7.75438093e-04
  2.25055544e-03 2.26997957e-02]]


In [None]:
# Run the multiclass model against the test data and report accuracy and F1 score.
# build_lstm_multi returns a dict; unwrap it only while `model` is still that
# dict, so this cell can be re-run without failing on `model['model']`.
if isinstance(model, dict):
  model = model['model']
predictions = model.predict(X_test_lstm)

# Compare class indices rather than rounded probabilities: rounding a softmax
# row gives all zeros whenever no class exceeds 0.5, so such samples were
# counted as wrong even when the most likely class was correct.
true_classes = y_test.argmax(axis=1)
predicted_classes = predictions.argmax(axis=1)

accuracy = accuracy_score(true_classes, predicted_classes)
f1_score = f1(true_classes, predicted_classes, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'F1 Score: {f1_score}')

Accuracy: 0.7322767391098035
F1 Score: 0.7576816517319498
