# 
<a id="data-prep"></a>
<div style="background-color: #000D5B; color: white; text-align: center; padding: 6px 0 22px 0">
    <h3 style="background-color: #000D5B; color: white; text-align: left">RMIT School of Computer Science and Technology</h3>
    <br/>
    <h1>COSC3007: Deep Learning</h1>
    <h2>Assignment 2: Stance Twitter Sentiment Analysis and Detection </h2>
    <h2> [2] MODELLING AND MODEL EVALUATIONS </h2>

</div>

# Import Libs

In [1]:
import pandas as pd 
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import os
import pathlib
import shutil
import tempfile
import re

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

import numpy as np

# [1] Load Data

In [2]:
train_df= pd.read_csv('./StanceDataset/TrainFileCleaned.csv',sep=',',encoding = 'unicode_escape')
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
train_ori = train_df.copy()

test_df= pd.read_csv('./StanceDataset/TestFileCleaned.csv',sep=',',encoding = 'unicode_escape')

# Filter out rows where the 'target' column contains 'Donald Trump'
filtered_test_df = test_df[~test_df['Target'].str.contains("Donald Trump", case=False, na=False)]

# Save the filtered dataset
test_df = filtered_test_df.copy()

# [2] Prepare label and fit data

## Split test and val

## Tweets

In [3]:
# Load GloVe embeddings
embedding_dim = 100  # This should match the GloVe
embeddings_index = {}
with open("./glove.6B.100d.txt", encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [4]:
#Replace '@' and '#' with a space in each tweet of train and test set and save to new lists
list_clean = list()
test_clean = list()
lines = train_df["preprocessed_tweet"].values.tolist()
testlines = test_df["preprocessed_tweet"].values.tolist()

for line in lines:
    line = re.sub(r'@','', line)
    line = re.sub(r'#','', line)
    words = line.split(" ")
    list_clean.append(words)
    
for line in testlines:
    line = re.sub(r'@','', line)
    line = re.sub(r'#','', line)
    words = line.split(" ")
    test_clean.append(words) 

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Initialize tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list_clean)

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(list_clean)
test_sequences = tokenizer.texts_to_sequences(test_clean)

# Find the maximum sequence length
max_length = max(max(len(seq) for seq in train_sequences), max(len(seq) for seq in test_sequences))
print(f"Maximum sequence length: {max_length}")

# Get word index and vocabulary size
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1
print(f"Unique tokens: {len(word_index)}")
print(f"Vocabulary size: {vocab_size}")

# Pad sequences
train_pad = pad_sequences(train_sequences, maxlen=max_length, padding='post')
test_pad = pad_sequences(test_sequences, maxlen=max_length, padding='post')

# Extract target labels
train_stance = train_df["Stance"].values
train_target = train_df["Target"].values
test_stance = test_df["Stance"].values

# Create an embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Maximum sequence length: 17
Unique tokens: 6432
Vocabulary size: 6433


In [6]:
import numpy as np

validation_split = 0.20

# Define the number of validation samples
num_validation_samples = int(validation_split * train_pad.shape[0])

# Shuffle the training data
indices = np.arange(train_pad.shape[0])
np.random.shuffle(indices)
train_pad = train_pad[indices]
train_stance = train_stance[indices]
train_target = train_target[indices]

# One-hot encoding for stance and target classes
n_values_stance = np.max(train_stance) + 1
n_values_target = np.max(train_target) + 1
n_values_test = np.max(test_stance) + 1

train_stance_labels = np.eye(n_values_stance)[train_stance]
train_target_labels = np.eye(n_values_target)[train_target]
test_stance_labels = np.eye(n_values_test)[test_stance]

# Split the training data into train and validation sets
val_pad = train_pad[-num_validation_samples:]
val_stance_labels = train_stance_labels[-num_validation_samples:]
val_target_labels = train_target_labels[-num_validation_samples:]

train_pad = train_pad[:-num_validation_samples]
train_stance_labels = train_stance_labels[:-num_validation_samples]
train_target_labels = train_target_labels[:-num_validation_samples]

# Test dataset remains unchanged
test_pad_labels = test_stance_labels

In [7]:
#Print shape of train val test split for both target and stance
print('Shape of train_lad:', train_pad.shape)
print('Shape of train_target:', train_target_labels.shape)
print('Shape of train_stance:', train_stance_labels.shape)

print('Shape of val_pad:', val_pad.shape)
print('Shape of val_target:', val_target_labels.shape)
print('Shape of val_stance:', val_stance_labels.shape)

print('Shape of test_pad:', test_pad.shape)
print('Shape of test_labels:', test_pad_labels.shape)


Shape of train_lad: (2332, 17)
Shape of train_target: (2332, 5)
Shape of train_stance: (2332, 3)
Shape of val_pad: (582, 17)
Shape of val_target: (582, 5)
Shape of val_stance: (582, 3)
Shape of test_pad: (1249, 17)
Shape of test_labels: (1249, 3)


# [3] Create Model

In [8]:
from keras import backend as K
from tensorflow.keras.optimizers.legacy import Adam
from hyperopt import hp
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import keras_tuner as kt

Create f1 metrics

In [9]:
def f1_score_class(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

def f1_score(y_true, y_pred):
    # Convert predictions to one-hot format
    y_pred_one_hot = K.one_hot(K.argmax(y_pred), num_classes=3)

    # Calculate F1 score for each class
    f1s = [f1_score_class(y_true[:, i], y_pred_one_hot[:, i]) for i in range(3)]

    # Average F1 scores across all classes
    return K.mean(K.stack(f1s), axis=0)

Create custom adam rate

In [10]:
def learn_adam(lr):
    STEPS_PER_EPOCH = 73
    lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate=lr,
        decay_steps=STEPS_PER_EPOCH * 1000,
        decay_rate=10,
        staircase=False)

    # Visualize the learning rate curve
    steps = np.linspace(0, 100000)
    plt.figure(figsize=(8, 6))
    plt.plot(steps / STEPS_PER_EPOCH, lr_schedule(steps))
    plt.ylim([0, max(plt.ylim())])
    plt.xlabel('Epoch')
    plt.ylabel('Learning Rate')

    return Adam(learning_rate=lr_schedule)

Create Call backs

Set up call backs and learning rates

In [11]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, Bidirectional, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers.legacy import Adam

# Define the EarlyStopping and ReduceLROnPlateau callbacks
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=1,
    restore_best_weights=True,
)

lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=5,
    min_lr=1e-5,
    verbose=1,
)

# Define the ModelCheckpoint callback
checkpoint_path = "transfer_learning_baseline_weight.h5"
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,    # Only save a model if 'val_loss' has improved
    monitor='val_loss',     # Monitor 'val_loss' during training
    mode='min',             # The model is saved when 'val_loss' is minimized
    verbose=1)

# Combine all callbacks in a list
callbacks = [
    early_stopping,
    lr_scheduler,
    model_checkpoint_callback
]

Set up call backs for trials

In [12]:
#Specify location to save model checkpoints
checkpoint_path = './'
checkpoint_dir = os.path.dirname(checkpoint_path)

#function definition for model callback; 
#Params {'name' : Name of the model, 'early_stop' : Specify early stopping (True/False) } 

def get_callbacks(name,early_stop=True):
  if early_stop:
      return [
              tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10,restore_best_weights=True),
              tf.keras.callbacks.TensorBoard(logdir/name, histogram_freq=60, embeddings_freq=60),
              tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path+name,
                                                 save_weights_only=True,
                                                 verbose=1)
      ]

  else:
    return [tf.keras.callbacks.TensorBoard(logdir/name)]
  
#Load the TensorBoard
logdir = pathlib.Path(tempfile.mkdtemp())/"tensorboard_logs"
shutil.rmtree(logdir, ignore_errors=True)

# Load the TensorBoard notebook extension
%load_ext tensorboard

# Open an embedded TensorBoard viewer
%tensorboard --logdir {logdir}/models


ERROR: Failed to launch TensorBoard (exited with 1).
Contents of stderr:
Address already in use
Port 6006 is in use by another program. Either identify and stop that program, or start the server with a different port.

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization, Bidirectional, Flatten
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers.legacy import Adam

def build_model(isStance, drop, regr):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=False))
    model.add(Bidirectional(LSTM(units=128,  dropout=drop, recurrent_dropout=drop,return_sequences=True,kernel_regularizer=tf.keras.regularizers.l2(regr))))
    model.add(Flatten())
    model.add(Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(regr)))
    model.add(Dropout(drop))
        
    if isStance:
    # Final Layer
        model.add(Dense(3, activation='softmax'))
    else:
        model.add(Dense(5, activation='softmax'))
    
      # Define the optimizer within the function
    optimizer = tf.keras.optimizers.Adam()

    model.compile(loss='CategoricalCrossentropy', optimizer=optimizer, metrics=['categorical_accuracy',f1_score])
    model.summary()

    return model


# [4] Training For Baseline Target

Training for base target model

In [14]:
# EPOCH
EPOCH = 100
#Selection of dropouts and lambda values for model training
m_histories = {}

#Fine tuning the hyper parameters on trainable layers
dropout_rate = [0.7, 0.8]
lambda_vals = [0.005, 0.001, 0.0005]

In [15]:
#Fit the model for each combination of lambda and dropout values
for reg_lambda in lambda_vals:
  for drop in dropout_rate:
   tf.keras.backend.clear_session()
   model_target = build_model(False, drop, reg_lambda)
   m_histories['target_with_TL'+ '_hp_' + str(reg_lambda)+str(drop)] = model_target.fit(train_pad, train_target_labels, batch_size=32, epochs=EPOCH, validation_data=(val_pad, val_target_labels), callbacks=get_callbacks('models/base_target_with_TL'+ '_h_' + str(reg_lambda),early_stop=True), verbose=1)



2023-12-29 14:47:38.968543: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2023-12-29 14:47:38.968575: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2023-12-29 14:47:38.968585: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2023-12-29 14:47:38.968637: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-12-29 14:47:38.968664: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 17, 100)           643300    
                                                                 
 bidirectional (Bidirection  (None, 17, 256)           234496    
 al)                                                             
                                                                 
 flatten (Flatten)           (None, 4352)              0         
                                                                 
 dense (Dense)               (None, 128)               557184    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 5)                 645       
                                                        

2023-12-29 14:47:40.808433: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-12-29 14:49:21.002921: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.



Epoch 1: saving model to ./models/base_target_with_TL_h_0.005
Epoch 2/100
Epoch 2: saving model to ./models/base_target_with_TL_h_0.005
Epoch 3/100
Epoch 3: saving model to ./models/base_target_with_TL_h_0.005
Epoch 4/100
Epoch 4: saving model to ./models/base_target_with_TL_h_0.005
Epoch 5/100
Epoch 5: saving model to ./models/base_target_with_TL_h_0.005
Epoch 6/100
Epoch 6: saving model to ./models/base_target_with_TL_h_0.005
Epoch 7/100
Epoch 7: saving model to ./models/base_target_with_TL_h_0.005
Epoch 8/100
Epoch 8: saving model to ./models/base_target_with_TL_h_0.005
Epoch 9/100
Epoch 9: saving model to ./models/base_target_with_TL_h_0.005
Epoch 10/100
Epoch 10: saving model to ./models/base_target_with_TL_h_0.005
Epoch 11/100
Epoch 11: saving model to ./models/base_target_with_TL_h_0.005
Epoch 12/100
Epoch 12: saving model to ./models/base_target_with_TL_h_0.005
Epoch 13/100
Epoch 13: saving model to ./models/base_target_with_TL_h_0.005
Epoch 14/100
Epoch 14: saving model to ./











Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 17, 100)           643300    
                                                                 
 bidirectional (Bidirection  (None, 17, 256)           234496    
 al)                                                             
                                                                 
 flatten (Flatten)           (None, 4352)              0         
                                                                 
 dense (Dense)               (None, 128)               557184    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 5)                 645       
                                                        

2023-12-29 17:06:22.300897: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-12-29 17:08:44.171111: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.



Epoch 1: saving model to ./models/base_target_with_TL_h_0.005
Epoch 2/100
Epoch 2: saving model to ./models/base_target_with_TL_h_0.005
Epoch 3/100
Epoch 3: saving model to ./models/base_target_with_TL_h_0.005
Epoch 4/100
Epoch 4: saving model to ./models/base_target_with_TL_h_0.005
Epoch 5/100
Epoch 5: saving model to ./models/base_target_with_TL_h_0.005
Epoch 6/100
Epoch 6: saving model to ./models/base_target_with_TL_h_0.005
Epoch 7/100
Epoch 7: saving model to ./models/base_target_with_TL_h_0.005
Epoch 8/100
Epoch 8: saving model to ./models/base_target_with_TL_h_0.005
Epoch 9/100
Epoch 9: saving model to ./models/base_target_with_TL_h_0.005
Epoch 10/100
Epoch 10: saving model to ./models/base_target_with_TL_h_0.005
Epoch 11/100
Epoch 11: saving model to ./models/base_target_with_TL_h_0.005
Epoch 12/100
Epoch 12: saving model to ./models/base_target_with_TL_h_0.005
Epoch 13/100
Epoch 13: saving model to ./models/base_target_with_TL_h_0.005
Epoch 14/100
Epoch 14: saving model to ./











Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 17, 100)           643300    
                                                                 
 bidirectional (Bidirection  (None, 17, 256)           234496    
 al)                                                             
                                                                 
 flatten (Flatten)           (None, 4352)              0         
                                                                 
 dense (Dense)               (None, 128)               557184    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 5)                 645       
                                                        

2023-12-29 20:41:09.270201: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-12-29 20:43:39.996177: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.



Epoch 1: saving model to ./models/base_target_with_TL_h_0.001
Epoch 2/100
Epoch 2: saving model to ./models/base_target_with_TL_h_0.001
Epoch 3/100
Epoch 3: saving model to ./models/base_target_with_TL_h_0.001
Epoch 4/100
Epoch 4: saving model to ./models/base_target_with_TL_h_0.001
Epoch 5/100
Epoch 5: saving model to ./models/base_target_with_TL_h_0.001
Epoch 6/100
Epoch 6: saving model to ./models/base_target_with_TL_h_0.001
Epoch 7/100
Epoch 7: saving model to ./models/base_target_with_TL_h_0.001
Epoch 8/100
Epoch 8: saving model to ./models/base_target_with_TL_h_0.001
Epoch 9/100
Epoch 9: saving model to ./models/base_target_with_TL_h_0.001
Epoch 10/100
Epoch 10: saving model to ./models/base_target_with_TL_h_0.001
Epoch 11/100
Epoch 11: saving model to ./models/base_target_with_TL_h_0.001
Epoch 12/100
Epoch 12: saving model to ./models/base_target_with_TL_h_0.001
Epoch 13/100
Epoch 13: saving model to ./models/base_target_with_TL_h_0.001
Epoch 14/100
Epoch 14: saving model to ./

KeyboardInterrupt: 