<a href="https://colab.research.google.com/github/arnabbiswas66/multimodal-fake-news-classifier/blob/main/Twitter_multimodal_classifier_Scaled_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
# Install tensorflow_text and talos (both are imported in the next cell);
# -q keeps pip's output quiet. The commented-out line is an alternative that
# pins tensorflow-text to the 2.11 series.
#!pip install -q -U "tensorflow-text==2.11.*"
!pip install -q tensorflow_text
!pip install -q talos

In [1]:
from tensorflow.python.platform.tf_logging import warn
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import os
import re
from os import listdir
import shutil
import glob
import random
import matplotlib.pyplot as plt
import talos as ta
import tensorflow_hub as hub
import tensorflow_text as text
tf.get_logger().setLevel('ERROR')
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive; the paths configured in the next cell
# (BASE_SAVE_LOCATION, CHECKPOINT_FILEPATH) live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Project root on the mounted Drive; the pickled dataframes are read from here.
BASE_SAVE_LOCATION = "/content/drive/MyDrive/multimodal-news"
# Checkpoint directory, derived from the project root.
CHECKPOINT_FILEPATH = BASE_SAVE_LOCATION + "/twitter-models/"

## Load the dataframes

In [3]:
# Read the pickled train and test dataframes from Drive and report their shapes.
# NOTE(review): read_pickle unpickles arbitrary objects; only load trusted files.
train_df_model, test_df_model = (
    pd.read_pickle(f"{BASE_SAVE_LOCATION}/twitter-{split}.pickle")
    for split in ("train", "test")
)
print(train_df_model.shape, test_df_model.shape)

(14258, 3) (1923, 3)


In [4]:
# Talos does not accept a TF Dataset as input, so keep plain feature/label
# splits alongside the tf.data pipelines built further down.
X_train = train_df_model.drop(columns="label")
y_train = train_df_model["label"].copy()

X_test = test_df_model.drop(columns="label")
y_test = test_df_model["label"].copy()

## Data input pipeline

In [5]:
# TF Hub handles for the small BERT encoder and the matching preprocessor.
bert_model_path = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1"
bert_preprocess_path = (
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
)

In [6]:
def make_bert_preprocessing_model(sentence_features, seq_length=224):
    """Build a Keras model that turns raw strings into packed BERT inputs.

    Args:
        sentence_features: List of names of the string-valued features; one
            scalar string input is created per name, in this order.
        seq_length: Integer sequence length handed to the packer.

    Returns:
        A Keras Model that takes the string inputs (ordered as in
        `sentence_features`) and returns the packer's output, i.e. the dict
        of tensors that is fed to BERT.
    """
    string_inputs = []
    for feature_name in sentence_features:
        string_inputs.append(
            tf.keras.layers.Input(shape=(), dtype=tf.string, name=feature_name)
        )

    # The preprocessing SavedModel provides both the tokenizer and the packer.
    preprocessor = hub.load(bert_preprocess_path)

    # Step 1: tokenize every input segment.
    tokenize = hub.KerasLayer(preprocessor.tokenize, name="tokenizer")
    token_segments = [tokenize(segment) for segment in string_inputs]

    # Step 2 (skipped): no custom trimming; the packer applies its own
    # default truncation to fit seq_length.

    # Step 3: pack the segments; start/end token ids and the layout of the
    # output dict come from the SavedModel itself.
    pack = hub.KerasLayer(
        preprocessor.bert_pack_inputs,
        arguments={"seq_length": seq_length},
        name="packer",
    )
    packed_inputs = pack(token_segments)
    return keras.Model(string_inputs, packed_inputs)


In [7]:
# Build the preprocessing model for a single string input named "text_1"
# (default seq_length of 224) and print its layer summary.
bert_preprocess_model = make_bert_preprocessing_model(["text_1"])
bert_preprocess_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_1 (InputLayer)         [(None,)]                 0         
                                                                 
 tokenizer (KerasLayer)      (None, None, None)        0         
                                                                 
 packer (KerasLayer)         {'input_mask': (None, 22  0         
                             4),                                 
                              'input_word_ids': (None            
                             , 224),                             
                              'input_type_ids': (None            
                             , 224)}                             
                                                                 
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Sanity check: run one randomly chosen training tweet through the
# preprocessing model and show the first 16 positions of each output.
idx = np.random.choice(len(train_df_model))
row = train_df_model.iloc[idx]
sample_text_1 = sample_text_2 = row["tweetText"]
print(f"Text 1: {sample_text_1}")
print(f"Text 2: {sample_text_2}")

test_text = [np.array([sample_text_1])]
text_preprocessed = bert_preprocess_model(test_text)

print("Keys           : ", list(text_preprocessed.keys()))
for shape_label, values_label, key in (
    ("Shape Word Ids : ", "Word Ids       : ", "input_word_ids"),
    ("Shape Mask     : ", "Input Mask     : ", "input_mask"),
    ("Shape Type Ids : ", "Type Ids       : ", "input_type_ids"),
):
    print(shape_label, text_preprocessed[key].shape)
    print(values_label, text_preprocessed[key][0, :16])


Text 1: #SANDY ~ Prayers go out to the people affected in Hoboken, NJ http://t.co/Ax9qluv5
Text 2: #SANDY ~ Prayers go out to the people affected in Hoboken, NJ http://t.co/Ax9qluv5
Keys           :  ['input_mask', 'input_word_ids', 'input_type_ids']
Shape Word Ids :  (1, 224)
Word Ids       :  tf.Tensor(
[  101  1001  7525  1066 12583  2175  2041  2000  1996  2111  5360  1999
  7570  5092  7520  1010], shape=(16,), dtype=int32)
Shape Mask     :  (1, 224)
Input Mask     :  tf.Tensor([1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1], shape=(16,), dtype=int32)
Shape Type Ids :  (1, 224)
Type Ids       :  tf.Tensor([0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(16,), dtype=int32)


In [9]:
def dataframe_to_dataset(dataframe):
    """Turn a dataframe into a shuffled tf.data.Dataset of (features, label).

    Only the 'tweetText', 'image_1' and 'label' columns are used. The label
    column becomes the target; the remaining columns form the feature dict.
    The shuffle buffer covers every row of the dataframe.
    """
    subset = dataframe[['tweetText', 'image_1', 'label']].copy()
    targets = subset.pop("label")
    features = dict(subset)
    return tf.data.Dataset.from_tensor_slices((features, targets)).shuffle(
        buffer_size=len(subset)
    )

## Preprocessing utilities

In [10]:
# Target size passed to tf.image.resize in preprocess_image.
resize = (224, 224)
# Keys picked out of the BERT preprocessing output in preprocess_text.
bert_input_features = [
    "input_word_ids",
    "input_type_ids",
    "input_mask",
]

In [11]:
def preprocess_image(image_path):
  """Read an image file, decode it to 3 channels and resize it to `resize`.

  Args:
    image_path: Path of the image file (string or scalar string tensor).

  Returns:
    The decoded image resized to `resize` (output of tf.image.resize).
  """
  image = tf.io.read_file(image_path)
  # decode_image detects the format (BMP, GIF, JPEG or PNG) from the file
  # contents, so upper-case or mislabelled extensions decode correctly.
  # expand_animations=False keeps only the first frame of an animated GIF,
  # which guarantees a rank-3 image for the resize below.
  image = tf.io.decode_image(image, channels=3, expand_animations=False)
  image = tf.image.resize(image, resize)
  return image

In [12]:
def preprocess_text(text_1):
  """Run a single text through the BERT preprocessing model.

  The text is wrapped into a batch of one, preprocessed, and each output
  listed in `bert_input_features` is squeezed before being returned.

  Returns:
    Dict mapping every name in `bert_input_features` to its squeezed tensor.
  """
  batch_of_one = tf.convert_to_tensor([text_1])
  packed = bert_preprocess_model([batch_of_one])
  return {name: tf.squeeze(packed[name]) for name in bert_input_features}

In [13]:
def preprocess_text_and_image(sample):
  """Preprocess one example's image path and tweet text.

  Args:
    sample: Mapping with an "image_1" file path and a "tweetText" string.

  Returns:
    Dict with the preprocessed image under "image_1" and the dict of BERT
    inputs under "text".
  """
  return {
      "image_1": preprocess_image(sample["image_1"]),
      "text": preprocess_text(sample["tweetText"]),
  }

In [14]:
# Batch size used by prepare_dataset when batching the datasets.
batch_size = 32
# Let tf.data choose the prefetch buffer size dynamically.
auto = tf.data.AUTOTUNE

In [15]:
def prepare_dataset(dataframe, training=True):
  """Build the cached, batched and prefetched input pipeline for a dataframe.

  Args:
    dataframe: Dataframe with 'tweetText', 'image_1' and 'label' columns.
    training: When True, the cached examples are reshuffled on every epoch.

  Returns:
    A tf.data.Dataset of batches of (preprocessed features, label), batched
    with the module-level `batch_size`.
  """
  ds = dataframe_to_dataset(dataframe)
  ds = ds.map(lambda x, y: (preprocess_text_and_image(x), y)).cache()
  if training:
    # Shuffle AFTER cache(): a shuffle placed before the cache is frozen
    # into it, so every later epoch would replay the first epoch's order.
    # The buffer is sized from the dataframe actually passed in rather than
    # from the global training frame.
    ds = ds.shuffle(len(dataframe))
  ds = ds.batch(batch_size).prefetch(auto)
  return ds

In [16]:
# Build the input pipelines. The training split uses the default
# training=True; the test split passes False. The validation pipeline is
# left commented out, so only train and test datasets exist.
train_ds = prepare_dataset(train_df_model)
#validation_ds = prepare_dataset(val_df_model, False)
test_ds = prepare_dataset(test_df_model, False)

## Model Building

### Projection

In [17]:
def project_embeddings(
    embeddings, num_projection_layers, projection_dims, dropout_rate
):
    """Project embeddings to `projection_dims` and refine them residually.

    A Dense layer first maps the embeddings to `projection_dims`. Each of the
    `num_projection_layers` rounds then applies GELU -> Dense -> Dropout,
    adds the result back to the round's input and layer-normalizes the sum.

    Returns:
        The projected embeddings after the last round (or straight after the
        initial Dense layer when `num_projection_layers` is 0).
    """
    current = keras.layers.Dense(units=projection_dims)(embeddings)
    for _ in range(num_projection_layers):
        branch = tf.nn.gelu(current)
        branch = keras.layers.Dense(projection_dims)(branch)
        branch = keras.layers.Dropout(dropout_rate)(branch)
        # Residual connection around the branch, then normalization.
        summed = keras.layers.Add()([current, branch])
        current = keras.layers.LayerNormalization()(summed)
    return current


### Vision encoder

In [18]:
def create_vision_encoder(
    num_projection_layers, projection_dims, dropout_rate, trainable=False
):
    """Build the image tower: EfficientNetV2B3 backbone plus projection head.

    Args:
        num_projection_layers: Number of residual projection rounds.
        projection_dims: Width of the projected embedding.
        dropout_rate: Dropout rate used inside the projection head.
        trainable: Whether the backbone layers are trainable.

    Returns:
        A Keras Model named "vision_encoder" mapping a (224, 224, 3) image
        input named "image_1" to its projected embedding.
    """
    # Load the ImageNet-pretrained EfficientNetV2B3 backbone with global
    # average pooling and without the classification head.
    backbone = keras.applications.EfficientNetV2B3(
        include_top=False, weights="imagenet", pooling="avg"
    )
    # Set the trainability of the base encoder.
    for layer in backbone.layers:
        layer.trainable = trainable

    # Receive the images as inputs.
    image_1 = keras.Input(shape=(224, 224, 3), name="image_1")

    # EfficientNetV2 rescales its input internally (include_preprocessing
    # defaults to True) and expects raw [0, 255] pixels. Its preprocess_input
    # is therefore a pass-through; applying the ResNetV2 preprocessing here
    # would squash pixels to [-1, 1] before the built-in rescaling.
    preprocessed_1 = keras.applications.efficientnet_v2.preprocess_input(image_1)

    # Generate the image embeddings with the backbone.
    embeddings = backbone(preprocessed_1)

    # Project the embeddings produced by the model.
    outputs = project_embeddings(
        embeddings, num_projection_layers, projection_dims, dropout_rate
    )
    # Create the vision encoder model.
    return keras.Model([image_1], outputs, name="vision_encoder")

### Text Encoder

In [19]:
def create_text_encoder(
    num_projection_layers, projection_dims, dropout_rate, trainable=False
):
    """Build the text tower: TF Hub BERT encoder plus projection head.

    Args:
        num_projection_layers: Number of residual projection rounds.
        projection_dims: Width of the projected embedding.
        dropout_rate: Dropout rate used inside the projection head.
        trainable: Whether the BERT layer is trainable.

    Returns:
        A Keras Model named "text_encoder" mapping the three int32 BERT
        inputs (each of length 224) to the projected pooled output.
    """
    # Pre-trained BERT encoder loaded from TF Hub.
    bert = hub.KerasLayer(bert_model_path, name="bert",)
    bert.trainable = trainable

    # One int32 input per BERT feature, keyed by feature name.
    inputs = {}
    for feature in ("input_type_ids", "input_mask", "input_word_ids"):
        inputs[feature] = keras.Input(shape=(224,), dtype=tf.int32, name=feature)

    # Use BERT's pooled output as the text embedding.
    pooled = bert(inputs)["pooled_output"]

    projected = project_embeddings(
        pooled, num_projection_layers, projection_dims, dropout_rate
    )
    return keras.Model(inputs, projected, name="text_encoder")

### Multi-head attention layer (transformer block)

In [20]:
class TransformerBlock(keras.layers.Layer):
    """Cross-attention transformer block.

    `x` is the attention query and `y` supplies the values (and keys). The
    attention output is added back to `x` and layer-normalized, then passed
    through a two-layer feed-forward network with its own residual
    connection and layer normalization.

    Args:
        embed_dim: Attention key size per head and feed-forward output width.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward net.
        **kwargs: Standard Keras Layer arguments (e.g. `name`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the hyperparameters so get_config() can serialize the layer.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="relu"), keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, x, y, training=None):
        """Attend from `x` to `y`, then apply the feed-forward sub-block."""
        attn_output = self.att(x, y)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved/reloaded."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

## Multimodal model

In [21]:
def create_multimodal_model(
    num_projection_layers=0,
    projection_dims=224,
    dropout_rate=0.1,
    vision_trainable=False,
    text_trainable=False,
    attention=False
):
    """Build the image + text classifier with a single sigmoid output.

    Args:
        num_projection_layers: Residual projection rounds in each encoder.
        projection_dims: Width of each encoder's projected embedding.
        dropout_rate: Dropout rate used in the encoders and on their outputs.
        vision_trainable: Whether the image backbone is trainable.
        text_trainable: Whether the BERT encoder is trainable.
        attention: When True, a cross-attention block (image queries text) is
            added and its output is concatenated to the two projections.

    Returns:
        A Keras Model taking the "image_1" input and the dict of BERT inputs
        and returning one sigmoid probability per example.
    """
    # Receive the images as inputs.
    image_1 = keras.Input(shape=(224, 224, 3), name="image_1")

    # Receive the text as inputs.
    bert_input_features = ["input_type_ids", "input_mask", "input_word_ids"]
    text_inputs = {
        feature: keras.Input(shape=(224,), dtype=tf.int32, name=feature)
        for feature in bert_input_features
    }

    # Create the encoders.
    vision_encoder = create_vision_encoder(
        num_projection_layers, projection_dims, dropout_rate, vision_trainable
    )
    text_encoder = create_text_encoder(
        num_projection_layers, projection_dims, dropout_rate, text_trainable
    )

    # Fetch the embedding projections.
    vision_projections = vision_encoder([image_1])
    vision_projections = keras.layers.Dropout(dropout_rate)(vision_projections)
    text_projections = text_encoder(text_inputs)
    text_projections = keras.layers.Dropout(dropout_rate)(text_projections)

    features = [vision_projections, text_projections]

    # Cross-attention.
    if attention:
        transformer_block = TransformerBlock(projection_dims, 4, projection_dims)
        # Present each projection as a length-1 sequence with projection_dims
        # features: (batch, 1, projection_dims). Expanding the LAST axis
        # instead leaves a feature axis of size 1, and LayerNormalization over
        # a size-1 axis outputs only its bias, which discards the attention
        # result entirely.
        # NOTE: with a single token per modality the attention weights are
        # trivially 1, so the block acts as a learned mixing of the text
        # projection into the image projection.
        attended = transformer_block(
            tf.expand_dims(vision_projections, 1),
            tf.expand_dims(text_projections, 1),
        )
        features.append(tf.keras.layers.Flatten()(attended))

    # Concatenate the projections (and the attention output, if any) and pass
    # them through the classification layer.
    concatenated = keras.layers.Concatenate()(features)
    outputs = keras.layers.Dense(1, activation="sigmoid")(concatenated)
    return keras.Model([image_1, text_inputs], outputs)

In [22]:
# The classifier's final Dense layer already applies a sigmoid, so the loss
# must treat predictions as probabilities rather than logits.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
# Metrics reported during training and validation (default 0.5 threshold).
metrics= [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

## Final Multimodal model

In [23]:
def multimodal_model(X_train, y_train, X_test, y_test, params):
  """Talos model function: build, compile and fit one hyperparameter combination.

  Args:
    X_train, y_train, X_test, y_test: Part of the signature Talos calls; they
      are not used because the tf.data pipelines are rebuilt from the
      module-level `train_df_model` / `test_df_model` dataframes.
    params: Dict with the keys 'num_projection_layers', 'projection_dims',
      'dropout_rate', 'vision_trainable', 'text_trainable', 'attention',
      'lr', 'epochs' and 'batch_size'.

  Returns:
    Tuple of (Keras History, trained model).
  """
  # fit() cannot re-batch a tf.data.Dataset (its batch_size argument only
  # applies to array inputs), so apply the scanned batch size to the
  # pipeline itself.
  train_ds = (
      prepare_dataset(train_df_model)
      .unbatch()
      .batch(params['batch_size'])
      .prefetch(tf.data.AUTOTUNE)
  )
  test_ds = prepare_dataset(test_df_model, False)
  model = create_multimodal_model(params['num_projection_layers'],
    params['projection_dims'],
    params['dropout_rate'],
    params['vision_trainable'],
    params['text_trainable'],
    params['attention'])
  model.compile(
    optimizer=tf.keras.optimizers.Adam(params['lr']), loss=loss, metrics=metrics
  )
  history = model.fit(
      train_ds, validation_data=test_ds,
      epochs=params['epochs'])
  return history, model
  

In [None]:
# Talos search grid without extra projection rounds; every list holds the
# candidate values for one hyperparameter.
params = dict(
    num_projection_layers=[0],
    projection_dims=[128, 224],
    dropout_rate=[0.1, 0.2],
    vision_trainable=[False],
    text_trainable=[False],
    attention=[True],
    lr=[0.001, 0.0005],
    epochs=[10],
    batch_size=[32, 64],
)

In [None]:
# Run the Talos scan over the grid defined above.
h = ta.Scan(
    x=X_train,
    y=y_train,
    params=params,
    model=multimodal_model,
    x_val=X_test,
    y_val=y_test,
    experiment_name='multi1',
    save_weights=False,
    print_params=True,
)


  0%|          | 0/16 [00:00<?, ?it/s][A

{'num_projection_layers': 0, 'projection_dims': 128, 'dropout_rate': 0.1, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.001, 'epochs': 10, 'batch_size': 32}
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/efficientnet_v2/efficientnetv2-b3_notop.h5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



  6%|▋         | 1/16 [12:16<3:04:08, 736.55s/it][A

{'num_projection_layers': 0, 'projection_dims': 128, 'dropout_rate': 0.1, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.001, 'epochs': 10, 'batch_size': 64}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 12%|█▎        | 2/16 [22:56<2:38:33, 679.55s/it][A

{'num_projection_layers': 0, 'projection_dims': 128, 'dropout_rate': 0.1, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.0005, 'epochs': 10, 'batch_size': 32}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 19%|█▉        | 3/16 [34:26<2:28:19, 684.59s/it][A

{'num_projection_layers': 0, 'projection_dims': 128, 'dropout_rate': 0.1, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.0005, 'epochs': 10, 'batch_size': 64}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 25%|██▌       | 4/16 [45:06<2:13:20, 666.70s/it][A

{'num_projection_layers': 0, 'projection_dims': 128, 'dropout_rate': 0.2, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.001, 'epochs': 10, 'batch_size': 32}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 31%|███▏      | 5/16 [55:45<2:00:23, 656.70s/it][A

{'num_projection_layers': 0, 'projection_dims': 128, 'dropout_rate': 0.2, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.001, 'epochs': 10, 'batch_size': 64}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 38%|███▊      | 6/16 [1:07:16<1:51:25, 668.54s/it][A

{'num_projection_layers': 0, 'projection_dims': 128, 'dropout_rate': 0.2, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.0005, 'epochs': 10, 'batch_size': 32}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 44%|████▍     | 7/16 [1:17:55<1:38:49, 658.87s/it][A

{'num_projection_layers': 0, 'projection_dims': 128, 'dropout_rate': 0.2, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.0005, 'epochs': 10, 'batch_size': 64}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 50%|█████     | 8/16 [1:28:35<1:27:03, 652.94s/it][A

{'num_projection_layers': 0, 'projection_dims': 224, 'dropout_rate': 0.1, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.001, 'epochs': 10, 'batch_size': 32}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 56%|█████▋    | 9/16 [1:41:06<1:19:43, 683.40s/it][A

{'num_projection_layers': 0, 'projection_dims': 224, 'dropout_rate': 0.1, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.001, 'epochs': 10, 'batch_size': 64}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 62%|██████▎   | 10/16 [1:52:30<1:08:22, 683.71s/it][A

{'num_projection_layers': 0, 'projection_dims': 224, 'dropout_rate': 0.1, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.0005, 'epochs': 10, 'batch_size': 32}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 69%|██████▉   | 11/16 [2:04:13<57:27, 689.48s/it]  [A

{'num_projection_layers': 0, 'projection_dims': 224, 'dropout_rate': 0.1, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.0005, 'epochs': 10, 'batch_size': 64}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 75%|███████▌  | 12/16 [2:15:42<45:57, 689.44s/it][A

{'num_projection_layers': 0, 'projection_dims': 224, 'dropout_rate': 0.2, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.001, 'epochs': 10, 'batch_size': 32}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 81%|████████▏ | 13/16 [2:27:07<34:24, 688.09s/it][A

{'num_projection_layers': 0, 'projection_dims': 224, 'dropout_rate': 0.2, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.001, 'epochs': 10, 'batch_size': 64}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 88%|████████▊ | 14/16 [2:38:35<22:56, 688.10s/it][A

{'num_projection_layers': 0, 'projection_dims': 224, 'dropout_rate': 0.2, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.0005, 'epochs': 10, 'batch_size': 32}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



 94%|█████████▍| 15/16 [2:50:05<11:28, 688.56s/it][A

{'num_projection_layers': 0, 'projection_dims': 224, 'dropout_rate': 0.2, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.0005, 'epochs': 10, 'batch_size': 64}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10



100%|██████████| 16/16 [3:02:19<00:00, 683.75s/it]


In [24]:
# Talos search grid with one residual projection round and a fixed batch
# size; every list holds the candidate values for one hyperparameter.
params = dict(
    num_projection_layers=[1],
    projection_dims=[128, 224],
    dropout_rate=[0.1, 0.2],
    vision_trainable=[False],
    text_trainable=[False],
    attention=[True],
    lr=[0.001, 0.0005],
    epochs=[10],
    batch_size=[32],
)

In [None]:
# Run the Talos scan over the grid defined above.
h = ta.Scan(
    x=X_train,
    y=y_train,
    params=params,
    model=multimodal_model,
    x_val=X_test,
    y_val=y_test,
    experiment_name='multi1',
    save_weights=False,
    print_params=True,
)

  0%|          | 0/8 [00:00<?, ?it/s]

{'num_projection_layers': 1, 'projection_dims': 128, 'dropout_rate': 0.1, 'vision_trainable': False, 'text_trainable': False, 'attention': True, 'lr': 0.001, 'epochs': 10, 'batch_size': 32}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10