<a href="https://colab.research.google.com/github/aadityakhant/fakeNewsDetection/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U "tensorflow-text==2.8.*" --quiet
!pip install -q -U tf-models-official==2.7.0 --quiet
!pip install -U tfds-nightly --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.
tfds-nightly 4.9.4.dev202405020044 requires protobuf>=3.20, but you have protobuf 3.19.6 which is incompatible.[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-gbq 0.19.2 requires google-auth-oauthlib>=0.7.0, but you have google-auth-oauthlib 0.4.6 which is incompatible.
tensorflow 2.8.4 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.3 which is incompatible.[0m[31m
[0m

In [14]:
from IPython.display import HTML, display
def progress(value, max=100):
    return HTML("""
        <progress
            value='{value}'
            max='{max}',
            style='width: 100%'
        >
            {value}
        </progress>
    """.format(value=value, max=max))

In [17]:
import os
import csv
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text  # A dependency of the preprocessing model
import tensorflow_addons as tfa
from official.nlp import optimization
import numpy as np

tf.get_logger().setLevel('ERROR')
out = display(progress(0, 25000), display_id=True)

def make_bert_preprocess_model(sentence_features, seq_length=128):

  input_segments = [
      tf.keras.layers.Input(shape=(), dtype=tf.string, name=ft)
      for ft in sentence_features]

  # Tokenize the text to word pieces.
  bert_preprocess = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
  tokenizer = hub.KerasLayer(bert_preprocess.tokenize, name='tokenizer')
  truncated_segments = [tokenizer(s) for s in input_segments]
  packer = hub.KerasLayer(bert_preprocess.bert_pack_inputs,
                          arguments=dict(seq_length=seq_length),
                          name='packer')
  model_inputs = packer(truncated_segments)
  return tf.keras.Model(input_segments, model_inputs)


bert_preprocess_model = make_bert_preprocess_model(['title', 'content'])

def build_classifier_model(num_classes):

  class Classifier(tf.keras.Model):
    def __init__(self, num_classes):
      super(Classifier, self).__init__(name="prediction")
      self.encoder = hub.KerasLayer('https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1', trainable=True)
      self.dropout = tf.keras.layers.Dropout(0.1)
      self.dense = tf.keras.layers.Dense(num_classes)

    def call(self, preprocessed_text):
      encoder_outputs = self.encoder(preprocessed_text)
      pooled_output = encoder_outputs["pooled_output"]
      x = self.dropout(pooled_output)
      x = self.dense(x)
      return x

  model = Classifier(num_classes)
  return model

def data_loader(l, r, length, bert_preprocess_model):
  classes = {'fake':0, 'clickbait':1, 'valid':2, 'bias':3}
  news={'input_mask':[], 'input_type_ids':[], 'input_word_ids':[]}
  labels = []
  for i in range(l,r):
      out.update(progress(i, 50))
      file = open('./drive/MyDrive/data2/pickle-'+str(i)+'.csv', 'r', encoding='latin-1')
      reader = csv.reader(file)
      for r in reader:
        pre = bert_preprocess_model([np.array([r[0]]), np.array([r[1]])])
        news['input_mask'].append(pre['input_mask'])
        news['input_type_ids'].append(pre['input_type_ids'])
        news['input_word_ids'].append(pre['input_word_ids'])
        labels.append(classes[r[2]])
      file.close()
  news['input_mask'] = tf.reshape(news['input_mask'], (length, 128))
  news['input_type_ids'] = tf.reshape(news['input_type_ids'], (length, 128))
  news['input_word_ids'] = tf.reshape(news['input_word_ids'], (length, 128))
  return (news,labels)

news_train = data_loader(0,50,25000,bert_preprocess_model)
news_valid = data_loader(320,322,1000,bert_preprocess_model)

def load_dataset_from_tfds(in_memory_ds, train, batch_size, ):
  dataset = tf.data.Dataset.from_tensor_slices(in_memory_ds)
  if train:
    dataset = dataset.shuffle(len(dataset))
  dataset = dataset.batch(batch_size)
  dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
  return dataset


In [18]:
epochs = 10
batch_size = 32
init_lr = 2e-5

train_dataset = load_dataset_from_tfds(news_train, train=True, batch_size=batch_size)
steps_per_epoch = 25000 // batch_size
num_train_steps = steps_per_epoch * epochs
#num_warmup_steps = num_train_steps // 10

validation_dataset = load_dataset_from_tfds(news_valid, train=False, batch_size=batch_size, )
validation_steps = 1000 // batch_size

metrics = tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

classifier_model = build_classifier_model(4)

optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw')

classifier_model.compile(optimizer=optimizer, loss=loss, metrics=[metrics])

checkpoint_path = "./drive/MyDrive/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

In [19]:
h = classifier_model.fit(
      x=train_dataset,
      validation_data=validation_dataset,
      steps_per_epoch=steps_per_epoch,
      epochs=epochs,
      validation_steps=validation_steps,
      callbacks=[cp_callback])

Epoch 1/10
Epoch 1: saving model to ./drive/MyDrive/cp.ckpt
Epoch 2/10
  1/781 [..............................] - ETA: 36:19 - loss: 0.9866 - accuracy: 0.6250
Epoch 2: saving model to ./drive/MyDrive/cp.ckpt


In [None]:
news_test = data_loader(360,400,20000,bert_preprocess_model)

In [9]:
test_dataset = load_dataset_from_tfds(news_test, train=False, batch_size=batch_size, )
validation_steps = 20000 // batch_size

In [None]:
classifier_model.evaluate(test_dataset)

