<a href="https://colab.research.google.com/github/aadityakhant/fakeNewsDetection/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab setup: install the packages this notebook needs before importing them.
# tensorflow-text supplies the ops the BERT preprocessing model depends on.
!pip install -q -U "tensorflow-text==2.8.*"
# tf-models-official supplies the `official` package (official.nlp.optimization).
!pip install -q -U tf-models-official==2.7.0
# NOTE(review): tfds-nightly is not pinned, so every run may pull a different
# build; pin a version if reproducibility matters.
!pip install -U tfds-nightly

In [None]:
import os
import csv
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
import tensorflow_text as text  # A dependency of the preprocessing model
import tensorflow_addons as tfa
from official.nlp import optimization
import numpy as np

# Raise the TensorFlow logger threshold to ERROR so lower-severity messages
# (INFO / WARNING) are not printed in the notebook output.
tf.get_logger().setLevel('ERROR')

In [2]:
def make_bert_preprocess_model(sentence_features, seq_length=128):
  """Build a Keras model that converts raw string features into BERT inputs.

  Each named feature gets its own scalar string input, is tokenized into word
  pieces with the TF Hub `bert_en_uncased_preprocess` tokenizer, and all
  tokenized segments are then packed together by `bert_pack_inputs`.

  Args:
    sentence_features: List of input names. One `tf.keras.layers.Input` of
      dtype string is created per name, in this order.
    seq_length: Sequence length forwarded to `bert_pack_inputs`.

  Returns:
    A `tf.keras.Model` mapping the string inputs to the packed output of
    `bert_pack_inputs`.

  Example (run in its own cell to visualise the preprocessing graph):
    tf.keras.utils.plot_model(
        make_bert_preprocess_model(['title', 'content']), show_shapes=True)
  """
  input_segments = [
      tf.keras.layers.Input(shape=(), dtype=tf.string, name=ft)
      for ft in sentence_features]

  # Tokenize the text to word pieces.
  bert_preprocess = hub.load('https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3')
  tokenizer = hub.KerasLayer(bert_preprocess.tokenize, name='tokenizer')
  tokenized_segments = [tokenizer(s) for s in input_segments]

  # Pack all tokenized segments into a single fixed-length encoder input.
  packer = hub.KerasLayer(bert_preprocess.bert_pack_inputs,
                          arguments=dict(seq_length=seq_length),
                          name='packer')
  model_inputs = packer(tokenized_segments)
  return tf.keras.Model(input_segments, model_inputs)

In [3]:
def build_classifier_model(num_classes):
  """Create the BERT fine-tuning classifier.

  Args:
    num_classes: Number of units in the final dense layer, i.e. how many
      scores the model emits per example.

  Returns:
    A `tf.keras.Model` named "prediction". Its output comes straight from a
    `Dense` layer with no activation, so the values are unnormalised scores.
  """

  class Classifier(tf.keras.Model):
    """Trainable small-BERT encoder followed by dropout and a dense head."""

    def __init__(self, num_classes):
      super().__init__(name="prediction")
      # Encoder weights are fine-tuned together with the head (trainable=True).
      self.encoder = hub.KerasLayer(
          'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
          trainable=True)
      self.dropout = tf.keras.layers.Dropout(0.1)
      self.dense = tf.keras.layers.Dense(num_classes)

    def call(self, preprocessed_text):
      # Only the encoder's "pooled_output" entry is fed to the head.
      pooled = self.encoder(preprocessed_text)["pooled_output"]
      return self.dense(self.dropout(pooled))

  return Classifier(num_classes)

In [4]:
# Label string (third CSV column) -> integer class id.
classes = {'fake':0, 'clickbait':1, 'valid':2, 'bias':3}


def _load_news_split(file_indices):
  """Read the `pickle-<i>.csv` shards for `file_indices` into one split.

  Args:
    file_indices: Iterable of integer shard numbers to read.

  Returns:
    A dict with three parallel lists: 'x1' (first CSV column), 'x2' (second
    CSV column) and 'y' (third column mapped through `classes`).

  Raises:
    KeyError: If a row's label is not a key of `classes`.
    FileNotFoundError: If a shard file does not exist.
  """
  split = {'x1': [], 'x2': [], 'y': []}
  for i in file_indices:
    path = './drive/MyDrive/data2/pickle-' + str(i) + '.csv'
    # `with` closes each shard once it is read; newline='' lets the csv module
    # handle line endings inside quoted fields itself.
    with open(path, 'r', encoding='latin-1', newline='') as file:
      for r in csv.reader(file):
        if not r:
          # csv.reader yields [] for blank lines; nothing to load from them.
          continue
        split['x1'].append(r[0])
        split['x2'].append(r[1])
        split['y'].append(classes[r[2]])
  return split


# The three splits must use disjoint shards. Training stops at shard 319
# because shards 320-359 are the validation split; including them in training
# would leak validation data and make validation accuracy meaningless.
news_train = _load_news_split(range(320))
news_valid = _load_news_split(range(320, 360))
news_test = _load_news_split(range(360, 400))

In [None]:
def load_dataset_from_tfds(in_memory_ds, train, batch_size, bert_preprocess_model):
  """Turn an in-memory split into a batched, preprocessed `tf.data.Dataset`.

  Despite the name, the data comes from `in_memory_ds`, not from TFDS.

  Args:
    in_memory_ds: Dict with keys 'x1', 'x2' and 'y' holding parallel sequences.
    train: If true, the examples are shuffled before batching and the batch
      order is reshuffled on every pass over the dataset.
    batch_size: Number of examples per batch.
    bert_preprocess_model: Model called on `[x1_batch, x2_batch]` to produce
      the encoder inputs.

  Returns:
    A finite dataset (no `.repeat()`) yielding
    `(bert_preprocess_model([x1, x2]), y)` per batch.
  """
  dataset = tf.data.Dataset.from_tensor_slices(in_memory_ds)
  if train:
    # Mix examples before they are grouped into batches.
    dataset = dataset.shuffle(len(dataset))
  dataset = dataset.batch(batch_size)
  dataset = dataset.map(lambda x: (bert_preprocess_model([x['x1'],x['x2']]), x['y']))
  # Cache the tokenized batches so preprocessing runs only on the first pass.
  dataset = dataset.cache()
  if train:
    # cache() replays exactly the same elements on every later pass, which
    # would freeze the order produced by the shuffle above. Shuffling again
    # after the cache gives each epoch a different batch order.
    dataset = dataset.shuffle(len(dataset))
  return dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Hyper-parameters for fine-tuning.
epochs = 3
batch_size = 32
init_lr = 2e-5

# Two string inputs; the loader feeds 'x1' to the first and 'x2' to the second.
bert_preprocess_model = make_bert_preprocess_model(['title', 'content'])

train_dataset = load_dataset_from_tfds(
    news_train, train=True, batch_size=batch_size,
    bert_preprocess_model=bert_preprocess_model)
# Take the step counts from the datasets themselves instead of hard-coded
# sample counts. The datasets are finite (the loader never calls .repeat()),
# so requesting more steps than there are batches would exhaust the input, and
# the learning-rate schedule below must match the steps actually run.
steps_per_epoch = len(train_dataset)
num_train_steps = steps_per_epoch * epochs
# Warm the learning rate up over the first 10% of training.
num_warmup_steps = num_train_steps // 10

validation_dataset = load_dataset_from_tfds(
    news_valid, train=False, batch_size=batch_size,
    bert_preprocess_model=bert_preprocess_model)
validation_steps = len(validation_dataset)

metrics = tf.keras.metrics.SparseCategoricalAccuracy(
      'accuracy', dtype=tf.float32)
# The classifier's last layer has no activation, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# One output unit per entry of the label mapping.
classifier_model = build_classifier_model(len(classes))

optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw')

classifier_model.compile(optimizer=optimizer, loss=loss, metrics=[metrics])

In [None]:
# Fine-tune on the training split and evaluate on the held-out validation
# split after every epoch. Passing the validation set as `x` would train on
# the same data that is used for validation.
classifier_model.fit(
    x=train_dataset,
    validation_data=validation_dataset,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    validation_steps=validation_steps)