<a href="https://colab.research.google.com/github/adimyth/datascience_stuff/blob/master/nlp/BertTextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Downloading Data

In [1]:
import pandas as pd
from tensorflow.keras.utils import to_categorical

In [None]:
!wget --no-check certificate "https://drive.google.com/uc?export=download&id=1fYV-PPmnJMkW5m9T1WoinDfI7x4P3xDV" -O Fake.zip

In [None]:
!unzip Fake.zip

In [44]:
!wget --no-check certificate "https://drive.google.com/uc?export=download&id=1VcIG3ZwM1Ab6v9_yYISMPvsWxufL4g2I" -O True.zip

In [None]:
!unzip True.zip

* 1:True
* 0:False

In [19]:
true = pd.read_csv("True.csv")
true["target"] = [1]*true.shape[0]

In [20]:
fake = pd.read_csv("Fake.csv")
fake["target"] = [0]*fake.shape[0]

In [44]:
df = pd.concat([true, fake])

In [45]:
df = df.sample(frac=1).reset_index(drop=True)

In [46]:
df.shape[0]

44898

In [47]:
df.head()

Unnamed: 0,title,text,subject,date,target
0,FEEL THE BERN….How Hillary Walked Away From NH...,"First she won the coin toss in Iowa, and now H...",left-news,"Feb 10, 2016",0
1,OPEN BORDERS BERNIE Threatens Sheriff Arpaio F...,Just what America needs another President who ...,left-news,"Mar 20, 2016",0
2,UNHINGED LEFTIST Apologizes To “Refugees” Who ...,"Why stop there? Using liberal logic, shouldn t...",left-news,"Feb 13, 2016",0
3,Obama administration to announce efforts to bo...,DETROIT (Reuters) - The Obama administration w...,politicsNews,"January 13, 2016",1
4,Don Lemon Flipped Larry Wilmore The Bird Afte...,There are quite the number of mixed reviews co...,News,"May 1, 2016",0


# Fake News Classification

* [Fake and real news dataset | Kaggle](https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset)
* [Miachel Kazachok's Jigsaw Toxic Classification Kernel](https://www.kaggle.com/miklgr500/jigsaw-tpu-bert-with-huggingface-and-keras)


In [52]:
# !pip install transformers

In [87]:
import os
import warnings

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers
from tokenizers import BertWordPieceTokenizer
import traitlets

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

warnings.simplefilter("ignore")

In [49]:
RANDOM_SEED = 42

In [50]:
sns.set_style("darkgrid")

In [51]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

## Model

In [52]:
def build_model(transformer, loss='binary_crossentropy', max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    # last hidden state : (batch_size, sequence_length, hidden_size)
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    x = Dropout(0.35)(cls_token)
    out = Dense(2, activation='softmax')(x)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=3e-5), loss=loss, metrics=[tf.keras.metrics.AUC()])
    
    return model

## TPU Configs

In [None]:
# Create strategy from tpu
tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
strategy = tf.distribute.experimental.TPUStrategy(tpu)

In [54]:
AUTO = tf.data.experimental.AUTOTUNE

## Fast Tokenizer

In [55]:
# First load the real tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

# Save the loaded tokenizer locally
save_path = 'distilbert_base_uncased/'
if not os.path.exists(save_path):
    os.makedirs(save_path)
tokenizer.save_pretrained(save_path)

# Reload it with the huggingface tokenizers library
fast_tokenizer = BertWordPieceTokenizer('distilbert_base_uncased/vocab.txt', lowercase=True)

## Train Test Split

In [85]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['target'], 
                                                    test_size=0.2, shuffle=True,
                                                    random_state=RANDOM_SEED)

In [57]:
X_train.shape, y_train.shape

((35918,), (35918,))

In [58]:
X_test.shape, y_test.shape

((8980,), (8980,))

## Encode

In [59]:
X_train = fast_encode(X_train.astype(str), fast_tokenizer, maxlen=512)

HBox(children=(FloatProgress(value=0.0, max=141.0), HTML(value='')))




In [60]:
X_test = fast_encode(X_test.astype(str), fast_tokenizer, maxlen=512)

HBox(children=(FloatProgress(value=0.0, max=36.0), HTML(value='')))




## Tensorflow Datasets

In [61]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train, to_categorical(y_train)))
    .repeat()
    .shuffle(2048)
    .batch(64)
    .prefetch(AUTO)
)

In [62]:
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(X_test)
    .batch(64)
)

## Focal Loss

In [63]:
def focal_loss(gamma=2., alpha=.2):
    def focal_loss_fixed(y_true, y_pred):
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
        return -K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) - K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
    return focal_loss_fixed

In [68]:
%%time
with strategy.scope():
    transformer_layer = transformers.TFBertModel.from_pretrained('bert-base-uncased')
    model = build_model(transformer_layer, loss=focal_loss(gamma=1.5), max_len=512)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_bert_model_1 (TFBertModel ((None, 512, 768), (None, 109482240 
_________________________________________________________________
tf_op_layer_strided_slice_1  [(None, 768)]             0         
_________________________________________________________________
dropout_75 (Dropout)         (None, 768)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
CPU times: user 12.2 s, sys: 4.91 s, total: 17.1 s
Wall time: 43.1 s


## LrScheduler

In [69]:
def build_lrfn(lr_start=0.000001, lr_max=0.000002, 
               lr_min=0.0000001, lr_rampup_epochs=7, 
               lr_sustain_epochs=0, lr_exp_decay=.87):
    lr_max = lr_max * strategy.num_replicas_in_sync

    def lrfn(epoch):
        if epoch < lr_rampup_epochs:
            lr = (lr_max - lr_start) / lr_rampup_epochs * epoch + lr_start
        elif epoch < lr_rampup_epochs + lr_sustain_epochs:
            lr = lr_max
        else:
            lr = (lr_max - lr_min) * lr_exp_decay**(epoch - lr_rampup_epochs - lr_sustain_epochs) + lr_min
        return lr
    
    return lrfn

## Model Training

In [70]:
lrfn = build_lrfn()
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lrfn, verbose=1)

train_history = model.fit(
    train_dataset,
    steps_per_epoch=500,
    callbacks=[lr_schedule],
    epochs=5
)


Epoch 00001: LearningRateScheduler reducing learning rate to 1e-06.
Epoch 1/5

















Epoch 00002: LearningRateScheduler reducing learning rate to 3.142857142857143e-06.
Epoch 2/5

Epoch 00003: LearningRateScheduler reducing learning rate to 5.285714285714285e-06.
Epoch 3/5

Epoch 00004: LearningRateScheduler reducing learning rate to 7.4285714285714275e-06.
Epoch 4/5

Epoch 00005: LearningRateScheduler reducing learning rate to 9.571428571428572e-06.
Epoch 5/5


In [80]:
y_pred = model.predict(test_dataset)

In [81]:
y_pred = np.argmax(y_pred, axis=1)

In [89]:
report_dict = classification_report(y_test, y_pred, output_dict=True)

In [90]:
pd.DataFrame(report_dict)

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.999789,0.999764,0.999777,0.999777,0.999777
recall,0.999789,0.999764,0.999777,0.999777,0.999777
f1-score,0.999789,0.999764,0.999777,0.999777,0.999777
support,4739.0,4241.0,0.999777,8980.0,8980.0


In [91]:
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred)}")

ROC AUC Score: 0.999776595786497
