In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

if IS_COLAB:
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras.layers import TextVectorization
# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

# Common imports
import numpy as np
import os
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

No GPU was detected. LSTMs and CNNs can be very slow without a GPU.


In [2]:
# csv to df
path = os.path.abspath('politifact_balanced_data.csv')
df = pd.read_csv(path, index_col=0)

# specify rows of importance
df = df[["veracity", "statement"]]

# See sample of data
df.head(5)

Unnamed: 0,veracity,statement
3,0,"""Tim Kaine doesn’t want a border at all. He wa..."
5,0,"""The deficit ... is coming down, and it’s comi..."
20,1,"""Migrant mother and ‘crying girl’ on Time cove..."
23,0,"""Fact: Over 90,000 kids were detained under Ob..."
24,0,"""$1 billion—that’s how much Bruce Rauner has w..."


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2165 entries, 3 to 11183
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   veracity   2165 non-null   int64 
 1   statement  2165 non-null   object
dtypes: int64(1), object(1)
memory usage: 50.7+ KB


In [4]:
train=df.sample(frac=0.8,random_state=200) #random state is a seed value
df_validation = df.drop(train.index)
validation=df_validation.sample(frac=0.6,random_state=200)
test = df_validation.drop(validation.index)

In [5]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1732 entries, 698 to 4956
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   veracity   1732 non-null   int64 
 1   statement  1732 non-null   object
dtypes: int64(1), object(1)
memory usage: 40.6+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 173 entries, 70 to 11183
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   veracity   173 non-null    int64 
 1   statement  173 non-null    object
dtypes: int64(1), object(1)
memory usage: 4.1+ KB


# Transforming the statements into tensors

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

train_text = train['statement'].to_numpy()
tok = Tokenizer(oov_token='<unk>')
tok.fit_on_texts(train_text)
tok.word_index['<pad>'] = 0
tok.index_word[0] = '<pad>'

train_seqs = tok.texts_to_sequences(train_text)
train_seqs = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

train_labels = train['veracity'].to_numpy().flatten()


# CONVERT TO TF DATASETS

train_ds = tf.data.Dataset.from_tensor_slices((train_seqs,train_labels))

BATCH_SIZE = 32
BUFFER_SIZE = 7
train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# PREFETCH

train_ds = train_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [7]:
embed_size = 128
num_oov_buckets = 1000
vocab_size=10000
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [8]:
history.history

{'loss': [0.6836201548576355,
  0.4578196406364441,
  0.18188466131687164,
  0.052647653967142105,
  0.031013986095786095],
 'accuracy': [0.550808310508728,
  0.7846420407295227,
  0.9341801404953003,
  0.9815242290496826,
  0.9901847839355469]}

In [9]:
test_text = test['statement'].to_numpy()
test_seqs = tok.texts_to_sequences(test_text)
test_seqs = tf.keras.preprocessing.sequence.pad_sequences(test_seqs, padding='post')

test_labels = test['veracity'].to_numpy().flatten()

results = model.evaluate(test_seqs, test_labels, batch_size=32)



# Avoid Overfitting Through Regularization

In [19]:
train=df.sample(frac=0.65,random_state=200) #random state is a seed value
df_validation = df.drop(train.index)
validation=df_validation.sample(frac=0.6,random_state=200)
test = df_validation.drop(validation.index)

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer

train_text = train['statement'].to_numpy()
tok = Tokenizer(oov_token='<unk>')
tok.fit_on_texts(train_text)
tok.word_index['<pad>'] = 0
tok.index_word[0] = '<pad>'

train_seqs = tok.texts_to_sequences(train_text)
train_seqs = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

train_labels = train['veracity'].to_numpy().flatten()

valid_text = validation['statement'].to_numpy()
valid_seqs = tok.texts_to_sequences(valid_text)
valid_seqs = tf.keras.preprocessing.sequence.pad_sequences(valid_seqs, padding='post')

valid_labels = validation['veracity'].to_numpy().flatten()



# CONVERT TO TF DATASETS

train_ds = tf.data.Dataset.from_tensor_slices((train_seqs,train_labels))
valid_ds = tf.data.Dataset.from_tensor_slices((valid_seqs,valid_labels))

BATCH_SIZE = 32
BUFFER_SIZE = 7
train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
valid_ds = valid_ds.batch(BATCH_SIZE)

# PREFETCH

train_ds = train_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
valid_ds = valid_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [24]:
embed_size = 128
num_oov_buckets = 1000
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

history = model.fit(
    train_ds,
    epochs=5,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [28]:
from functools import partial



model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True, kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.GRU(128, kernel_regularizer=keras.regularizers.l2(0.01)),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

history = model.fit(
    train_ds,
    epochs=5,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [29]:
history1 = model.fit( train_ds,
    epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
history1.history

{'loss': [1.5064618587493896,
  0.7355286478996277,
  0.38134974241256714,
  0.22788342833518982,
  0.1176784411072731],
 'accuracy': [0.596304178237915,
  0.7562189102172852,
  0.8678038120269775,
  0.9317697286605835,
  0.9765458703041077]}

In [31]:
test_text = test['statement'].to_numpy()
test_seqs = tok.texts_to_sequences(test_text)
test_seqs = tf.keras.preprocessing.sequence.pad_sequences(test_seqs, padding='post')

test_labels = test['veracity'].to_numpy().flatten()

In [34]:
results = model.evaluate(test_seqs, test_labels, batch_size=32)



# Adding l2 regularization acctually slightly improved the performance of the model

# Using Dropout 

In [45]:
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

history = model.fit(
    train_ds,
    epochs=7,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


# Using dropout er attained a respectable 68% accuracy on the validation data

In [46]:
results = model.evaluate(test_seqs, test_labels, batch_size=32)



# Tuning parameters in order to obtain better results

In [49]:
model1 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.Dropout(rate=0.4),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.3),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.3),
    keras.layers.GRU(70),
    keras.layers.Dropout(rate=0.1),
    keras.layers.Dense(1, activation="sigmoid")
])
model1.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

history = model1.fit(
    train_ds,
    epochs=7,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


In [52]:
model1 = keras.models.Sequential([
    keras.layers.Embedding( + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.Dropout(rate=0.4),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.3),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.3),
    keras.layers.GRU(128),
    keras.layers.Dropout(rate=0.1),
    keras.layers.Dense(1, activation="sigmoid")
])
model1.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

history = model1.fit(
    train_ds,
    epochs=6,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


#Try decreasing the vocab size

In [54]:
model2 = keras.models.Sequential([
    keras.layers.Embedding(6000 + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.Dropout(rate=0.4),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
model2.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

history = model2.fit(
    train_ds,
    epochs=6,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [56]:
model2 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.Dropout(rate=0.4),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
model2.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

history = model2.fit(
    train_ds,
    epochs=6,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


# Determine if increasing the number of layers improves performance overall

In [58]:
model3 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.Dropout(rate=0.4),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
model3.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

history = model3.fit(
    train_ds,
    epochs=6,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [60]:
model4 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
model4.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

history = model4.fit(
    train_ds,
    epochs=6,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


## Model 2 has the most effective arhitecture so far, so we are going to improve on that 

In [63]:
model21 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.AlphaDropout(rate=0.4),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
model21.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

history = model21.fit(
    train_ds,
    epochs=6,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


# AlphaDropout does not add any improvement over simple Dropout

## adding a more complex optimizer

In [66]:
model22 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.AlphaDropout(rate=0.4),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
optimizer = keras.optimizers.SGD(learning_rate=0.02, momentum=0.8, nesterov=True)
model22.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

history = model22.fit(
    train_ds,
    epochs=6,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


# Adding Max Norm

In [67]:
from functools import partial

In [70]:
MaxNormGRU = partial(keras.layers.GRU,
                       return_sequences=True,
                       kernel_constraint=keras.constraints.max_norm(1.))

model24 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.Dropout(rate=0.4),
    MaxNormGRU(50),
    keras.layers.Dropout(rate=0.2),
    MaxNormGRU(50),
    keras.layers.Dropout(rate=0.2),
    MaxNormGRU(50),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(50),
    keras.layers.AlphaDropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])

model24.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

history = model24.fit(
    train_ds,
    epochs=5,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [73]:
embed_size = 128
num_oov_buckets = 1000
vocab_size=10000
model30 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
model30.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model30.fit(train_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [74]:
results = model30.evaluate(test_seqs, test_labels, batch_size=32)



In [80]:
results2 = model30.evaluate(valid_seqs, valid_labels, batch_size=10)



In [84]:
restults3 = model3.evaluate(test_seqs, test_labels, batch_size=18)



In [86]:
restults4 = model2.evaluate(valid_seqs, valid_labels, batch_size=10)



In [88]:
embed_size = 128
num_oov_buckets = 1000
vocab_size=10000
model33 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True, kernel_constraint=keras.constraints.max_norm(1.)),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True, kernel_constraint=keras.constraints.max_norm(1.)),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
model33.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model33.fit(train_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [89]:
restults5 = model33.evaluate(test_seqs, test_labels, batch_size=18)



In [90]:
restults3 = model3.evaluate(valid_seqs, valid_labels, batch_size=32)

