In [3]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

if IS_COLAB:
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import utils
from tensorflow.keras.layers import TextVectorization
# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

# Common imports
import numpy as np
import os
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

No GPU was detected. LSTMs and CNNs can be very slow without a GPU.


In [11]:
# csv to df
path = os.path.abspath('politifact_balanced_data.csv')
df_political = pd.read_csv(path, index_col=0)

# specify rows of importance
df_political = df_political[["veracity", "statement"]]

# See sample of data
df_political.head(5)

Unnamed: 0,veracity,statement
3,0,"""Tim Kaine doesn’t want a border at all. He wa..."
5,0,"""The deficit ... is coming down, and it’s comi..."
20,1,"""Migrant mother and ‘crying girl’ on Time cove..."
23,0,"""Fact: Over 90,000 kids were detained under Ob..."
24,0,"""$1 billion—that’s how much Bruce Rauner has w..."


In [12]:
path = os.path.abspath(r'C:\Users\catat\OneDrive\Desktop\DataForBachelorThesis\Real-lifeDeceptionDetection2016\Transcription\Truthful\transcripts.csv')
df_small = pd.read_csv(path, index_col=0)

# specify rows of importance
df_small = df_small[["veracity", "statement"]]

# See sample of data
df_small.head(5)

Unnamed: 0,veracity,statement
0,0,No sir I did not. I absolutely did not. No sir...
1,0,"... and she approached me, and at that time th..."
2,0,"No sir I was not, not at all."
3,0,"He had told me that he had had a dream that, a..."
4,0,"And he told me that, ammm â€¦ he was trying to..."


In [13]:
path = os.path.abspath('sevenDataset_cleaned.csv')
df_7t = pd.read_csv(path, index_col=0)

# specify rows of importance
df_7t = df_7t[["veracity", "statement"]]

# See sample of data
df_7t.head(5)

Unnamed: 0,veracity,statement
0,0,There is a great deal of truth to the anti-vax...
1,0,Jenny mccarthy is a learned doctor who deserve...
2,0,Driving doesn\t really require any practice.
3,0,Drinking and driving is a winning and safe com...
4,0,Good hygiene isn\t really important or attract...


# Check the average lenght of a statement from each df  before merging

In [16]:
political_avg_l = df_political['statement'].apply(len).mean()
small_avg_l = df_small['statement'].apply(len).mean()
sevenT_avg_l = df_7t['statement'].apply(len).mean()

In [18]:
print(political_avg_l)
print(small_avg_l)
print(sevenT_avg_l)

107.79260969976906
337.92561983471074
39.43523447401775


# For the smaller Trascripts dataset, the average lenght is way bigger. If we get poor results, we may drop this dataframe

In [23]:
df_united = pd.concat([df_political,df_small])
df_united = pd.concat([df_united, df_7t])

In [24]:
df_united.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6231 entries, 3 to 3944
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   veracity   6231 non-null   int64 
 1   statement  6231 non-null   object
dtypes: int64(1), object(1)
memory usage: 146.0+ KB


# Split the dataframe into train, validation and test

In [25]:
train=df_united.sample(frac=0.7,random_state=200) #random state is a seed value
df_validation = df_united.drop(train.index)
validation=df_validation.sample(frac=0.6,random_state=200)
test = df_validation.drop(validation.index)

In [26]:
train.info()
validation.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4985 entries, 1134 to 3804
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   veracity   4985 non-null   int64 
 1   statement  4985 non-null   object
dtypes: int64(1), object(1)
memory usage: 116.8+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 551 entries, 10072 to 3134
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   veracity   551 non-null    int64 
 1   statement  551 non-null    object
dtypes: int64(1), object(1)
memory usage: 12.9+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 354 entries, 123 to 3932
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   veracity   354 non-null    int64 
 1   statement  354 non-null    object
dtypes: int64(1), object(1)
memory usage: 8.3+ KB


In [33]:
train_avg_l = train['statement'].apply(len).mean()
valid_avg_l = validation['statement'].apply(len).mean()
test_avg_l = test['statement'].apply(len).mean()

In [34]:
#compute the average lenght from each dataset to ensure the random distribution of the data
print(train_avg_l)
print(valid_avg_l)
print(test_avg_l)

68.02908726178535
63.401088929219604
62.090395480225986


In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer

train_text = train['statement'].to_numpy()
tok = Tokenizer(oov_token='<unk>')
tok.fit_on_texts(train_text)
tok.word_index['<pad>'] = 0
tok.index_word[0] = '<pad>'

train_seqs = tok.texts_to_sequences(train_text)
train_seqs = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

train_labels = train['veracity'].to_numpy().flatten()

valid_text = validation['statement'].to_numpy()
valid_seqs = tok.texts_to_sequences(valid_text)
valid_seqs = tf.keras.preprocessing.sequence.pad_sequences(valid_seqs, padding='post')

valid_labels = validation['veracity'].to_numpy().flatten()



# CONVERT TO TF DATASETS

train_ds = tf.data.Dataset.from_tensor_slices((train_seqs,train_labels))
valid_ds = tf.data.Dataset.from_tensor_slices((valid_seqs,valid_labels))

BATCH_SIZE = 32
BUFFER_SIZE = 7
train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
valid_ds = valid_ds.batch(BATCH_SIZE)

# PREFETCH

train_ds = train_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
valid_ds = valid_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Designing a simple model without validation dataset

In [28]:
embed_size = 128
num_oov_buckets = 1000
vocab_size=10000
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [29]:
history.history

{'loss': [0.6873899698257446,
  0.4861297905445099,
  0.24258652329444885,
  0.17198337614536285,
  0.14731137454509735],
 'accuracy': [0.538415253162384,
  0.7650952935218811,
  0.9031093120574951,
  0.9285857677459717,
  0.9422267079353333]}

In [30]:
test_text = test['statement'].to_numpy()
test_seqs = tok.texts_to_sequences(test_text)
test_seqs = tf.keras.preprocessing.sequence.pad_sequences(test_seqs, padding='post')

test_labels = test['veracity'].to_numpy().flatten()

In [31]:
results = model.evaluate(test_seqs, test_labels, batch_size=32)



# Designing a simple model with validation dataset

In [38]:
embed_size = 128
num_oov_buckets = 1000
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

history = model.fit(
    train_ds,
    epochs=5,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [39]:
results = model.evaluate(test_seqs, test_labels, batch_size=32)



# Fine tuning the new model

In [40]:
#model that had the best results on the validation set
model2 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.Dropout(rate=0.4),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
model2.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

history = model2.fit(
    train_ds,
    epochs=6,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [49]:
#model that had the best results on the validation set
model4 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
model4.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

history = model4.fit(
    train_ds,
    epochs=5,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [45]:
#model that had the best results on the validation set
model3 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
model3.compile(loss="binary_crossentropy", optimizer="nadam", metrics=["accuracy"])

history = model3.fit(
    train_ds,
    epochs=4,
    validation_data=valid_ds, validation_steps=1, steps_per_epoch=BUFFER_SIZE)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [46]:
results2 = model2.evaluate(test_seqs, test_labels, batch_size=32)



In [47]:
results3 = model3.evaluate(test_seqs, test_labels, batch_size=32)



In [50]:
results4 = model4.evaluate(test_seqs, test_labels, batch_size=32)



# Overall poor results. Applying Max norm is in order

In [53]:
embed_size = 128
num_oov_buckets = 1000
vocab_size=10000
model33 = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True, kernel_constraint=keras.constraints.max_norm(1.)),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128, return_sequences=True, kernel_constraint=keras.constraints.max_norm(1.)),
    keras.layers.Dropout(rate=0.2),
    keras.layers.GRU(128),
    keras.layers.Dropout(rate=0.2),
    keras.layers.Dense(1, activation="sigmoid")
])
model33.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model33.fit(train_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1c403930bb0>

In [54]:
restults3 = model33.evaluate(valid_seqs, valid_labels, batch_size=32)



In [55]:
restults3 = model33.evaluate(test_seqs, test_labels, batch_size=32)



# Train the models without the long outliers from the small df

In [57]:
df_united_no_small = pd.concat([df_political,df_7t])

In [58]:
train=df_united_no_small.sample(frac=0.7,random_state=200) #random state is a seed value
df_validation = df_united_no_small.drop(train.index)
validation=df_validation.sample(frac=0.6,random_state=200)
test = df_validation.drop(validation.index)

In [59]:
train.info()
validation.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4277 entries, 3905 to 2922
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   veracity   4277 non-null   int64 
 1   statement  4277 non-null   object
dtypes: int64(1), object(1)
memory usage: 100.2+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 894 entries, 2544 to 1468
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   veracity   894 non-null    int64 
 1   statement  894 non-null    object
dtypes: int64(1), object(1)
memory usage: 21.0+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 561 entries, 100 to 3919
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   veracity   561 non-null    int64 
 1   statement  561 non-null    object
dtypes: int64(1), object(1)
memory usage: 13.1+ KB


In [60]:
train_avg_l = train['statement'].apply(len).mean()
valid_avg_l = validation['statement'].apply(len).mean()
test_avg_l = test['statement'].apply(len).mean()

In [61]:
#compute the average lenght from each dataset to ensure the random distribution of the data
print(train_avg_l)
print(valid_avg_l)
print(test_avg_l)

63.805237315875615
62.93400447427293
58.105169340463455


In [63]:
from tensorflow.keras.preprocessing.text import Tokenizer

train_text = train['statement'].to_numpy()
tok = Tokenizer(oov_token='<unk>')
tok.fit_on_texts(train_text)
tok.word_index['<pad>'] = 0
tok.index_word[0] = '<pad>'

train_seqs = tok.texts_to_sequences(train_text)
train_seqs = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')

train_labels = train['veracity'].to_numpy().flatten()

valid_text = validation['statement'].to_numpy()
valid_seqs = tok.texts_to_sequences(valid_text)
valid_seqs = tf.keras.preprocessing.sequence.pad_sequences(valid_seqs, padding='post')

valid_labels = validation['veracity'].to_numpy().flatten()



# CONVERT TO TF DATASETS

train_ds = tf.data.Dataset.from_tensor_slices((train_seqs,train_labels))
valid_ds = tf.data.Dataset.from_tensor_slices((valid_seqs,valid_labels))

BATCH_SIZE = 32
BUFFER_SIZE = 7
train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
valid_ds = valid_ds.batch(BATCH_SIZE)

# PREFETCH

train_ds = train_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
valid_ds = valid_ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [64]:
embed_size = 128
num_oov_buckets = 1000
vocab_size=10000
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           mask_zero=True, # not shown in the book
                           input_shape=[None]),
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.GRU(128),
    keras.layers.Dense(1, activation="sigmoid")
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(train_ds, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1c381ff1610>

In [65]:
test_text = test['statement'].to_numpy()
test_seqs = tok.texts_to_sequences(test_text)
test_seqs = tf.keras.preprocessing.sequence.pad_sequences(test_seqs, padding='post')

test_labels = test['veracity'].to_numpy().flatten()

In [66]:
results = model.evaluate(test_seqs, test_labels, batch_size=32)

