In [None]:
#!pip install tensorflow

In [None]:
# Set up environment
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer as tf
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import csv
import pickle
import torch
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, regularizers
import matplotlib.pyplot as plt

In [None]:
# Function to load one dataset split from a headerless CSV file and binarise its labels
def load_data(file_path):
    """Read a headerless CSV split and return its statements with binary labels.

    The file is read with ``header=None``, so columns are addressed by their
    0-based position: column 1 is used as the label and column 2 as the
    statement text.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``) of
            the comma-separated file to load.

    Returns:
        A ``(texts, labels)`` tuple of two equal-length lists. ``labels``
        holds 1 for rows labelled ``'false'`` or ``'pants-fire'`` and 0 for
        every other row.
    """
    # Pin the label column to text. With dtype inference, a file whose labels
    # are only "true"/"false" is parsed as booleans, and a boolean never
    # equals the strings compared below, so every row would be labelled 0.
    data = pd.read_csv(file_path, header=None, dtype={1: str})
    # Extract the statement and label
    texts = data[2].tolist()
    # Map labels to binary (1 for false/pants-fire, 0 for other)
    labels = data[1].isin(['false', 'pants-fire']).astype(int).tolist()

    return texts, labels

# Read the three dataset splits from disk (train, then test, then validation)
train_texts, train_labels = load_data('LiarTrain.csv')
test_texts, test_labels = load_data('LiarTest.csv')
valid_texts, valid_labels = load_data('LiarValid.csv')

# Train and validation rows concatenated into single lists.
# NOTE(review): nothing else in this cell reads `texts` / `labels`; the
# features below are built from the train split only. Confirm that is intended.
texts = [*train_texts, *valid_texts]
labels = [*train_labels, *valid_labels]

# The files already provide the train/test split, so simply alias the loaded
# lists to the names the model cells use
x_train, y_train = train_texts, train_labels
x_test, y_test = test_texts, test_labels

# TF-IDF over unigrams and bigrams with English stop words removed. Terms must
# appear in at least 2 documents and in at most half of them. The vocabulary is
# learned from the training statements and reused as-is for the test statements.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.5,
    stop_words='english',
)
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

In [None]:
# Multinomial Naive Bayes baseline trained on the TF-IDF features
print("Training Naive Bayes classifier...")
clf_nb = MultinomialNB().fit(x_train_tfidf, y_train)
nb_pred = clf_nb.predict(x_test_tfidf)

# Fraction of test statements whose predicted label equals the true label
print("Naive Bayes Accuracy:", accuracy_score(y_true=y_test, y_pred=nb_pred))

Training Naive Bayes classifier...
Naive Bayes Accuracy: 0.7308602999210734


In [None]:
# Decision tree baseline on the same TF-IDF features (fixed seed for repeatable trees)
print("Training Decision Tree classifier...")
clf_dt = DecisionTreeClassifier(random_state=42).fit(x_train_tfidf, y_train)
dt_pred = clf_dt.predict(x_test_tfidf)

# Fraction of test statements whose predicted label equals the true label
print("Decision Tree Accuracy:", accuracy_score(y_true=y_test, y_pred=dt_pred))

Training Decision Tree classifier...
Decision Tree Accuracy: 0.6400947119179163


In [None]:
def build_smha_cnn(vocab_size, embedding_dim, max_len=100):
    """Build and compile the CNN + self multi-head attention binary classifier.

    Layer order: token ids -> Embedding -> Conv1D -> MaxPooling1D -> Dropout
    -> MultiHeadAttention (self-attention) -> GlobalMaxPooling1D -> Dense(64)
    -> Dense(1, sigmoid).

    Args:
        vocab_size: Number of distinct token ids the embedding layer accepts.
        embedding_dim: Width of each token embedding; also passed as the
            per-head ``key_dim`` of the attention layer.
        max_len: Length of the integer input sequences. Defaults to 100,
            the value that used to be hard-coded, so existing two-argument
            calls behave exactly as before. Must match the ``maxlen`` used
            when padding the sequences fed to the model.

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer, binary
        cross-entropy loss and an accuracy metric.
    """
    # Input layer: one fixed-length sequence of integer token ids per example
    inputs = tf.keras.Input(shape=(max_len,), dtype=tf.int32)

    # Embedding layer
    x = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)

    # CNN layer: L2-regularised 1-D convolution, then down-sampling and dropout
    x = layers.Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
    x = layers.MaxPooling1D(pool_size=2)(x)
    x = layers.Dropout(0.5)(x)

    # Self multi-head attention layer
    attention = layers.MultiHeadAttention(num_heads=8, key_dim=embedding_dim)

    # Self-attention: the same tensor is passed as both query and value
    # (no reshaping happens here)
    x = attention(x, x)

    # Global max pooling collapses the sequence axis
    x = layers.GlobalMaxPooling1D()(x)

    # Fully connected layer
    x = layers.Dense(64, activation='relu')(x)

    # Output layer: a single sigmoid unit for the binary label
    outputs = layers.Dense(1, activation='sigmoid')(x)

    # Build and compile the model
    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Word-level tokenizer for the SMHA-CNN, fitted on the training statements only
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(x_train)

# Embedding table size: one row per known word plus one extra row
# (presumably for the padding id 0; confirm against the Keras Tokenizer docs)
vocab_size = 1 + len(tokenizer.word_index)

# Encode each split as integer sequences, bring every sequence to length 100,
# and hand the result to TensorFlow as an int32 tensor
x_train_seq = tf.convert_to_tensor(
    tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(x_train), maxlen=100
    ),
    dtype=tf.int32,
)
x_test_seq = tf.convert_to_tensor(
    tf.keras.preprocessing.sequence.pad_sequences(
        tokenizer.texts_to_sequences(x_test), maxlen=100
    ),
    dtype=tf.int32,
)

# Labels as int32 tensors (this rebinds y_train / y_test in place)
y_train = tf.convert_to_tensor(y_train, dtype=tf.int32)
y_test = tf.convert_to_tensor(y_test, dtype=tf.int32)

# Build the network with 128-dimensional embeddings and train it; the test
# split is also what gets reported as validation data during fitting
smha_cnn = build_smha_cnn(vocab_size, 128)
print("Training SMHA-CNN model...")
history = smha_cnn.fit(
    x_train_seq,
    y_train,
    epochs=1,
    batch_size=32,
    validation_data=(x_test_seq, y_test),
)
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions
cnn_pred = (smha_cnn.predict(x_test_seq) > 0.5).astype("int32").flatten()


Training SMHA-CNN model...
Epoch 1/2
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 348ms/step - accuracy: 0.7166 - loss: 0.8901 - val_accuracy: 0.7309 - val_loss: 0.5828
Epoch 2/2
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 354ms/step - accuracy: 0.7216 - loss: 0.5918 - val_accuracy: 0.7309 - val_loss: 0.5773
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 84ms/step


In [None]:
# Load one dataset split from a headerless CSV file and binarise its labels
def load_data(file_path):
    """Read a headerless CSV split and return its statements with binary labels.

    Columns are addressed by 0-based position because the file is read with
    ``header=None``: column 1 is the label, column 2 the statement text.

    Args:
        file_path: Path (or any other source accepted by ``pd.read_csv``) of
            the comma-separated file to load.

    Returns:
        ``(texts, labels)``: two equal-length lists, where ``labels`` is 1
        for rows labelled ``'false'`` or ``'pants-fire'`` and 0 otherwise.
    """
    # Read the label column as text. Otherwise a file containing only
    # "true"/"false" labels is inferred as boolean, and booleans never equal
    # the label strings below, so every row would be mapped to 0.
    data = pd.read_csv(file_path, header=None, dtype={1: str})
    texts = data[2].tolist()
    labels = data[1].isin(['false', 'pants-fire']).astype(int).tolist()
    return texts, labels

# Read the three dataset splits again (train, then test, then validation)
train_texts, train_labels = load_data('LiarTrain.csv')
test_texts, test_labels = load_data('LiarTest.csv')
valid_texts, valid_labels = load_data('LiarValid.csv')

# Train and validation rows concatenated into single lists
texts = [*train_texts, *valid_texts]
labels = [*train_labels, *valid_labels]

# Pre-trained WordPiece tokenizer matching the 'bert-base-uncased' checkpoint
# (rebinds the name `tokenizer`)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts):
    """Tokenize a batch of strings with the module-level ``tokenizer``.

    Sequences are truncated to at most 100 tokens, padded, and returned as
    PyTorch tensors (``return_tensors='pt'``).

    Args:
        texts: The strings to encode.

    Returns:
        ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    batch = tokenizer(
        texts,
        max_length=100,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    return input_ids, attention_mask

# Token ids and attention masks for the train and test statements
x_train_ids, x_train_mask = encode_texts(texts=train_texts)
x_test_ids, x_test_mask = encode_texts(texts=test_texts)

# Pre-trained BERT encoder, used below only as a fixed feature extractor
bert_model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(input_ids, attention_mask, batch_size=32, model=None):
    """Return the first-token hidden state of every sequence as a NumPy array.

    The encoder is run without gradient tracking, in mini-batches, so memory
    use is bounded by ``batch_size`` rather than by the size of the whole
    split (a single forward pass over all rows can exhaust memory).

    Args:
        input_ids: 2-D tensor of token ids, one row per sequence.
        attention_mask: Tensor with the same leading dimension as
            ``input_ids``, forwarded to the encoder as ``attention_mask``.
        batch_size: Number of rows per forward pass. Defaults to 32.
        model: Encoder to call. Defaults to the module-level ``bert_model``.
            Its output must expose ``last_hidden_state``.

    Returns:
        A NumPy array with one row per input sequence, holding the hidden
        state at sequence position 0 (the ``[CLS]`` position when the ids
        come from a BERT tokenizer).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    encoder = bert_model if model is None else model
    num_rows = input_ids.shape[0]
    chunks = []
    with torch.no_grad():
        # max(num_rows, 1) keeps a single (empty) forward pass for empty
        # input, so the result still has the encoder's output width.
        for start in range(0, max(num_rows, 1), batch_size):
            stop = start + batch_size
            outputs = encoder(
                input_ids[start:stop],
                attention_mask=attention_mask[start:stop],
            )
            # Use the first-token representation as the sentence embedding
            chunks.append(outputs.last_hidden_state[:, 0, :])
    # .cpu() is a no-op on CPU tensors and makes .numpy() valid for GPU ones
    return torch.cat(chunks, dim=0).cpu().numpy()

# Extract one BERT sentence embedding per statement and convert it straight
# to a float32 TensorFlow tensor for the Keras model
x_train_embeddings = tf.convert_to_tensor(
    get_bert_embeddings(x_train_ids, x_train_mask),
    dtype=tf.float32,
)
x_test_embeddings = tf.convert_to_tensor(
    get_bert_embeddings(x_test_ids, x_test_mask),
    dtype=tf.float32,
)

# Binary labels as int32 tensors, taken from the freshly loaded label lists
y_train = tf.convert_to_tensor(train_labels, dtype=tf.int32)
y_test = tf.convert_to_tensor(test_labels, dtype=tf.int32)

# SMHA-CNN variant that consumes one fixed-size BERT embedding per example
def build_smha_cnn(embedding_dim):
    """Build and compile the attention classifier that runs on BERT embeddings.

    Layer order: embedding vector -> Dense(128) -> Dropout -> length-1
    sequence -> MultiHeadAttention (self-attention) -> GlobalMaxPooling1D
    -> Dense(64) -> Dense(1, sigmoid). Despite the name, this variant has no
    convolutional layer.

    Args:
        embedding_dim: Size of each input embedding vector; also passed as
            the per-head ``key_dim`` of the attention layer.

    Returns:
        A ``tf.keras.Model`` compiled with the Adam optimizer, binary
        cross-entropy loss and an accuracy metric.
    """
    hidden_units = 128

    inputs = tf.keras.Input(shape=(embedding_dim,))
    x = layers.Dense(hidden_units, activation='relu')(inputs)
    x = layers.Dropout(0.5)(x)
    attention = layers.MultiHeadAttention(num_heads=8, key_dim=embedding_dim)
    # Add the sequence axis the attention layer expects: (batch, 1, hidden).
    # A Keras layer is used instead of tf.expand_dims because Keras 3 refuses
    # symbolic Keras tensors as inputs to raw TensorFlow functions.
    x = layers.Reshape((1, hidden_units))(x)
    # Self-attention over a sequence of length 1 (same tensor as query and value)
    x = attention(x, x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dense(64, activation='relu')(x)
    outputs = layers.Dense(1, activation='sigmoid')(x)
    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build the model for the width of the extracted embeddings and train it;
# the test split is also what gets reported as validation data during fitting
smha_cnn = build_smha_cnn(embedding_dim=x_train_embeddings.shape[1])
print("Training SMHA-CNN model with BERT embeddings...")
history = smha_cnn.fit(
    x_train_embeddings,
    y_train,
    epochs=2,
    batch_size=32,
    validation_data=(x_test_embeddings, y_test),
)

# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions for the test split
cnn_pred = (smha_cnn.predict(x_test_embeddings) > 0.5).astype("int32").flatten()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]