In [None]:
#!pip install tensorflow

In [None]:
# Set up environment
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer as tf
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import csv
import pickle
import torch
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers, regularizers
import matplotlib.pyplot as plt

In [None]:
# Accumulator for the final comparison table: each model cell below appends one
# [model name, accuracy, precision, recall, F1] row to this list.
# Re-running this cell empties the list; re-running a model cell appends its row again.
results = []

In [None]:
# Loader for one headerless dataset file (read with pandas' default comma separator)
def load_data(file_path):
    """Read a headerless delimited file and return ``(texts, labels)``.

    Column 2 of the file supplies the statement texts. Column 1 supplies the
    raw label string, which is collapsed to a binary target: 'false' and
    'pants-fire' map to 1, every other value maps to 0.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the file.

    Returns:
        A tuple ``(texts, labels)`` of two plain Python lists of equal length.
    """
    frame = pd.read_csv(file_path, header=None)

    # Statements live in the third column.
    statements = frame[2].tolist()

    # Binarize the second column: 1 for the two "fake" classes, 0 otherwise.
    fake_classes = ('false', 'pants-fire')
    binary_labels = [1 if raw_label in fake_classes else 0 for raw_label in frame[1]]

    return statements, binary_labels

# Load the predefined training, test, and validation splits from their CSV files
train_texts, train_labels = load_data('LiarTrain.csv')
test_texts, test_labels = load_data('LiarTest.csv')
valid_texts, valid_labels = load_data('LiarValid.csv')

# Concatenate the train and validation splits.
# NOTE(review): `texts` / `labels` are not referenced again in the visible notebook;
# the models below are fit on the training split only — confirm which was intended.
texts = train_texts + valid_texts
labels = train_labels + valid_labels

# No re-splitting happens here: the predefined train/test splits are simply bound
# to the x_/y_ names that the model cells use.
x_train, x_test, y_train, y_test = train_texts, test_texts, train_labels, test_labels

# Vectorize texts using TF-IDF over unigrams and bigrams.
# The vectorizer is fit on the training texts only and then reused to transform the test texts.
tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2), stop_words='english')
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

Naive Bayes

In [None]:
# Naive Bayes Model: fit on the TF-IDF training matrix, predict the test matrix
clf_nb = MultinomialNB()
print("Training Naive Bayes classifier...")
clf_nb.fit(x_train_tfidf, y_train)
nb_pred = clf_nb.predict(x_test_tfidf)

# Score the positive class (label 1) with scikit-learn's default binary averaging.
# The SMHA-CNN, Logistic Regression, XGBoost and CNN cells all use that default, so
# using average="weighted" here made this row of the results table incomparable.
# zero_division=0 keeps the score at 0.0 (without a warning) if no positives are predicted.
nb_accuracy = accuracy_score(y_test, nb_pred)
nb_precision = precision_score(y_test, nb_pred, zero_division=0)
nb_recall = recall_score(y_test, nb_pred, zero_division=0)
nb_f1 = f1_score(y_test, nb_pred, zero_division=0)

# Row layout: [model name, accuracy, precision, recall, F1]
results.append(["Naive Bayes", nb_accuracy, nb_precision, nb_recall, nb_f1])

Training Naive Bayes classifier...


Decision Tree

In [None]:
# Decision Tree Model: fixed random_state so the fitted tree is reproducible
clf_dt = DecisionTreeClassifier(random_state=42)
print("Training Decision Tree classifier...")
clf_dt.fit(x_train_tfidf, y_train)
dt_pred = clf_dt.predict(x_test_tfidf)

# Score the positive class (label 1) with scikit-learn's default binary averaging.
# The SMHA-CNN, Logistic Regression, XGBoost and CNN cells all use that default, so
# using average="weighted" here made this row of the results table incomparable.
# zero_division=0 keeps the score at 0.0 (without a warning) if no positives are predicted.
dt_accuracy = accuracy_score(y_test, dt_pred)
dt_precision = precision_score(y_test, dt_pred, zero_division=0)
dt_recall = recall_score(y_test, dt_pred, zero_division=0)
dt_f1 = f1_score(y_test, dt_pred, zero_division=0)

# Row layout: [model name, accuracy, precision, recall, F1]
results.append(["Decision Tree", dt_accuracy, dt_precision, dt_recall, dt_f1])

Training Decision Tree classifier...


SMHA-CNN

In [None]:
def build_smha_cnn(vocab_size, embedding_dim, input_length=100):
    """Build and compile the SMHA-CNN binary text classifier.

    Layer stack: token ids -> Embedding -> Conv1D(128 filters, kernel 5, L2 0.01)
    -> MaxPooling1D(2) -> Dropout(0.5) -> multi-head self-attention (8 heads)
    -> GlobalMaxPooling1D -> Dense(64, relu) -> Dense(1, sigmoid).

    Args:
        vocab_size: Number of rows in the embedding table (largest token id + 1).
        embedding_dim: Width of each embedding vector; also used as the
            per-head ``key_dim`` of the attention layer.
        input_length: Length of the padded integer sequences fed to the model.
            Defaults to 100, the ``maxlen`` used when padding in this notebook.

    Returns:
        A compiled ``tf.keras.Model`` (optimizer 'adam', loss
        'binary_crossentropy', metric 'accuracy').
    """
    # Input layer: a fixed-length sequence of integer token ids
    inputs = tf.keras.Input(shape=(input_length,), dtype=tf.int32)

    # Embedding layer
    x = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)

    # CNN layer with L2 weight penalty, followed by down-sampling and dropout
    x = layers.Conv1D(filters=128, kernel_size=5, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
    x = layers.MaxPooling1D(pool_size=2)(x)
    x = layers.Dropout(0.5)(x)

    # Self-Multi-Head Attention Layer
    attention = layers.MultiHeadAttention(num_heads=8, key_dim=embedding_dim)

    # Self-attention: the same tensor is passed as both query and value
    # (no reshaping is needed; the Conv1D output is already a sequence of vectors)
    x = attention(x, x)

    # Global Max Pooling collapses the sequence axis to one vector per sample
    x = layers.GlobalMaxPooling1D()(x)

    # Fully connected layer
    x = layers.Dense(64, activation='relu')(x)

    # Output layer: one sigmoid unit for the binary target
    outputs = layers.Dense(1, activation='sigmoid')(x)

    # Build and compile the model
    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Tokenize texts for SMHA-CNN (the vocabulary is fit on the training texts only)
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(x_train)
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)
x_valid_seq = tokenizer.texts_to_sequences(valid_texts)
x_train_seq = tf.keras.preprocessing.sequence.pad_sequences(x_train_seq, maxlen=100)
x_test_seq = tf.keras.preprocessing.sequence.pad_sequences(x_test_seq, maxlen=100)
x_valid_seq = tf.keras.preprocessing.sequence.pad_sequences(x_valid_seq, maxlen=100)

# Get the vocabulary size for embedding layer (+1 because word indices start at 1)
vocab_size = len(tokenizer.word_index) + 1

# Convert numpy arrays to tensors before passing to the model
x_train_seq = tf.convert_to_tensor(x_train_seq, dtype=tf.int32)
x_test_seq = tf.convert_to_tensor(x_test_seq, dtype=tf.int32)
x_valid_seq = tf.convert_to_tensor(x_valid_seq, dtype=tf.int32)

# Label tensors get their own names: rebinding the notebook-level y_train / y_test
# here would silently change what the scikit-learn / XGBoost cells receive.
y_train_tensor = tf.convert_to_tensor(y_train, dtype=tf.int32)
y_valid_tensor = tf.convert_to_tensor(valid_labels, dtype=tf.int32)

# Seed Python, NumPy and TensorFlow so weight init, shuffling and dropout are repeatable
tf.keras.utils.set_random_seed(42)

# Train the SMHA-CNN model.
# Per-epoch validation uses the held-out validation split; the test split is
# touched only once, for the final scores below.
print("Training SMHA-CNN model...")
smha_cnn = build_smha_cnn(vocab_size, 128)
history = smha_cnn.fit(x_train_seq, y_train_tensor, epochs=4, batch_size=32, validation_data=(x_valid_seq, y_valid_tensor))

# Threshold the sigmoid outputs at 0.5 and flatten (n, 1) -> (n,) for scikit-learn
cnn_pred = (smha_cnn.predict(x_test_seq) > 0.5).astype("int32").flatten()

# Positive-class (label 1) metrics on the test split
y_test_np = np.asarray(y_test)
cnn_accuracy = accuracy_score(y_test_np, cnn_pred)
cnn_precision = precision_score(y_test_np, cnn_pred, zero_division=0)
cnn_recall = recall_score(y_test_np, cnn_pred, zero_division=0)
cnn_f1 = f1_score(y_test_np, cnn_pred, zero_division=0)

# Row layout: [model name, accuracy, precision, recall, F1]
results.append(["SMHA-CNN", cnn_accuracy, cnn_precision, cnn_recall, cnn_f1])

Training SMHA-CNN model...
Epoch 1/4
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 375ms/step - accuracy: 0.7136 - loss: 0.9071 - val_accuracy: 0.7309 - val_loss: 0.5710
Epoch 2/4
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 378ms/step - accuracy: 0.7236 - loss: 0.5674 - val_accuracy: 0.7364 - val_loss: 0.5789
Epoch 3/4
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 377ms/step - accuracy: 0.7732 - loss: 0.4796 - val_accuracy: 0.6922 - val_loss: 0.6174
Epoch 4/4
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 385ms/step - accuracy: 0.8248 - loss: 0.4113 - val_accuracy: 0.6772 - val_loss: 0.6378
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 94ms/step


Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# y_train / y_test are notebook-level names that other cells may rebind (for
# example to tensors); coerce them to plain NumPy arrays so this cell does not
# depend on which cell ran last.
y_train_np = np.asarray(y_train)
y_test_np = np.asarray(y_test)

# Train the Logistic Regression model on the TF-IDF features
clf_lr = LogisticRegression(max_iter=1000)  # Increase max_iter if the model doesn't converge
print("Training Logistic Regression model...")
clf_lr.fit(x_train_tfidf, y_train_np)

# Predict on the test set
lr_pred = clf_lr.predict(x_test_tfidf)

# Evaluate the model: positive-class (label 1) precision / recall / F1.
# zero_division=0 keeps the score at 0.0 (without a warning) if no positives are predicted.
log_accuracy = accuracy_score(y_test_np, lr_pred)
log_precision = precision_score(y_test_np, lr_pred, zero_division=0)
log_recall = recall_score(y_test_np, lr_pred, zero_division=0)
log_f1 = f1_score(y_test_np, lr_pred, zero_division=0)

# Row layout: [model name, accuracy, precision, recall, F1]
results.append(["Logistic Reg", log_accuracy, log_precision, log_recall, log_f1])

Training Logistic Regression model...


XGBoost

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# y_train / y_test are notebook-level names that other cells may rebind (for
# example to tensors); coerce them to plain NumPy arrays so this cell does not
# depend on which cell ran last.
y_train_np = np.asarray(y_train)
y_test_np = np.asarray(y_test)

# Train the XGBoost classifier.
# `use_label_encoder` is no longer passed: the installed XGBoost reports it as
# "not used", so it only produced a warning.
clf_xgb = xgb.XGBClassifier(eval_metric='logloss')
clf_xgb.fit(x_train_tfidf, y_train_np)

# Predict on the test set
xgb_pred = clf_xgb.predict(x_test_tfidf)

# Evaluate the model: positive-class (label 1) precision / recall / F1.
# zero_division=0 keeps the score at 0.0 (without a warning) if no positives are predicted.
XGBoost_Accuracy = accuracy_score(y_test_np, xgb_pred)
XGBoost_Precision = precision_score(y_test_np, xgb_pred, zero_division=0)
XGBoost_Recall = recall_score(y_test_np, xgb_pred, zero_division=0)
XGBoost_f1 = f1_score(y_test_np, xgb_pred, zero_division=0)

# Row layout: [model name, accuracy, precision, recall, F1]
results.append(["XGBoost", XGBoost_Accuracy, XGBoost_Precision, XGBoost_Recall, XGBoost_f1])

Parameters: { "use_label_encoder" } are not used.



CNN

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Tokenizer to convert text to sequences of integers.
# It is fit on the training texts only. This repeats the tokenize-and-pad steps of
# the SMHA-CNN cell and rebinds the same notebook-level names
# (tokenizer, x_train_seq, x_test_seq).
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(x_train)

# Convert text to sequences
x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

# Pad the sequences to a fixed length (maxlen=100, the default input_length of build_cnn)
x_train_seq = tf.keras.preprocessing.sequence.pad_sequences(x_train_seq, maxlen=100)
x_test_seq = tf.keras.preprocessing.sequence.pad_sequences(x_test_seq, maxlen=100)

# Convert labels to numpy arrays.
# NOTE: this rebinds the notebook-level y_train / y_test, so any cell run after
# this one sees NumPy arrays rather than the objects bound earlier.
y_train = np.array(y_train)
y_test = np.array(y_test)

def build_cnn(vocab_size, embedding_dim, input_length=100):
    """Build and compile a 1D-CNN binary text classifier.

    Layer stack: token ids -> Embedding -> [Conv1D(128, kernel 5, relu) ->
    MaxPooling1D(5)] x 2 -> Flatten -> Dense(64, relu) -> Dense(1, sigmoid).

    Args:
        vocab_size: Number of rows in the embedding table (largest token id + 1).
        embedding_dim: Width of each embedding vector.
        input_length: Length of the padded integer sequences fed to the model.
            It must be long enough to survive both Conv1D/MaxPooling1D stages.

    Returns:
        A compiled ``Sequential`` model (optimizer 'adam', loss
        'binary_crossentropy', metric 'accuracy').
    """
    model = models.Sequential()

    # Declare the input shape with an explicit Input layer. Passing `input_length`
    # to Embedding is deprecated (Keras 3 warns and ignores it), and Flatten/Dense
    # below need a known sequence length.
    model.add(layers.Input(shape=(input_length,), dtype="int32"))

    # Embedding layer
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim))

    # Add 1D Convolutional Layers, each followed by max pooling
    model.add(layers.Conv1D(128, 5, activation='relu'))
    model.add(layers.MaxPooling1D(5))

    model.add(layers.Conv1D(128, 5, activation='relu'))
    model.add(layers.MaxPooling1D(5))

    # Flatten the output to connect it to a fully connected layer
    model.add(layers.Flatten())

    # Fully connected layer
    model.add(layers.Dense(64, activation='relu'))

    # Output layer: one sigmoid unit for the binary target
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Get the total number of unique words (+1 because word indices start at 1)
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128

# Seed Python, NumPy and TensorFlow so weight init and shuffling are repeatable
tf.keras.utils.set_random_seed(42)

# Build the model
cnn_model = build_cnn(vocab_size, embedding_dim)

# Encode the held-out validation split with the same tokenizer and padding as the
# training data, so per-epoch validation does not look at the test split.
x_valid_seq = tf.keras.preprocessing.sequence.pad_sequences(tokenizer.texts_to_sequences(valid_texts), maxlen=100)
y_valid = np.array(valid_labels)

# Train the model
cnn_model.fit(x_train_seq, y_train, epochs=2, batch_size=32, validation_data=(x_valid_seq, y_valid))

# Predict the test data: threshold the sigmoid outputs at 0.5 and flatten
# (n, 1) -> (n,) so the predictions have the same shape as the labels
# (the SMHA-CNN cell does the same).
cnn_pred = (cnn_model.predict(x_test_seq) > 0.5).astype("int32").flatten()

# Evaluate model performance: positive-class (label 1) precision / recall / F1.
# zero_division=0 keeps the score at 0.0 (without a warning) if no positives are predicted.
CNN_Accuracy = accuracy_score(y_test, cnn_pred)
CNN_Precision = precision_score(y_test, cnn_pred, zero_division=0)
CNN_Recall = recall_score(y_test, cnn_pred, zero_division=0)
CNN_f1 = f1_score(y_test, cnn_pred, zero_division=0)

# Row layout: [model name, accuracy, precision, recall, F1]
results.append(["CNN", CNN_Accuracy, CNN_Precision, CNN_Recall, CNN_f1])


Epoch 1/2




[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 89ms/step - accuracy: 0.7129 - loss: 0.5927 - val_accuracy: 0.7309 - val_loss: 0.5613
Epoch 2/2
[1m320/320[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 88ms/step - accuracy: 0.7548 - loss: 0.4887 - val_accuracy: 0.7167 - val_loss: 0.5962
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step


In [None]:
# Turn the accumulated per-model rows into a table and print it under a heading
metric_columns = ["Model", "Accuracy", "Precision", "Recall", "F1-Score"]
results_df = pd.DataFrame(results, columns=metric_columns)
print("\nResults Table:", results_df, sep="\n")


Results Table:
           Model  Accuracy  Precision    Recall  F1-Score
0    Naive Bayes  0.730860   0.670612  0.730860  0.627099
1  Decision Tree  0.640095   0.641426  0.640095  0.640754
2       SMHA-CNN  0.677190   0.357143  0.249267  0.293610
3   Logistic Reg  0.731650   0.512195  0.061584  0.109948
4        XGBoost  0.726914   0.460317  0.085044  0.143564
5            CNN  0.716654   0.456311  0.275660  0.343693
