In [None]:
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dropout, Dense, Bidirectional
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

In [None]:
# Read the baseline dataset CSV into the working DataFrame
DATA_PATH = '/kaggle/input/baseline_dataset.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Explore the balance of classes.
# The label column is passed explicitly as `x`: in current seaborn releases the
# first positional parameter of countplot is `data`, so a bare positional
# Series is not interpreted as the variable to count.
ax = sns.countplot(x=df['label'])
ax.set(title='Class balance', xlabel='label', ylabel='count')
plt.show()

In [None]:
# Keep only the rows whose category column equals 1.
# .copy() makes the subset an independent frame, so adding the text_length
# column below does not raise pandas' SettingWithCopyWarning.
df = df[df['category'] == 1].copy()

# Examine dataset
print(df.head())
df.info()  # info() prints its own report and returns None, so no print() wrapper

# Print all column names in the DataFrame
print(df.columns)

# Explore text length distribution (len() of each claim)
df['text_length'] = df['claim'].apply(len)
plt.figure(figsize=(10, 6))
sns.histplot(df['text_length'], bins=40, kde=True)
plt.show()

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the 'punkt' resource required for tokenization
nltk.download('punkt')

# Split every claim into a list of tokens
tokens = df['claim'].apply(word_tokenize)

# Merge the per-claim token lists into one set of distinct tokens
unique_words = set().union(*tokens)

# The vocabulary size is the number of distinct tokens
vocab_size = len(unique_words)

print(f"The vocabulary size of the dataset is: {vocab_size}")

In [None]:
# Model inputs are the claim texts; targets are the labels
x, y = df['claim'], df['label']

In [None]:
# Shuffle and hold out 20% of the claims for testing
train_text, test_text, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# TF-IDF over unigrams and bigrams, capped at 5000 features.
# The vectorizer is fitted on the training text only and then reused on the test text.
my_tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
x_train = my_tfidf.fit_transform(train_text).toarray()
x_test = my_tfidf.transform(test_text).toarray()

In [None]:
# Encoding parameters
voc_size = 24194   # Category 1 vocabulary size
sent_length = 500  # Category 1 max sentence length

# Encode every claim as a list of integer word indices below voc_size
onehot_repr = list(map(lambda claim: one_hot(claim, voc_size), df['claim']))

# Left-pad every encoded claim to sent_length entries
embedded_docs = pad_sequences(onehot_repr, maxlen=sent_length, padding='pre')

In [None]:
# Plotting training & validation loss and accuracy
def plot_metrics(history):
    """Draw loss and accuracy curves side by side for one training run.

    Args:
        history: Object whose ``history`` attribute maps 'loss', 'val_loss',
            'accuracy' and 'val_accuracy' to per-epoch value sequences.
    """
    curves = history.history
    panels = (('loss', 'Loss'), ('accuracy', 'Accuracy'))

    plt.figure(figsize=(12, 5))
    for position, (key, label) in enumerate(panels, start=1):
        # One panel per metric: training curve first, then validation curve
        plt.subplot(1, 2, position)
        plt.plot(curves[key], label=f'Training {label}')
        plt.plot(curves[f'val_{key}'], label=f'Validation {label}')
        plt.title(f'{label} Over Epochs')
        plt.legend()

    plt.show()

In [None]:
# Confusion matrix and reports
def plot_confusion_matrix(cm, title='Confusion Matrix'):
    """Show a confusion matrix as an annotated blue heatmap.

    Args:
        cm: Matrix of integer counts; each cell is annotated with the 'd' format.
        title: Text displayed above the heatmap.
    """
    ax = sns.heatmap(cm, cmap='Blues', annot=True, fmt='d')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title(title)
    plt.show()

In [None]:
# Train three recurrent classifiers (LSTM, bidirectional LSTM, stacked GRU) on
# the padded index sequences with identical settings, then compare them on the
# held-out test split. All imports used here live in the notebook's import cell.

embedding_vector_features = 40  # output dimension of the Embedding layer
EPOCHS = 10
BATCH_SIZE = 64
# Fraction of the training split that Keras sets aside for validation. Early
# stopping monitors this slice, so the test split is never used to pick
# weights and the scores reported at the end are measured on unseen data.
VALIDATION_SPLIT = 0.2

X_final = np.array(embedded_docs)
y_final = np.array(df['label'])

# Train/test split shared by all three models
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)


def _compile_binary_classifier(model):
    """Compile `model` for binary classification, print its summary and return it."""
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])
    model.summary()  # summary() prints the table itself; wrapping it in print() adds a stray "None"
    return model


def _train(model, run_name):
    """Fit `model` on the training split and plot its learning curves.

    Args:
        model: Compiled Keras model.
        run_name: Short tag appended to the TensorBoard log directory so the
            runs of different models can be told apart.

    Returns:
        The History object returned by ``model.fit``.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join("logs", "fit", f"{timestamp}-{run_name}")
    # Fresh callbacks per run: early stopping was configured in the original
    # cell but never handed to fit(), so it had no effect.
    callbacks = [
        EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True),
        TensorBoard(log_dir=log_dir, histogram_freq=1),
    ]
    run_history = model.fit(
        X_train,
        y_train,
        validation_split=VALIDATION_SPLIT,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=callbacks,
    )
    plot_metrics(run_history)
    return run_history


def _evaluate(model, label):
    """Score `model` on the test split; plot and print its report.

    Args:
        model: Trained Keras model with a single sigmoid output.
        label: Human-readable model name used in the plot title and printouts.

    Returns:
        Tuple ``(predictions, matrix)``: the 0/1 predictions thresholded at
        0.5 and the confusion matrix computed against ``y_test``.
    """
    predictions = (model.predict(X_test) > 0.5).astype("int32")
    matrix = confusion_matrix(y_test, predictions)
    plot_confusion_matrix(matrix, f'{label} Confusion Matrix')
    print(f"{label} Accuracy Score:", accuracy_score(y_test, predictions))
    print(f"{label} Classification Report:")
    print(classification_report(y_test, predictions))
    return predictions, matrix


# Model 1: single LSTM layer with 100 units
model = _compile_binary_classifier(Sequential([
    Embedding(voc_size, embedding_vector_features),
    Dropout(0.5),
    LSTM(100),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
]))
history = _train(model, "lstm")

# Model 2: bidirectional LSTM
model1 = _compile_binary_classifier(Sequential([
    Embedding(voc_size, embedding_vector_features),
    Dropout(0.5),
    Bidirectional(LSTM(100)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
]))
history1 = _train(model1, "bilstm")

# Model 3: two stacked GRU layers followed by a dense head
model_gru = _compile_binary_classifier(Sequential([
    Embedding(input_dim=voc_size, output_dim=embedding_vector_features),
    GRU(100, return_sequences=True),  # return the full sequence so the next GRU layer can consume it
    Dropout(0.5),
    GRU(64),  # last recurrent layer
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
]))
# _train plots this run's own history (the original cell re-plotted history1 here)
history_gru = _train(model_gru, "gru")

# Evaluate and compare all three models on the test data
y_pred, cm = _evaluate(model, 'Model 1 (LSTM)')
y_pred1, cm1 = _evaluate(model1, 'Model 2 (Bidirectional LSTM)')
y_pred_gru, cm_gru = _evaluate(model_gru, 'Model 3 (GRU)')