In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense, Bidirectional
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Load dataset
df = pd.read_csv('/kaggle/input/baseline_dataset.csv')

# Examine dataset: first rows, then the per-column dtype / non-null report
print(df.head())
# DataFrame.info() writes its report to stdout and returns None, so it must not
# be wrapped in print() (that only appends a stray "None" line to the output).
df.info()

# Print all column names in the DataFrame
print(df.columns)

In [None]:
# Store the two integer-coded columns as unsigned 8-bit values to save memory
for compact_col in ('label', 'category'):
    df[compact_col] = df[compact_col].astype(np.uint8)

In [None]:
# Explore the balance of classes
# The column is passed by keyword: on current seaborn releases the first
# positional argument of countplot is `data`, not `x`, so a bare Series would
# not be counted per class.
label_ax = sns.countplot(x=df['label'])
label_ax.set(title='Class balance', xlabel='label', ylabel='count')
plt.show()

In [None]:
# Distribution of claim lengths, measured with len() on each claim
df['text_length'] = df['claim'].map(len)
_, length_ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['text_length'], bins=40, kde=True, ax=length_ax)
plt.show()

In [None]:
# Model input is the raw claim text; the target is the label column
x, y = df['claim'], df['label']

In [None]:
# Hold out 20% of the data for testing (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, shuffle=True)

# The model starts with an Embedding layer, which looks up *integer token ids*.
# A dense TF-IDF matrix (floats in [0, 1]) is the wrong input for it: the
# weights would be truncated to ids that are almost all 0 and every sample would
# become a 5000-step sequence. Encode each claim as a padded sequence of word
# indices instead.
VOCAB_SIZE = 5000  # matches the Embedding layer's input_dim (ids 0..4999, 0 = padding)

# Fit the vocabulary on the training text only so nothing leaks from the test set.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(x_train)
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

# Cover 95% of the training claims without padding everything to the longest outlier.
max_len = max(1, int(np.percentile([len(seq) for seq in train_sequences], 95)))

x_train = pad_sequences(train_sequences, maxlen=max_len, padding='post', truncating='post')
x_test = pad_sequences(test_sequences, maxlen=max_len, padding='post', truncating='post')

In [None]:
# learning rate scheduler to adjust the learning rate during training.
from keras.callbacks import LearningRateScheduler
import keras.backend as K

# Learning rate scheduler
def scheduler(epoch, lr):
    """Return the learning rate to use for the given epoch.

    Args:
        epoch: Zero-based epoch index supplied by the callback.
        lr: Learning rate currently in effect.

    Returns:
        ``lr`` unchanged for epochs 0 and 1, otherwise ``lr`` reduced by 5%.
    """
    decay_applies = epoch > 1
    if decay_applies:
        return lr * 0.95
    return lr

# Stop when validation accuracy has not improved for 3 epochs and roll the
# model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)
# Per-epoch learning-rate adjustment driven by scheduler()
callback = LearningRateScheduler(scheduler)

In [None]:
# Necessary imports
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.metrics import Precision, Recall
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Model 2: Bidirectional LSTM
# Binary classifier: a 50-dimensional embedding over a 5000-entry vocabulary,
# two stacked bidirectional LSTM layers with dropout, and a sigmoid output.
# The Embedding layer looks up integer token ids in [0, input_dim).
model1 = Sequential()
model1.add(Embedding(input_dim=5000, output_dim=50))
model1.add(Bidirectional(LSTM(128, return_sequences=True)))  # full sequence is needed by the next recurrent layer
model1.add(Dropout(0.5))
model1.add(Bidirectional(LSTM(64)))  # Second Bidirectional LSTM layer
model1.add(Dropout(0.5))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', Precision(), Recall()])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model1.summary()

# Early stopping (with restore_best_weights) picks the final weights from the
# validation metrics. Validating on the test set would therefore bias the test
# scores reported afterwards, so the validation data is taken from the training
# set instead and the test set stays untouched until evaluation.
# validation_split needs array inputs, hence the np.asarray conversions.
model1.fit(
    np.asarray(x_train),
    np.asarray(y_train),
    validation_split=0.1,
    epochs=20,
    batch_size=128,
    callbacks=[callback, early_stopping],
)

# Score the test data, then threshold the sigmoid outputs at 0.5 to get hard labels
test_probabilities = model1.predict(x_test)
y_pred1 = (test_probabilities > 0.5).astype("int")

# Function for plotting confusion matrix
def plot_confusion_matrix(cm):
    """Draw ``cm`` as an annotated blue heatmap and show the figure.

    Args:
        cm: Confusion matrix of integer counts (cells are formatted with 'd').
    """
    heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    heatmap_ax.set_xlabel('Predicted')
    heatmap_ax.set_ylabel('True')
    plt.show()

# Compare the hard predictions with the test labels: confusion matrix first,
# then overall accuracy and the per-class report.
cm1 = confusion_matrix(y_test, y_pred1)
plot_confusion_matrix(cm1)

accuracy1 = accuracy_score(y_test, y_pred1)
report1 = classification_report(y_test, y_pred1)
print("Model 2 Accuracy Score:", accuracy1)
print("Model 2 Classification Report:")
print(report1)