In [None]:
import math

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import hashing_trick, one_hot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense, Bidirectional
from tensorflow.keras.metrics import Precision, Recall
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
# Load the dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/baseline_dataset.csv')

# Examine dataset: first rows, then dtypes / non-null counts.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would also emit "None".
df.info()

# Print all column names in the DataFrame
print(df.columns)

In [None]:
# Explore the balance of classes: one bar per distinct value of `label`.
# The column is passed as `x=` explicitly because seaborn 0.12+ takes `data`
# as its first positional parameter, so a bare positional Series is not
# interpreted as the variable to count.
sns.countplot(x=df['label'])
plt.show()

In [None]:
# Explore text length distribution (length of each claim, assuming claims are strings).
# `.str.len()` yields NaN for a missing claim, whereas `apply(len)` raises
# TypeError on it because NaN is a float.
df['text_length'] = df['claim'].str.len()
plt.figure(figsize=(10, 6))
sns.histplot(df['text_length'], bins=40, kde=True)
plt.xlabel('Claim length')
plt.title('Distribution of claim lengths')
plt.show()

In [None]:
# Modeling inputs: the `claim` column is the feature, the `label` column the target.
x, y = df['claim'], df['label']

In [None]:
# Hold out 20% of the claims (shuffled, seed 4) before any feature extraction.
# NOTE(review): `y_train` / `y_test` are reassigned further down by a second
# split that uses random_state=42, so after that point they no longer line up
# row-for-row with the TF-IDF `x_train` / `x_test` built here — confirm intent.
claims_train, claims_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=4
)

# Fit the TF-IDF vectorizer on the training claims only, then reuse the fitted
# vectorizer to transform the held-out claims.
my_tfidf = TfidfVectorizer()
tfidf_train = my_tfidf.fit_transform(claims_train)
tfidf_test = my_tfidf.transform(claims_test)

# Convert the vectorizer output to dense NumPy arrays.
# NOTE(review): dense arrays can be very large for a big vocabulary — verify memory use.
x_train = tfidf_train.toarray()
x_test = tfidf_test.toarray()

In [None]:
# Setting up vocabulary size (size of the hashing space for word indices)
voc_size = 10000

# Hash-encode every claim into a list of integer word indices.
# `one_hot` always hashes with Python's built-in `hash`, which is salted per
# process for strings, so the same word would map to a different index after
# every kernel restart. `hashing_trick` with 'md5' gives the same kind of
# encoding with a hash that is stable across runs.
onehot_repr = [
    hashing_trick(text, voc_size, hash_function='md5')
    for text in df['claim']
]

# Setting sentence length (number of indices per claim after padding)
# NOTE(review): 5000 is in tokens, while the `text_length` histogram above is
# computed with len() on the raw claim — confirm this value is really needed,
# since every sequence is padded out to it.
sent_length = 5000

# Padding the sentences: zeros are added at the front ('pre') up to `sent_length`.
embedded_docs = pad_sequences(onehot_repr, padding='pre', maxlen=sent_length)


In [None]:
# learning rate scheduler to adjust the learning rate during training.
from keras.callbacks import LearningRateScheduler
import keras.backend as K

def scheduler(epoch, lr):
    """Learning-rate schedule intended for Keras' ``LearningRateScheduler``.

    Args:
        epoch: Index of the epoch about to run.
        lr: Learning rate currently in use.

    Returns:
        ``lr`` unchanged while ``epoch < 10``; afterwards ``lr`` multiplied by
        ``exp(-0.1)`` as a plain Python float.
    """
    if epoch < 10:
        return lr
    # math.exp keeps the result a Python float (the callback documents a float
    # return value) and does not depend on a `tf` name being in scope.
    return float(lr) * math.exp(-0.1)

# Keras callback that obtains each epoch's learning rate from `scheduler`.
callback = LearningRateScheduler(schedule=scheduler)

In [None]:
# Necessary imports
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense
from tensorflow.keras.metrics import Precision, Recall
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Dimensionality of each word vector produced by the Embedding layer
embedding_vector_features = 40

# Features are the padded index sequences; targets come from the `label` column.
X_final = np.array(embedded_docs)
y_final = df['label'].to_numpy(copy=True)

# 80/20 train/test split (the original comment describes it as shared by both models).
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    random_state=42,
    test_size=0.2,
)

# Model 1: two stacked LSTM layers on top of a learned embedding, ending in a
# single sigmoid unit for binary classification.
model = Sequential([
    Embedding(voc_size, embedding_vector_features),
    LSTM(150, return_sequences=True),  # full sequence out, so the second LSTM gets one input per timestep
    Dropout(0.3),
    LSTM(100),  # second LSTM layer; only its last output is passed on
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', Precision(), Recall()],
)

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
model.summary()

# NOTE(review): `scheduler` keeps the learning rate unchanged while epoch < 10,
# and Keras epoch indices start at 0, so with epochs=10 the callback never
# lowers the rate — confirm whether more epochs or an earlier decay is intended.
# NOTE(review): the test split is also used as validation data here, so the
# metrics reported afterwards are not from a fully unseen set — confirm intent.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=256,
    callbacks=[callback],
)

# Score the held-out sequences, then threshold the sigmoid outputs at 0.5
# to obtain hard 0/1 class predictions.
test_probabilities = model.predict(X_test)
y_pred = (test_probabilities > 0.5).astype("int32")

# Helper for visualising a confusion matrix
def plot_confusion_matrix(cm):
    """Draw `cm` as an annotated blue heatmap and show the figure.

    Args:
        cm: Confusion matrix of integer counts (cells are formatted with 'd').
    """
    heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    heatmap_ax.set_xlabel('Predicted')
    heatmap_ax.set_ylabel('True')
    plt.show()

# Confusion matrix of true vs. predicted labels on the test split, shown as a heatmap.
cm = confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm)

# Overall accuracy, followed by the per-class report.
model_1_accuracy = accuracy_score(y_test, y_pred)
print("Model 1 Accuracy Score:", model_1_accuracy)
print("Model 1 Classification Report:")
model_1_report = classification_report(y_test, y_pred)
print(model_1_report)