In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# This code will prepare the dataset using multiple embedding and model techniques:
# - TF-IDF + Dense
# - Word2Vec + LSTM
# - FastText + CNN
# - BERT embeddings + Transformer layers

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec, FastText
from transformers import BertTokenizer, BertModel
import nltk
from nltk.tokenize import word_tokenize
# nltk.download("punkt")

# Load data
df = pd.read_csv("/kaggle/input/truth-seeker-dataset-2023-truthseeker2023/TruthSeeker2023/Truth_Seeker_Model_Dataset.csv")

# Drop NA values
df.dropna(subset=["statement", "tweet", "BinaryNumTarget"], inplace=True)

# Combine relevant text fields
df["text"] = df["statement"] + " " + df["tweet"]
y = df["BinaryNumTarget"].astype(int).values

# ----------------------------- TF-IDF + DENSE NN -----------------------------
tfidf = TfidfVectorizer(max_features=10000)
X_tfidf = tfidf.fit_transform(df["text"])

X_tfidf_train, X_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Convert to dense
X_tfidf_train_dense = X_tfidf_train.toarray()
X_tfidf_test_dense = X_tfidf_test.toarray()

class TfidfDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

class TfidfNN(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 2)
        )
    def forward(self, x):
        return self.model(x)

# Prepare TF-IDF DataLoader
train_tfidf_ds = TfidfDataset(X_tfidf_train_dense, y_tfidf_train)
test_tfidf_ds = TfidfDataset(X_tfidf_test_dense, y_tfidf_test)
train_tfidf_loader = DataLoader(train_tfidf_ds, batch_size=64, shuffle=True)
test_tfidf_loader = DataLoader(test_tfidf_ds, batch_size=64)

# Define Word2Vec + LSTM and FastText + CNN after this
# Generate word tokens for embedding models
df["tokens"] = df["text"].apply(lambda x: word_tokenize(x.lower()))

# Train Word2Vec and FastText models
w2v_model = Word2Vec(sentences=df["tokens"], vector_size=100, window=5, min_count=2, workers=4)
ft_model = FastText(sentences=df["tokens"], vector_size=100, window=5, min_count=2, workers=4)

# Embed sentences using Word2Vec
def embed_sent_w2v(tokens):
    vectors = [w2v_model.wv[word] for word in tokens if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

X_w2v = np.array([embed_sent_w2v(toks) for toks in df["tokens"]])
X_w2v_train, X_w2v_test, y_w2v_train, y_w2v_test = train_test_split(X_w2v, y, test_size=0.2, random_state=42)

# Embed sentences using FastText
def embed_sent_ft(tokens):
    vectors = [ft_model.wv[word] for word in tokens if word in ft_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

X_ft = np.array([embed_sent_ft(toks) for toks in df["tokens"]])
X_ft_train, X_ft_test, y_ft_train, y_ft_test = train_test_split(X_ft, y, test_size=0.2, random_state=42)

# Now continue with model definitions for:
# - LSTM for Word2Vec embeddings
# - CNN for FastText embeddings
# - Transformer/BERT-based embeddings next

{
    "X_tfidf_shape": X_tfidf.shape,
    "X_w2v_shape": X_w2v.shape,
    "X_ft_shape": X_ft.shape,
    "sample_text": df["text"].iloc[0]
}



{'X_tfidf_shape': (134198, 10000),
 'X_w2v_shape': (134198, 100),
 'X_ft_shape': (134198, 100),
 'sample_text': 'End of eviction moratorium means millions of Americans could lose their housing in the middle of a pandemic. @POTUS Biden Blunders - 6 Month Update\n\nInflation, Delta mismanagement, COVID for kids, Abandoning Americans in Afghanistan, Arming the Taliban, S. Border crisis, Breaking job growth, Abuse of power (Many Exec Orders, $3.5T through Reconciliation, Eviction Moratorium)...what did I miss?'}

In [16]:
df['BinaryNumTarget'].value_counts()

BinaryNumTarget
1.0    68930
0.0    65268
Name: count, dtype: int64

# Word2vec + LSTM

In [7]:
from torch.utils.data import TensorDataset


class LSTMW2V(nn.Module):
    def __init__(self, input_dim=100, hidden_dim=64):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 2)
    
    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        return self.fc(h_n[-1])

# Convert X_w2v to 3D shape: (samples, sequence_len=1, embedding_dim)
X_w2v_train_3d = torch.tensor(X_w2v_train[:, np.newaxis, :], dtype=torch.float32)
X_w2v_test_3d = torch.tensor(X_w2v_test[:, np.newaxis, :], dtype=torch.float32)
y_w2v_train = torch.tensor(y_w2v_train, dtype=torch.long)
y_w2v_test = torch.tensor(y_w2v_test, dtype=torch.long)

train_w2v_ds = TensorDataset(X_w2v_train_3d, y_w2v_train)
test_w2v_ds = TensorDataset(X_w2v_test_3d, y_w2v_test)

train_w2v_loader = DataLoader(train_w2v_ds, batch_size=64, shuffle=True)
test_w2v_loader = DataLoader(test_w2v_ds, batch_size=64)

model_w2v = LSTMW2V()
optimizer = torch.optim.Adam(model_w2v.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Train loop
for epoch in range(50):
    model_w2v.train()
    total_loss = 0
    for xb, yb in train_w2v_loader:
        optimizer.zero_grad()
        pred = model_w2v(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")



  y_w2v_train = torch.tensor(y_w2v_train, dtype=torch.long)
  y_w2v_test = torch.tensor(y_w2v_test, dtype=torch.long)


Epoch 1, Loss: 336.8802
Epoch 2, Loss: 149.4038
Epoch 3, Loss: 103.3492
Epoch 4, Loss: 79.6817
Epoch 5, Loss: 64.7250
Epoch 6, Loss: 55.7825
Epoch 7, Loss: 46.5139
Epoch 8, Loss: 41.5372
Epoch 9, Loss: 36.2298
Epoch 10, Loss: 31.8420
Epoch 11, Loss: 28.9013
Epoch 12, Loss: 26.0275
Epoch 13, Loss: 22.8323
Epoch 14, Loss: 20.5549
Epoch 15, Loss: 18.8809
Epoch 16, Loss: 16.9253
Epoch 17, Loss: 14.8794
Epoch 18, Loss: 13.9634
Epoch 19, Loss: 12.8536
Epoch 20, Loss: 11.4497
Epoch 21, Loss: 10.2339
Epoch 22, Loss: 10.3583
Epoch 23, Loss: 8.8308
Epoch 24, Loss: 7.7048
Epoch 25, Loss: 6.7385
Epoch 26, Loss: 7.3439
Epoch 27, Loss: 6.0944
Epoch 28, Loss: 5.7073
Epoch 29, Loss: 5.6105
Epoch 30, Loss: 5.0560
Epoch 31, Loss: 4.4141
Epoch 32, Loss: 4.4664
Epoch 33, Loss: 3.5708
Epoch 34, Loss: 3.8614
Epoch 35, Loss: 3.9508
Epoch 36, Loss: 2.4993
Epoch 37, Loss: 4.6406
Epoch 38, Loss: 2.2884
Epoch 39, Loss: 2.9537
Epoch 40, Loss: 2.7268
Epoch 41, Loss: 2.5273
Epoch 42, Loss: 2.8547
Epoch 43, Loss: 2.

In [8]:
def evaluate_model(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for xb, yb in dataloader:
            outputs = model(xb)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == yb).sum().item()
            total += yb.size(0)
    return correct / total

accuracy = evaluate_model(model_w2v, test_w2v_loader)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 99.41%


# FastText + CNN

In [13]:
class CNNFT(nn.Module):
    def __init__(self, input_dim=100):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3)
        self.pool = nn.MaxPool1d(2)
        self.fc = nn.Linear(16 * 49, 2)  # Adjust based on conv+pool output size
    
    def forward(self, x):
        x = x.unsqueeze(1)  # Add channel dim -> (B, C=1, D)
        x = self.pool(torch.relu(self.conv1(x)))
        x = x.view(x.size(0), -1)
        return self.fc(x)

X_ft_train_tensor = torch.tensor(X_ft_train, dtype=torch.float32)
X_ft_test_tensor = torch.tensor(X_ft_test, dtype=torch.float32)
y_ft_train_tensor = torch.tensor(y_ft_train, dtype=torch.long)
y_ft_test_tensor = torch.tensor(y_ft_test, dtype=torch.long)

train_ft_ds = TensorDataset(X_ft_train_tensor, y_ft_train_tensor)
test_ft_ds = TensorDataset(X_ft_test_tensor, y_ft_test_tensor)

train_ft_loader = DataLoader(train_ft_ds, batch_size=64, shuffle=True)
test_ft_loader = DataLoader(test_ft_ds, batch_size=64)

model_ft = CNNFT()
optimizer = torch.optim.Adam(model_ft.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# Train loop (simple)
for epoch in range(5):
    model_ft.train()
    total_loss = 0
    for xb, yb in train_ft_loader:
        optimizer.zero_grad()
        pred = model_ft(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_ft_loader)
    print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}")

Epoch 1 - Loss: 0.3603
Epoch 2 - Loss: 0.2865
Epoch 3 - Loss: 0.2738
Epoch 4 - Loss: 0.2654
Epoch 5 - Loss: 0.2615


In [14]:
def evaluate_model(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for xb, yb in dataloader:
            outputs = model(xb)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == yb).sum().item()
            total += yb.size(0)
    return correct / total

accuracy = evaluate_model(model_ft, test_ft_loader)
print(f"FastText + CNN Test Accuracy: {accuracy * 100:.2f}%")



FastText + CNN Test Accuracy: 89.56%


#  BERT (Transformer) from HuggingFace

In [None]:
from transformers import BertTokenizer, BertModel
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize data
train_texts, test_texts, train_labels, test_labels = train_test_split(df["text"].tolist(), y.tolist(), test_size=0.2)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": train_labels
})

test_dataset = Dataset.from_dict({
    "input_ids": test_encodings["input_ids"],
    "attention_mask": test_encodings["attention_mask"],
    "labels": test_labels
})

# Define model
model_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Trainer setup
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

trainer = Trainer(
    model=model_bert,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train BERT
trainer.train()
