In [None]:
import os
import zipfile
import pandas as pd
import shutil
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there
# (the dataset archive and the model checkpoint) are reachable on the filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Anchor the project folder to the absolute mount point passed to
# drive.mount('/content/drive'). A relative path silently depends on the
# notebook's current working directory and breaks after any directory change.
base_dir = '/content/drive/MyDrive/Colab Notebooks/data_mining'
filename = 'liar_dataset.zip'

# Location of the archive and of the folder its contents are unpacked into
zip_path = os.path.join(base_dir, filename)
extract_path = os.path.join(base_dir, "liar_dataset")

# Unpack the archive into extract_path
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"Dataset extracted to: {extract_path}")

In [None]:
# Locations of the three dataset splits inside the extracted archive
train_path, valid_path, test_path = (
    os.path.join(extract_path, split_file)
    for split_file in ("train.tsv", "valid.tsv", "test.tsv")
)

# Column names applied positionally when the TSV files are read
# (the files are loaded with header=None).
columns = (
    ["id", "label", "statement", "subject", "speaker", "job", "state", "party"]
    + ["barely_true", "false", "half_true", "mostly_true", "pants_on_fire"]
    + ["context"]
)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset as TorchDataset
from transformers import AutoModel, AutoTokenizer
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Integer code assigned to each label string
label_map = {"pants-fire": 0, "false": 1, "barely-true": 2, "half-true": 3, "mostly-true": 4, "true": 5}


def _load_split(tsv_path):
    """Read one split, drop rows missing "speaker" or "party", and encode labels as integers."""
    return (
        pd.read_csv(tsv_path, sep='\t', header=None, names=columns)
        .dropna(subset=["speaker", "party"])
        .assign(label=lambda frame: frame["label"].map(label_map))
    )


train_df = _load_split(train_path)
valid_df = _load_split(valid_path)
test_df = _load_split(test_path)

# One-hot encode the metadata columns. The encoder is fitted on the training
# split only; categories unseen at fit time become all-zero rows
# (handle_unknown="ignore").
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
train_metadata = encoder.fit_transform(train_df[["speaker", "party"]])
valid_metadata = encoder.transform(valid_df[["speaker", "party"]])
test_metadata = encoder.transform(test_df[["speaker", "party"]])

# Tokenizer matching the "roberta-base" checkpoint used by the model
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_text(text, max_length=512):
    """Tokenize one statement with the module-level ``tokenizer``.

    Args:
        text: The statement to encode.
        max_length: Fixed sequence length. Inputs are padded up to and
            truncated down to exactly this many tokens. Defaults to 512,
            the value previously hard-coded here; pass a smaller value to
            avoid padding short statements to the full length.

    Returns:
        Tuple ``(input_ids, attention_mask)``. Both tensors have the batch
        dimension added by ``return_tensors="pt"`` squeezed away.
    """
    encodings = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return encodings["input_ids"].squeeze(0), encodings["attention_mask"].squeeze(0)

# Each split goes through the same four steps:
#   1. tokenize every statement,
#   2. store the token ids / attention masks as DataFrame columns,
#   3. turn the one-hot metadata matrix into a list of rows,
#   4. pull the integer labels out as a plain list.

# --- training split ---
train_tokenized = train_df["statement"].apply(tokenize_text)
train_df["input_ids"], train_df["attention_mask"] = zip(*train_tokenized)
train_metadata = train_metadata.tolist()
train_labels = train_df["label"].tolist()

# --- validation split ---
valid_tokenized = valid_df["statement"].apply(tokenize_text)
valid_df["input_ids"], valid_df["attention_mask"] = zip(*valid_tokenized)
valid_metadata = valid_metadata.tolist()
valid_labels = valid_df["label"].tolist()

# --- test split ---
test_tokenized = test_df["statement"].apply(tokenize_text)
test_df["input_ids"], test_df["attention_mask"] = zip(*test_tokenized)
test_metadata = test_metadata.tolist()
test_labels = test_df["label"].tolist()

# Create dataset class
class HybridDataset(TorchDataset):
    """Map-style dataset over a sequence of per-sample record dicts.

    Each record must provide the keys ``"input_ids"``, ``"attention_mask"``,
    ``"metadata"`` and ``"label"``. Values may be tensors or plain Python
    sequences / scalars.
    """

    def __init__(self, dataset):
        # Sequence of record dicts; indexed lazily in __getitem__
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    @staticmethod
    def _as_tensor(value, dtype):
        """Return an independent tensor copy of ``value`` with the given dtype.

        ``torch.tensor()`` warns when handed an existing tensor, so tensors
        are copied with ``detach().clone()`` instead.
        """
        if isinstance(value, torch.Tensor):
            return value.detach().clone().to(dtype)
        return torch.tensor(value, dtype=dtype)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors (label is exposed as ``"labels"``)."""
        item = self.dataset[idx]
        return {
            "input_ids": self._as_tensor(item["input_ids"], torch.long),
            "attention_mask": self._as_tensor(item["attention_mask"], torch.long),
            "metadata": self._as_tensor(item["metadata"], torch.float),
            "labels": self._as_tensor(item["label"], torch.long),
        }

def _to_records(frame, metadata_rows, label_values):
    """Bundle each row's token ids, attention mask, metadata vector and label into one dict."""
    return [
        {"input_ids": ids, "attention_mask": mask, "metadata": meta, "label": label}
        for ids, mask, meta, label in zip(
            frame["input_ids"], frame["attention_mask"], metadata_rows, label_values
        )
    ]


# One list of record dicts per split
train_data = _to_records(train_df, train_metadata, train_labels)
valid_data = _to_records(valid_df, valid_metadata, valid_labels)
test_data = _to_records(test_df, test_metadata, test_labels)

# Wrap the records so DataLoader can index them
train_dataset = HybridDataset(train_data)
valid_dataset = HybridDataset(valid_data)
test_dataset = HybridDataset(test_data)

# Only the training loader shuffles; validation and test keep their row order.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)



In [None]:
# Define Hybrid Model
class HybridBERTModel(nn.Module):
    """Classifier that fuses a "roberta-base" text encoding with dense metadata features.

    The encoder's ``pooler_output`` is projected to 256 dimensions and the
    metadata vector to 128. Both go through ReLU, are concatenated, and are
    mapped to ``num_labels`` logits.

    Args:
        num_metadata_features: Width of the metadata vector fed to ``metadata_fc``.
        num_labels: Number of output classes (default 6).
    """

    def __init__(self, num_metadata_features, num_labels=6):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so keep them stable.
        self.bert = AutoModel.from_pretrained("roberta-base")
        # 768 must equal the width of the encoder's pooler_output
        self.text_fc = nn.Linear(768, 256)
        self.metadata_fc = nn.Linear(num_metadata_features, 128)
        self.classifier = nn.Linear(256 + 128, num_labels)

    def forward(self, input_ids, attention_mask, metadata_features):
        """Return the raw class logits, one row per input sample."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_branch = self.text_fc(encoder_out.pooler_output).relu()
        metadata_branch = self.metadata_fc(metadata_features).relu()
        fused = torch.cat([text_branch, metadata_branch], dim=1)
        return self.classifier(fused)

# Use the GPU when one is available. The metadata branch is sized from the
# width of the first one-hot row.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_metadata_features = len(train_metadata[0])
model = HybridBERTModel(num_metadata_features=num_metadata_features).to(device)

# Cross-entropy over the class logits, optimized with AdamW
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)


def _train_one_epoch(dataloader):
    """Run one optimization pass over ``dataloader`` and return the mean per-batch loss."""
    model.train()
    running_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        logits = model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
            batch["metadata"].to(device),
        )
        loss = loss_fn(logits, batch["labels"].to(device))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)


# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    avg_loss = _train_one_epoch(train_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")



In [None]:
# Persist only the learned weights (the state_dict) inside base_dir
checkpoint_path = os.path.join(base_dir, "hybrid_model.pth")
torch.save(model.state_dict(), checkpoint_path)
print("Model saved as hybrid_model.pth")

In [None]:
# Evaluation Function
def evaluate(model, dataloader, num_classes=6):
    """Score ``model`` on ``dataloader`` and print accuracy, precision/recall/F1 and AUC-ROC.

    Args:
        model: Module called as ``model(input_ids, attention_mask, metadata)``
            that returns class logits.
        dataloader: Iterable of batches with keys ``"input_ids"``,
            ``"attention_mask"``, ``"metadata"`` and ``"labels"``.
        num_classes: Number of classes the logits cover (default 6, matching
            the model's default ``num_labels``).

    Returns:
        Tuple ``(accuracy, precision, recall, f1, auc_roc, all_preds, all_labels)``.
        Precision, recall and F1 are weighted averages. ``auc_roc`` is NaN when
        it cannot be computed.
    """
    # Imported here because the notebook's import cells do not provide it.
    from sklearn.metrics import roc_auc_score

    # Run on whatever device the model lives on instead of relying on a global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    all_preds, all_labels, all_probs = [], [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(target_device)
            attention_mask = batch["attention_mask"].to(target_device)
            metadata = batch["metadata"].to(target_device)

            outputs = model(input_ids, attention_mask, metadata)
            # Class probabilities are needed for AUC. Softmax is taken in
            # float64 so each row sums to 1 within scikit-learn's tolerance.
            probs = torch.softmax(outputs.double(), dim=1)

            all_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="weighted")

    try:
        # One-vs-one multi-class AUC. `labels` tells scikit-learn which class
        # each probability column belongs to, even if a class is absent from y_true.
        auc_roc = roc_auc_score(
            np.asarray(all_labels),
            np.asarray(all_probs),
            multi_class="ovo",
            labels=np.arange(num_classes),
        )
    except ValueError:
        # Raised e.g. when the true labels contain a single class
        auc_roc = float("nan")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")
    print(f"AUC-ROC: {auc_roc:.4f}")

    return accuracy, precision, recall, f1, auc_roc, all_preds, all_labels

# Score the trained model on the held-out test split. `hybrid_results` is the
# tuple (accuracy, precision, recall, f1, auc_roc, all_preds, all_labels).
print("Evaluating on Test Set...")
hybrid_results = evaluate(model, test_dataloader)

# NOTE: the checkpoint is already written by the dedicated save cell earlier in
# the notebook, so the duplicate save below stays disabled.

# torch.save(model.state_dict(), os.path.join(base_dir, "hybrid_model.pth"))
# print("Model saved as hybrid_model.pth")



In [None]:
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    """Draw an annotated confusion-matrix heatmap for the classes in ``label_map``.

    Args:
        y_true: Ground-truth integer class ids.
        y_pred: Predicted integer class ids, aligned with ``y_true``.
        title: Figure title.
    """
    class_names = list(label_map.keys())
    class_ids = list(label_map.values())

    # Pin the label set. Without it, confusion_matrix only includes classes
    # that occur in the data, so a missing class would shrink the matrix and
    # shift every tick label.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(title)
    plt.show()

# evaluate() returns (accuracy, precision, recall, f1, auc_roc, all_preds, all_labels).
# True labels are at index 6 and predictions at index 5; index 4 is the AUC scalar.
plot_confusion_matrix(hybrid_results[6], hybrid_results[5], title="Confusion Matrix for Hybrid")

### Load saved model

In [None]:
class HybridBERTModel(nn.Module):
    """Text + metadata classifier, redefined so the checkpoint can be reloaded.

    The layer names and sizes must match the architecture whose weights were
    saved, because ``load_state_dict`` matches parameters by attribute name.

    Args:
        num_metadata_features: Width of the metadata vector fed to ``metadata_fc``.
        num_labels: Number of output classes (default 6).
    """

    def __init__(self, num_metadata_features, num_labels=6):
        super().__init__()
        self.bert = AutoModel.from_pretrained("roberta-base")
        # 768 must equal the width of the encoder's pooler_output
        self.text_fc = nn.Linear(768, 256)
        self.metadata_fc = nn.Linear(num_metadata_features, 128)
        self.classifier = nn.Linear(256 + 128, num_labels)

    def forward(self, input_ids, attention_mask, metadata_features):
        """Return the raw class logits, one row per input sample."""
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        text_hidden = self.text_fc(pooled).relu()
        metadata_hidden = self.metadata_fc(metadata_features).relu()
        joint = torch.cat([text_hidden, metadata_hidden], dim=1)
        return self.classifier(joint)


In [None]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the checkpoint first so the metadata input width comes from the saved
# weights rather than a hard-coded count that goes stale whenever the one-hot
# vocabulary changes.
checkpoint_path = os.path.join(base_dir, "hybrid_model.pth")
state_dict = torch.load(checkpoint_path, map_location=device)

# nn.Linear stores its weight as (out_features, in_features)
num_metadata_features = state_dict["metadata_fc.weight"].shape[1]
print(f"Metadata Feature Count: {num_metadata_features}")
saved_model = HybridBERTModel(num_metadata_features=num_metadata_features).to(device)

# Load weights
saved_model.load_state_dict(state_dict)
saved_model.eval()  # Set model to evaluation mode

print("✅ Model reloaded successfully!")


In [None]:
# train_df['speaker'].nunique()+train_df['party'].nunique()

In [None]:
def predict(model, input_ids, attention_mask, metadata):
    """Return the predicted class index for each sample in one batch.

    Inputs are moved to the device the model's parameters live on, so the
    function does not depend on a module-level ``device`` variable.

    Args:
        model: Module called as ``model(input_ids, attention_mask, metadata)``
            that returns class logits.
        input_ids: Token-id tensor for the batch.
        attention_mask: Attention-mask tensor for the batch.
        metadata: Metadata feature tensor for the batch.

    Returns:
        NumPy array of argmax class indices, one per sample.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    with torch.no_grad():
        outputs = model(
            input_ids.to(target_device),
            attention_mask.to(target_device),
            metadata.to(target_device),
        )
        predicted_label = torch.argmax(outputs, dim=1).cpu().numpy()
    return predicted_label

# Smoke test: run the reloaded model on the first test batch only
batch = next(iter(test_dataloader), None)
if batch is not None:
    preds = predict(saved_model, batch["input_ids"], batch["attention_mask"], batch["metadata"])
    print("Predictions:", preds)

In [None]:
# !pip install shap

In [None]:
import shap

# Convert the test-split metadata rows to a float tensor on the active device
metadata_tensor = torch.tensor(test_metadata, dtype=torch.float).to(device)
# Rebind the global `model` to the reloaded checkpoint: model_predict below
# (and any later cell that references `model`) now uses saved_model.
model = saved_model

# Define SHAP explainer function
def model_predict(metadata):
    """Score metadata rows using only the model's metadata branch.

    The text branch is replaced by zeros, so the returned logits reflect the
    metadata features alone.

    Args:
        metadata: Array-like of metadata rows, one row per sample.

    Returns:
        NumPy array of classifier logits, one row per input row.
    """
    with torch.no_grad():
        meta_batch = torch.tensor(metadata, dtype=torch.float).to(device)
        meta_hidden = model.metadata_fc(meta_batch).relu()

        # Zero placeholder standing in for the 256-dim text features, so the
        # concatenated input has the 384 columns the classifier expects.
        text_placeholder = torch.zeros((meta_hidden.shape[0], 256)).to(device)
        fused = torch.cat((text_placeholder, meta_hidden), dim=1)

        logits = model.classifier(fused)
        return logits.cpu().numpy()

# Background set for KernelExplainer: 50 rows sampled from the test metadata
background_rows = shap.sample(metadata_tensor.cpu().numpy(), 50)

# KernelExplainer is model-agnostic (slower, but works with any callable)
explainer = shap.KernelExplainer(model_predict, background_rows)

# SHAP values for every test metadata row
shap_values = explainer.shap_values(metadata_tensor.cpu().numpy())


In [None]:
# Summary plot of the SHAP values for the test metadata rows, with features
# labelled by the one-hot encoder's output feature names.
shap.summary_plot(shap_values, metadata_tensor.cpu().numpy(), feature_names=encoder.get_feature_names_out())

- Higher absolute SHAP value → the feature has a larger impact on the model output.
- Positive SHAP value → the feature pushes the output (logit) of the explained class higher.
- Negative SHAP value → the feature pushes the output (logit) of the explained class lower.
- SHAP value near zero → the feature has little effect on that output.

In [None]:
# Earlier attempts, kept for reference:
# shap.force_plot(explainer.expected_value, shap_values[0], metadata_tensor.cpu().numpy()[0], feature_names=encoder.get_feature_names_out())
# shap.plots.force(explainer.expected_value, shap_values[0], metadata_tensor.cpu().numpy()[0])
# Force plot for model output index 0 (the "pants-fire" class in label_map).
# NOTE(review): `shap_values[..., 0]` assumes shap_values is an array whose last
# axis indexes the model outputs — confirm for the installed shap version.
shap.plots.force(explainer.expected_value[0], shap_values[..., 0], metadata_tensor.cpu().numpy())



### with Monte-Carlo dropout

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel

class HybridBERTModelMC(nn.Module):
    """Hybrid text + metadata classifier with a dropout layer before the output head.

    It has the same layers as the plain hybrid model, plus ``nn.Dropout``
    applied to the fused features. The dropout is only stochastic while the
    module is in train mode, so Monte-Carlo inference must keep the model in
    train mode (as ``mc_dropout_predict`` does via ``model.train()``).

    Args:
        num_metadata_features: Width of the metadata vector fed to ``metadata_fc``.
        num_labels: Number of output classes (default 6).
        dropout_prob: Drop probability of the pre-classifier dropout (default 0.3).
    """

    def __init__(self, num_metadata_features, num_labels=6, dropout_prob=0.3):
        super().__init__()
        self.bert = AutoModel.from_pretrained("roberta-base")
        # 768 must equal the width of the encoder's pooler_output
        self.text_fc = nn.Linear(768, 256)
        self.metadata_fc = nn.Linear(num_metadata_features, 128)
        self.dropout = nn.Dropout(dropout_prob)  # applied just before classification
        self.classifier = nn.Linear(256 + 128, num_labels)

    def forward(self, input_ids, attention_mask, metadata_features):
        """Return the class logits; the fused features pass through dropout first."""
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        text_hidden = self.text_fc(pooled).relu()
        metadata_hidden = self.metadata_fc(metadata_features).relu()
        fused = self.dropout(torch.cat([text_hidden, metadata_hidden], dim=1))
        return self.classifier(fused)


✔ Runs multiple stochastic forward passes (num_samples times).

✔ Returns multiple predictions and probability distributions.

In [None]:
def mc_dropout_predict(model, dataloader, num_samples=30):
    """Run ``num_samples`` stochastic forward passes per batch with dropout active.

    The model is switched to train mode for the duration of the call so its
    dropout layers stay stochastic. Its previous train/eval mode is restored
    afterwards, even if a forward pass raises. Inputs are moved to the device
    the model's parameters live on.

    Args:
        model: Module called as ``model(input_ids, attention_mask, metadata)``
            that returns class logits.
        dataloader: Iterable of batches with keys ``"input_ids"``,
            ``"attention_mask"`` and ``"metadata"``.
        num_samples: Number of forward passes per batch.

    Returns:
        Tuple ``(all_preds, all_probs)``: ``all_preds`` has shape
        ``[num_samples, total_samples]`` (argmax class per pass) and
        ``all_probs`` has shape ``[num_samples, total_samples, num_classes]``
        (softmax probabilities per pass).

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.train()  # keep dropout active during inference

    all_preds = []
    all_probs = []

    try:
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch["input_ids"].to(target_device)
                attention_mask = batch["attention_mask"].to(target_device)
                metadata = batch["metadata"].to(target_device)

                # Collect multiple predictions per sample
                batch_preds = []
                batch_probs = []

                for _ in range(num_samples):
                    outputs = model(input_ids, attention_mask, metadata)
                    batch_probs.append(torch.softmax(outputs, dim=1).cpu().numpy())
                    batch_preds.append(torch.argmax(outputs, dim=1).cpu().numpy())

                all_preds.append(np.array(batch_preds))  # [num_samples, batch_size]
                all_probs.append(np.array(batch_probs))  # [num_samples, batch_size, num_classes]
    finally:
        # Do not leak train mode into later, supposedly deterministic, evaluation.
        model.train(was_training)

    if not all_preds:
        raise ValueError("mc_dropout_predict: dataloader yielded no batches")

    all_preds = np.concatenate(all_preds, axis=1)  # [num_samples, total_samples]
    all_probs = np.concatenate(all_probs, axis=1)  # [num_samples, total_samples, num_classes]

    return all_preds, all_probs


✔ Higher variance = Model is unsure about the prediction.

✔ Lower confidence = More ambiguous classification.

In [None]:
import numpy as np

def compute_uncertainty(mc_preds, mc_probs):
    """Summarize Monte-Carlo passes into one label and one uncertainty score per sample.

    Args:
        mc_preds: Integer class predictions, shape ``[num_samples, total_samples]``.
        mc_probs: Class probabilities, shape
            ``[num_samples, total_samples, num_classes]``.

    Returns:
        Tuple ``(mean_preds, uncertainty)``: ``mean_preds`` is the
        majority-vote class per sample (ties go to the lowest class index),
        and ``uncertainty`` is the variance of the probabilities across
        passes, averaged over classes.
    """
    mc_preds = np.asarray(mc_preds)
    mc_probs = np.asarray(mc_probs)
    num_classes = mc_probs.shape[-1]

    # Real majority vote. Averaging class indices (e.g. votes 0 and 5 giving
    # 2.5) does not produce a meaningful label.
    vote_counts = np.stack([(mc_preds == cls).sum(axis=0) for cls in range(num_classes)])
    mean_preds = vote_counts.argmax(axis=0)

    # Variance across passes for each class, then the mean over classes
    uncertainty = np.var(mc_probs, axis=0).mean(axis=1)
    return mean_preds, uncertainty


In [None]:
print("Running MC Dropout inference...")

# Use the dropout-equipped architecture for Monte-Carlo inference. nn.Dropout
# has no parameters, so the trained weights load into it unchanged.
mc_model = HybridBERTModelMC(
    num_metadata_features=model.metadata_fc.in_features,
    num_labels=model.classifier.out_features,
).to(device)
mc_model.load_state_dict(model.state_dict())

mc_preds, mc_probs = mc_dropout_predict(mc_model, test_dataloader, num_samples=30)

# Compute uncertainty
mean_preds, uncertainty_scores = compute_uncertainty(mc_preds, mc_probs)

# Display uncertainty for the first 10 predictions (or fewer if the set is smaller)
for i in range(min(10, len(mean_preds))):
    print(f"Sample {i}: Predicted Label: {mean_preds[i]}, Uncertainty: {uncertainty_scores[i]:.4f}")



🚀 Summary of Monte Carlo Dropout

- Modify the model → Add dropout before classification.
- Enable dropout during inference → model.train() mode.
- Run multiple stochastic passes → Collect multiple predictions.
- Measure uncertainty → Compute variance & confidence scores.
- Use uncertainty-aware predictions → Flag unreliable predictions.