In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/fake-or-real-the-impostor-hunt/data/train.csv
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0192/file_2.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0192/file_1.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0956/file_2.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0956/file_1.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0266/file_2.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0266/file_1.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0435/file_2.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0435/file_1.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_1054/file_2.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_1054/file_1.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0664/file_2.txt
/kaggle/input/fake-or-real-the-impostor-hunt/data/test/article_0664/fil

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
SciBERT → Residual Stacking Ensemble for "Fake or Real: The Impostor Hunt"

- Stage 1: trains multiple base models (SVMs, Averaged SVM, Torch NN)
- Stage 2: meta-learners (LogReg, RF, HistGB, Torch NN) trained on residuals
- Stage 3: higher-level meta-learners trained on Stage 2 residuals
- Picks best Stage 2 learner via CV (Accuracy)
- Picks best Stage 3 learner via CV (R^2 on residual-target2 probabilities)

Supportive additions (cuML stacking tips):
- Optional GPU models (cuML) are added if available (no-op otherwise)
- Extra engineered meta features from Level-1 OOF predictions:
  consensus(mean), confidence(std), min, max, median, q25, q75, entropy
- NEW: Optional Level-3 weighted blend of top-2 Stage-2 meta learners (toggle)
"""

import os
import re
import random
from typing import List, Tuple, Optional

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertModel

from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, r2_score
from sklearn.base import BaseEstimator, ClassifierMixin

# ----------------------------- #
# Config
# ----------------------------- #

DATA_PREFIX = "/kaggle/input/fake-or-real-the-impostor-hunt/data/"
TRAIN_DIR = os.path.join(DATA_PREFIX, "train")
TEST_DIR = os.path.join(DATA_PREFIX, "test")
TRAIN_CSV = os.path.join(DATA_PREFIX, "train.csv")

MODEL_NAME = "allenai/scibert_scivocab_uncased"
BATCH_SIZE = 32
MAX_LEN = 512
DEVICE = "cpu"
SEED = 42

# Toggle: use Level-3 weighted blend of top-2 Stage-2 meta-learners
USE_WEIGHTED_L3 = True

EMB_CACHE_DIR = "./emb_cache"
os.makedirs(EMB_CACHE_DIR, exist_ok=True)
TRAIN_EMB_1_NPY = os.path.join(EMB_CACHE_DIR, "train_first_embeddings.npy")
TRAIN_EMB_2_NPY = os.path.join(EMB_CACHE_DIR, "train_second_embeddings.npy")
TEST_EMB_1_NPY = os.path.join(EMB_CACHE_DIR, "test_first_embeddings.npy")
TEST_EMB_2_NPY = os.path.join(EMB_CACHE_DIR, "test_second_embeddings.npy")

# ----------------------------- #
# Optional cuML (GPU) availability
# ----------------------------- #

def _try_import_cuml():
    try:
        import cuml
        from cuml.linear_model import LogisticRegression as cuLogReg
        from cuml.svm import SVC as cuSVC
        from cuml.ensemble import RandomForestClassifier as cuRF
        from cuml.neighbors import KNeighborsClassifier as cuKNN
        return {
            "ok": True,
            "cuLogReg": cuLogReg,
            "cuSVC": cuSVC,
            "cuRF": cuRF,
            "cuKNN": cuKNN
        }
    except Exception:
        return {"ok": False}

_CU = _try_import_cuml()

# ----------------------------- #
# Utilities
# ----------------------------- #

def set_seed(seed: int = 42) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def safe_read_file(path: str) -> Optional[str]:
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read().strip()
    except FileNotFoundError:
        return None
    except Exception:
        return None

def find_article_folders(parent: str) -> List[Tuple[int, str]]:
    out: List[Tuple[int, str]] = []
    for name in os.listdir(parent):
        m = re.match(r"article_(\d{4})$", name)
        if m:
            out.append((int(m.group(1)), name))
    out.sort(key=lambda x: x[0])
    return out

def load_pairs(data_folder: str) -> List[Tuple[int, str, str]]:
    pairs: List[Tuple[int, str, str]] = []
    folders = find_article_folders(data_folder)
    for art_id, folder in folders:
        f1 = safe_read_file(os.path.join(data_folder, folder, "file_1.txt"))
        f2 = safe_read_file(os.path.join(data_folder, folder, "file_2.txt"))
        if f1 is None or f2 is None:
            raise RuntimeError(f"Missing files for {folder}")
        pairs.append((art_id, f1, f2))
    return pairs

def load_labels(labels_csv: str) -> pd.DataFrame:
    return pd.read_csv(labels_csv)

# ----------------------------- #
# BERT Embeddings
# ----------------------------- #

class BertEmbedder:
    def __init__(self, model_name: str, device: str = "cpu", max_len: int = 512):
        self.device = torch.device(device)
        self.max_len = max_len
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name).to(self.device)
        self.model.eval()

    @torch.no_grad()
    def encode(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        all_batches = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i: i + batch_size]
            inputs = self.tokenizer(
                batch, return_tensors="pt", truncation=True, padding=True, max_length=self.max_len
            ).to(self.device)
            outputs = self.model(**inputs)
            token_emb = outputs.last_hidden_state
            mask = inputs["attention_mask"].unsqueeze(-1)
            masked_sum = (token_emb * mask).sum(dim=1)
            lengths = mask.sum(dim=1).clamp(min=1)
            mean_emb = masked_sum / lengths
            all_batches.append(mean_emb.detach().cpu().numpy())
        return np.concatenate(all_batches, axis=0)

# ----------------------------- #
# Dataset helpers
# ----------------------------- #

def build_all_embeddings(first_emb, second_emb, labels_df):
    X_list, y_list = [], []
    for i in range(len(labels_df)):
        real_id = labels_df.iloc[i].real_text_id
        X_list.append(first_emb[i]); y_list.append(1 if real_id == 1 else 0)
        X_list.append(second_emb[i]); y_list.append(1 if real_id == 2 else 0)
    return np.stack(X_list, axis=0), np.array(y_list, dtype=int)

# ----------------------------- #
# Kernels
# ----------------------------- #

def _rbf_kernel(X, Y, gamma=None):
    if gamma is None:
        gamma = 1.0 / max(1, X.shape[1])
    X_norm = (X * X).sum(axis=1)[:, None]
    Y_norm = (Y * Y).sum(axis=1)[None, :]
    K = X_norm + Y_norm - 2.0 * (X @ Y.T)
    return np.exp(-gamma * K)

def _sigmoid_kernel(X, Y, gamma=None, coef0=0.0):
    if gamma is None:
        gamma = 1.0 / max(1, X.shape[1])
    return np.tanh(gamma * (X @ Y.T) + coef0)

def kernel_rbf_plus_sigmoid_avg(X, Y):
    return 0.5 * (_rbf_kernel(X, Y) + _sigmoid_kernel(X, Y))

def kernel_rbf_plus_sigmoid_sum(X, Y):
    return _rbf_kernel(X, Y) + _sigmoid_kernel(X, Y)

# ----------------------------- #
# Torch NN
# ----------------------------- #

class TorchNN(BaseEstimator, ClassifierMixin):
    def __init__(self, hidden_dim=64, epochs=5, batch_size=128, lr=1e-3, random_state=SEED):
        self.hidden_dim = hidden_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr
        self.random_state = random_state
        self._model = None
        self.classes_ = np.array([0, 1], dtype=int)

    class _Net(nn.Module):
        def __init__(self, in_dim, hidden):
            super().__init__()
            self.seq = nn.Sequential(
                nn.Linear(in_dim, hidden),
                nn.ReLU(),
                nn.Linear(hidden, 1)
            )
        def forward(self, x):
            return self.seq(x).squeeze(-1)

    def fit(self, X, y):
        torch.manual_seed(self.random_state)
        self._model = self._Net(X.shape[1], self.hidden_dim)
        ds = TensorDataset(
            torch.tensor(X, dtype=torch.float32),
            torch.tensor(y, dtype=torch.float32)
        )
        dl = DataLoader(ds, batch_size=self.batch_size, shuffle=True)
        opt = torch.optim.Adam(self._model.parameters(), lr=self.lr)
        crit = nn.BCEWithLogitsLoss()
        self._model.train()
        for _ in range(self.epochs):
            for xb, yb in dl:
                opt.zero_grad()
                logit = self._model(xb)
                loss = crit(logit, yb)
                loss.backward()
                opt.step()
        return self

    def predict_proba(self, X):
        self._model.eval()
        xb = torch.tensor(X, dtype=torch.float32)
        with torch.no_grad():
            logit = self._model(xb)
            p1 = torch.sigmoid(logit).numpy()
        return np.stack([1 - p1, p1], axis=1)

    def decision_function(self, X):
        return self.predict_proba(X)[:, 1]

    def predict(self, X):
        return (self.decision_function(X) >= 0.5).astype(int)

# ----------------------------- #
# Meta features from Level-1 predictions
# ----------------------------- #

def _entropy(p, eps=1e-12):
    p = np.clip(p, eps, 1 - eps)
    return -(p * np.log(p) + (1 - p) * np.log(1 - p))

def build_meta_features(pred_matrix: np.ndarray) -> np.ndarray:
    mean = pred_matrix.mean(axis=1, keepdims=True)
    std = pred_matrix.std(axis=1, keepdims=True)
    mn = pred_matrix.min(axis=1, keepdims=True)
    mx = pred_matrix.max(axis=1, keepdims=True)
    med = np.median(pred_matrix, axis=1, keepdims=True)
    q25 = np.quantile(pred_matrix, 0.25, axis=1, keepdims=True)
    q75 = np.quantile(pred_matrix, 0.75, axis=1, keepdims=True)
    ent = _entropy(mean.squeeze(1)).reshape(-1, 1)
    return np.concatenate([pred_matrix, mean, std, mn, mx, med, q25, q75, ent], axis=1)

# ----------------------------- #
# Residual Stacking Classifier
# ----------------------------- #

class ResidualStackingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, random_state=SEED):
        self.random_state = random_state
        # Base models
        self.base_models = {
            "SVM-RBF": Pipeline([("scaler", StandardScaler()), ("clf", SVC(kernel="rbf", probability=True))]),
            "SVM-Sigmoid": Pipeline([("scaler", StandardScaler()), ("clf", SVC(kernel="sigmoid", probability=True))]),
            "Torch-NN": Pipeline([("scaler", StandardScaler()), ("clf", TorchNN(hidden_dim=64, epochs=3))]),
        }
        # Meta learners Stage 2
        self.meta_learners_stage2 = {
            "LogReg": LogisticRegression(max_iter=2000),
            "RF": RandomForestClassifier(n_estimators=200, random_state=SEED),
            "HistGB": HistGradientBoostingClassifier(random_state=SEED),
        }
        # Meta learners Stage 3
        self.meta_learners_stage3 = {
            "LogReg": LogisticRegression(max_iter=2000),
            "GB": GradientBoostingClassifier(random_state=SEED),
            "RF": RandomForestClassifier(n_estimators=300, random_state=SEED),
        }
        self.best_meta2 = None
        self.best_meta3 = None
        self.stage2_ranked = []
        self.fitted_models = {}

    def fit(self, X, y):
        # Stage 1
        oof_preds = np.zeros((X.shape[0], len(self.base_models)))
        for j, (name, pipe) in enumerate(self.base_models.items()):
            preds = cross_val_predict(pipe, X, y, cv=5, method="predict_proba")[:, 1]
            oof_preds[:, j] = preds
            self.fitted_models[name] = pipe.fit(X, y)
        oof_meta = build_meta_features(oof_preds)
        stage1_pred = (oof_preds.mean(axis=1) >= 0.5).astype(int)
        residual_target = (y != stage1_pred).astype(int)
        # Stage 2
        best_score2 = -1
        for name, meta in self.meta_learners_stage2.items():
            preds_meta = cross_val_predict(meta, oof_meta, residual_target, cv=5, method="predict")
            score = accuracy_score(residual_target, preds_meta)
            fit_model = meta.fit(oof_meta, residual_target)
            self.stage2_ranked.append((name, fit_model, score))
            if score > best_score2:
                best_score2 = score
                self.best_meta2 = (name, fit_model)
        self.stage2_ranked.sort(key=lambda t: t[2], reverse=True)
        print(f"[Stage2] Selected meta-learner: {self.best_meta2[0]} | Acc={best_score2:.4f}")
        # Stage 3
        stage2_residual_pred = self.best_meta2[1].predict(oof_meta)
        residual_target2 = (residual_target != stage2_residual_pred).astype(int)
        best_r2 = -1e9
        for name, meta in self.meta_learners_stage3.items():
            prob_pred = cross_val_predict(meta, oof_meta, residual_target2, cv=5, method="predict_proba")[:, 1]
            score_r2 = r2_score(residual_target2.astype(float), prob_pred.astype(float))
            if score_r2 > best_r2:
                best_r2 = score_r2
                self.best_meta3 = (name, meta.fit(oof_meta, residual_target2))
        print(f"[Stage3] Selected meta-learner by R^2: {self.best_meta3[0]} | R^2={best_r2:.4f}")
        return self

    def _base_pred_matrix(self, X) -> np.ndarray:
        cols = []
        for name, model in self.fitted_models.items():
            p = model.predict_proba(X)[:, 1]
            cols.append(p)
        return np.vstack(cols).T

    def predict(self, X):
        base_preds = self._base_pred_matrix(X)
        avg_base = base_preds.mean(axis=1)
        base_meta = build_meta_features(base_preds)
        residual_prob2 = self.best_meta2[1].predict_proba(base_meta)[:, 1]
        baseline = (avg_base >= 0.5).astype(int)
        stage2_fix = np.where(residual_prob2 > 0.5, 1 - baseline, baseline)
        residual_prob3 = self.best_meta3[1].predict_proba(base_meta)[:, 1]
        return np.where(residual_prob3 > 0.5, 1 - stage2_fix, stage2_fix)

    def decision_function(self, X):
        base_preds = self._base_pred_matrix(X)
        return base_preds.mean(axis=1)

# ----------------------------- #
# Main
# ----------------------------- #

def main():
    set_seed(SEED)
    train_labels = load_labels(TRAIN_CSV)
    train_pairs = load_pairs(TRAIN_DIR)
    test_pairs = load_pairs(TEST_DIR)

    id_to_train = {aid: (t1, t2) for (aid, t1, t2) in train_pairs}
    first_train_texts = [id_to_train[i][0] for i in train_labels["id"].values]
    second_train_texts = [id_to_train[i][1] for i in train_labels["id"].values]
    test_ids = [aid for (aid, _, _) in test_pairs]
    first_test_texts = [t1 for (_, t1, _) in test_pairs]
    second_test_texts = [t2 for (_, _, t2) in test_pairs]

    embedder = BertEmbedder(MODEL_NAME, device=DEVICE, max_len=MAX_LEN)

    def maybe_encode(texts, cache_path):
        if os.path.exists(cache_path):
            arr = np.load(cache_path)
            if arr.ndim == 2:
                return arr
        arr = embedder.encode(texts, batch_size=BATCH_SIZE)
        np.save(cache_path, arr)
        return arr

    first_train_emb = maybe_encode(first_train_texts, TRAIN_EMB_1_NPY)
    second_train_emb = maybe_encode(second_train_texts, TRAIN_EMB_2_NPY)
    first_test_emb = maybe_encode(first_test_texts, TEST_EMB_1_NPY)
    second_test_emb = maybe_encode(second_test_texts, TEST_EMB_2_NPY)

    X, y = build_all_embeddings(first_train_emb, second_train_emb, train_labels)

    clf = ResidualStackingClassifier().fit(X, y)
    s1, s2 = clf.decision_function(first_test_emb), clf.decision_function(second_test_emb)
    preds = np.where(s1 > s2, 1, 2)

    pd.DataFrame({"id": test_ids, "real_text_id": preds}).to_csv("submission.csv", index=False)
    print("[OK] Saved submission.csv")

if __name__ == "__main__":
    main()


2025-09-23 02:30:36.665507: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758594636.928670      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758594637.004520      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered

stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/usr/local/lib/python3.11/dist-packages/numba_cuda/numba/cuda/cudadrv/driver.py", line 314, in __getattr__
    raise CudaSupportError("Error at driver init: \n%s:" %
numba.cuda.cudadrv.error.CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting env

vocab.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

[Stage2] Selected meta-learner: LogReg | Acc=0.8053
[Stage3] Selected meta-learner by R^2: LogReg | R^2=0.0346
[OK] Saved submission.csv
