# Ensemble Model

In [1]:
import json
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

import json
import random
from collections import defaultdict, Counter
import itertools

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
import torch.nn.functional as F

import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import List


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
class MLPClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, dropout=0.5):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x):
        return self.model(x).squeeze()

In [3]:
class CNNTextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, num_filters=100, filter_sizes=(3, 4, 5), dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=num_filters,
                      kernel_size=fs)
            for fs in filter_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(filter_sizes), 1)

    def forward(self, x):
        x = self.embedding(x)           # (batch_size, seq_len, emb_dim)
        x = x.permute(0, 2, 1)          # (batch_size, emb_dim, seq_len)
        convs = [F.relu(conv(x)) for conv in self.convs]  # list of (batch_size, num_filters, *)
        pooled = [F.max_pool1d(c, kernel_size=c.size(2)).squeeze(2) for c in convs]
        cat = torch.cat(pooled, dim=1)  # (batch_size, num_filters * len(filter_sizes))
        dropped = self.dropout(cat)
        logits = self.fc(dropped)
        return logits.squeeze()

In [4]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x):
        embeds = self.embedding(x)
        _, (h_n, _) = self.lstm(embeds)
        forward_final = h_n[-2]
        backward_final = h_n[-1]
        last_hidden = torch.cat((forward_final, backward_final), dim=1)
        logits = self.fc(self.dropout(last_hidden))
        return logits.squeeze()

In [5]:
class MetaNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(9, 16),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze(1)

In [6]:
def tokens_to_text(token_lists: List[List[int]]) -> List[str]:
    return [" ".join(map(str, tokens)) for tokens in token_lists]

In [7]:
def load_json_lines(path):
    texts, labels = [], []
    with open(path, 'r') as f:
        for line in f:
            ex = json.loads(line)
            texts.append(torch.tensor(ex['text'], dtype=torch.long))
            labels.append(ex['label'])
    return texts, labels

def load_test_json(path):
    texts = []
    with open(path, 'r') as f:
        for line in f:
            ex = json.loads(line)
            texts.append(torch.tensor(ex['text'], dtype=torch.long))
    return texts


In [8]:
def truncate_and_pad_sequences(sequences: List[torch.Tensor], max_len: int, padding_value: int = 0) -> torch.Tensor:
    """Truncate and pad a list of token sequences to the same max length."""
    truncated = [seq[:max_len] for seq in sequences]
    return pad_sequence(truncated, batch_first=True, padding_value=padding_value)

def prepare_tensor_sequences(raw_seqs: List[List[int]], max_len: int) -> torch.Tensor:
    token_tensors = [torch.tensor(seq, dtype=torch.long) for seq in raw_seqs]
    return truncate_and_pad_sequences(token_tensors, max_len)

In [9]:
def compute_embed_stats(token_lists: List[List[int]], embedding_matrix: np.ndarray) -> np.ndarray:
    stats = []
    for tokens in token_lists:
        embedded = np.array([embedding_matrix[t] for t in tokens if t < len(embedding_matrix)])
        if embedded.size == 0:
            mean = np.zeros(embedding_matrix.shape[1])
            std = np.zeros(embedding_matrix.shape[1])
        else:
            mean = embedded.mean(axis=0)
            std = embedded.std(axis=0)
        stats.append(np.concatenate([mean, std]))
    return np.vstack(stats)
from typing import List

In [10]:
def create_weighted_sampler(domains, labels):
    from collections import defaultdict
    from torch.utils.data import WeightedRandomSampler

    # Group by (domain, label)
    group_to_indices = defaultdict(list)
    for i, (d, l) in enumerate(zip(domains, labels)):
        group_to_indices[(d, l)].append(i)

    # Find the max group size
    max_group_size = max(len(v) for v in group_to_indices.values())

    # Create weights so each group contributes equally
    weights = [0] * len(labels)
    for group, indices in group_to_indices.items():
        group_weight = max_group_size / len(indices)
        for i in indices:
            weights[i] = group_weight

    sample_weights = torch.DoubleTensor(weights)
    sampler = WeightedRandomSampler(sample_weights, len(labels), replacement=True)
    return sampler

In [11]:
def stratified_train_val_split(texts, labels, domains, val_size_per_group=60, random_state=42):
    random.seed(random_state)
    from collections import defaultdict
    buckets = defaultdict(list)

    for x, y, d in zip(texts, labels, domains):
        buckets[(d, y)].append((x, y, d))

    train, val = [], []
    for key in buckets:
        group = buckets[key]
        random.shuffle(group)
        n_val = min(val_size_per_group, len(group))
        val.extend(group[:n_val])
        train.extend(group[n_val:])

    random.shuffle(train)
    random.shuffle(val)
    tx, ty, td = zip(*train)
    vx, vy, vd = zip(*val)
    return list(tx), list(ty), list(td), list(vx), list(vy), list(vd)

In [12]:
def stratified_kfold_by_domain_label(texts, labels, domains, n_splits=5, random_state=42):
    """
    Custom stratified K-Fold by (domain, label) pair.
    Returns list of (train_indices, val_indices) for each fold.
    """
    rng = np.random.default_rng(random_state)
    buckets = defaultdict(list)

    for idx, (d, l) in enumerate(zip(domains, labels)):
        buckets[(d, l)].append(idx)

    for key in buckets:
        rng.shuffle(buckets[key])

    folds = [[] for _ in range(n_splits)]
    for key, idxs in buckets.items():
        for i, idx in enumerate(idxs):
            folds[i % n_splits].append(idx)

    fold_indices = []
    for i in range(n_splits):
        val_idx = folds[i]
        train_idx = [idx for j in range(n_splits) if j != i for idx in folds[j]]
        fold_indices.append((train_idx, val_idx))

    return fold_indices

In [13]:
def train(model, train_loader, epochs, lr, device="cpu"):
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    pos_weight = torch.tensor([
        sum(1 for y in train_loader.dataset.tensors[1] if y == 0) /
        sum(1 for y in train_loader.dataset.tensors[1] if y == 1)
    ], device=device)
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch}", leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            logits = model(X_batch)
            loss = criterion(logits, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch} — Train Loss: {total_loss / len(train_loader):.4f}")

In [14]:
def predict_with_confidence(model, dataloader, device="cpu"):
    model.eval()
    preds, probs = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting", leave=False):
            if len(batch) == 2:
                X_batch, _ = batch  # Ignore labels
            else:
                X_batch = batch[0]
            X_batch = X_batch.to(device)
            logits = model(X_batch)
            batch_probs = torch.sigmoid(logits).squeeze()

            if batch_probs.ndim == 0:
                batch_probs = batch_probs.unsqueeze(0)

            batch_preds = (batch_probs >= 0.5).int().tolist()
            preds.extend(batch_preds)
            probs.extend(batch_probs.cpu().tolist())

    return preds, probs

In [16]:
# === Step 0: Load and Split Data ===
d1_texts, d1_labels = load_json_lines("data/domain1_train_data.json")
d2_texts, d2_labels = load_json_lines("data/domain2_train_data.json")
texts = d1_texts + d2_texts
labels = d1_labels + d2_labels
domains = [0] * len(d1_labels) + [1] * len(d2_labels)
MAX_LEN = 2048
num_folds = 10

test_texts_raw = load_test_json("data/test_data.json")
test_texts = prepare_tensor_sequences(test_texts_raw, MAX_LEN)

test_dataset_cnn = TensorDataset(test_texts)
test_loader_cnn = DataLoader(test_dataset_cnn, batch_size=32)

# === Step 6: Generate Out-of-Fold Predictions for Meta-Classifier ===
meta_inputs = []
meta_targets = []
bilstm_val_preds_all = []
cnn_val_preds_all = []
mlp_val_preds_all = []

fold_indices = stratified_kfold_by_domain_label(texts, labels, domains, n_splits=10)
for fold, (train_idx, val_idx) in enumerate(fold_indices):
    print(f"[Meta Training] Fold {fold + 1}/{num_folds}")
    fold_train_texts = [texts[i] for i in train_idx]
    fold_val_texts = [texts[i] for i in val_idx]
    fold_train_labels = [labels[i] for i in train_idx]
    fold_val_labels = [labels[i] for i in val_idx]

    # Prepare padded CNN input
    fold_train_padded = prepare_tensor_sequences(fold_train_texts, MAX_LEN)
    fold_val_padded = prepare_tensor_sequences(fold_val_texts, MAX_LEN)

    # CNN
    cnn_model = CNNTextClassifier(
        vocab_size=17120,
        embedding_dim=128,
        num_filters=50,
        filter_sizes=(2, 3, 4),
        dropout=0.5
    ).to(device)
    fold_train_domains = [domains[i] for i in train_idx]
    cnn_loader = DataLoader(
        TensorDataset(fold_train_padded, torch.tensor(fold_train_labels, dtype=torch.float32)),
        batch_size=32,
        sampler=create_weighted_sampler(fold_train_domains, fold_train_labels)
    )
    train(cnn_model, cnn_loader, epochs=4, lr=0.001, device=device)

    val_loader_cnn = DataLoader(TensorDataset(fold_val_padded), batch_size=32)
    cnn_probs = np.array(predict_with_confidence(cnn_model, val_loader_cnn, device=device)[1])

    # BiLSTM
    bilstm_model = BiLSTMClassifier(
        vocab_size=17120,
        hidden_dim=128,
        num_layers=1,
        dropout=0.5
    ).to(device)
    bilstm_loader = DataLoader(
        TensorDataset(fold_train_padded, torch.tensor(fold_train_labels, dtype=torch.float32)),
        batch_size=32,
        sampler=create_weighted_sampler(fold_train_domains, fold_train_labels)
    )
    train(bilstm_model, bilstm_loader, epochs=5, lr=0.0005, device=device)

    val_loader_bilstm = DataLoader(TensorDataset(fold_val_padded), batch_size=32)
    bilstm_probs = np.array(predict_with_confidence(bilstm_model, val_loader_bilstm, device=device)[1])
    
    #MLP
    fold_val_strings = tokens_to_text(fold_val_texts)
    fold_train_strings = tokens_to_text(fold_train_texts)

    tfidf = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf.fit_transform(fold_train_strings).toarray()
    X_val_tfidf = tfidf.transform(fold_val_strings).toarray()

    svd = TruncatedSVD(n_components=100, random_state=42)
    X_train_svd = svd.fit_transform(X_train_tfidf)
    X_val_svd = svd.transform(X_val_tfidf)

    X_train_raw = np.hstack([X_train_svd])
    X_val_raw = np.hstack([X_val_svd])

    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train_raw)
    X_val_std = scaler.transform(X_val_raw)

    mlp_model = MLPClassifier(input_dim=X_train_std.shape[1], hidden_dim=256, dropout=0.5).to(device)
    train_loader_mlp = DataLoader(
        TensorDataset(torch.tensor(X_train_std, dtype=torch.float32), torch.tensor(fold_train_labels, dtype=torch.float32)),
        batch_size=64,
        sampler=create_weighted_sampler(fold_train_domains, fold_train_labels)
    )
    train(mlp_model, train_loader_mlp, epochs=6, lr=1e-3, device=device)

    val_loader_mlp = DataLoader(TensorDataset(torch.tensor(X_val_std, dtype=torch.float32)), batch_size=64)
    mlp_probs = np.array(predict_with_confidence(mlp_model, val_loader_mlp, device=device)[1])

    fold_meta = np.vstack([
        cnn_probs,
        (cnn_probs >= 0.5).astype(float),
        np.abs(cnn_probs - 0.5),
        mlp_probs,
        (mlp_probs >= 0.5).astype(float),
        np.abs(mlp_probs - 0.5),
        bilstm_probs,
        (bilstm_probs >= 0.5).astype(float),
        np.abs(bilstm_probs - 0.5)
    ]).T
    bilstm_val_preds_all.append((bilstm_probs >= 0.5).astype(int))
    cnn_val_preds_all.append((cnn_probs >= 0.5).astype(int))
    mlp_val_preds_all.append((mlp_probs >= 0.5).astype(int))
    meta_inputs.append(fold_meta)
    meta_targets.extend(fold_val_labels)

X_meta_val = np.vstack(meta_inputs)
y_meta_val = np.array(meta_targets)

  token_tensors = [torch.tensor(seq, dtype=torch.long) for seq in raw_seqs]


[Meta Training] Fold 1/10


                                                          

Epoch 1 — Train Loss: 0.1696


                                                          

Epoch 2 — Train Loss: 0.0894


                                                          

Epoch 3 — Train Loss: 0.0602


                                                          

Epoch 4 — Train Loss: 0.0452


                                                           

Epoch 1 — Train Loss: 0.2115


                                                          

Epoch 2 — Train Loss: 0.1148


                                                          

Epoch 3 — Train Loss: 0.0607


                                                          

Epoch 4 — Train Loss: 0.0297


                                                          

Epoch 5 — Train Loss: 0.0214


                                                           

Epoch 1 — Train Loss: 0.1782


                                               

Epoch 2 — Train Loss: 0.0998


                                               

Epoch 3 — Train Loss: 0.0800


                                               

Epoch 4 — Train Loss: 0.0638


                                               

Epoch 5 — Train Loss: 0.0531


                                               

Epoch 6 — Train Loss: 0.0440


  token_tensors = [torch.tensor(seq, dtype=torch.long) for seq in raw_seqs]


[Meta Training] Fold 2/10


                                                          

Epoch 1 — Train Loss: 0.1956


                                                          

Epoch 2 — Train Loss: 0.1008


                                                          

Epoch 3 — Train Loss: 0.0696


                                                          

Epoch 4 — Train Loss: 0.0476


                                                           

Epoch 1 — Train Loss: 0.1996


                                                          

Epoch 2 — Train Loss: 0.0931


                                                          

Epoch 3 — Train Loss: 0.0558


                                                          

Epoch 4 — Train Loss: 0.0352


                                                          

Epoch 5 — Train Loss: 0.0213


                                                           

Epoch 1 — Train Loss: 0.1846


                                               

Epoch 2 — Train Loss: 0.1043


                                               

Epoch 3 — Train Loss: 0.0799


                                               

Epoch 4 — Train Loss: 0.0698


                                               

Epoch 5 — Train Loss: 0.0564


                                               

Epoch 6 — Train Loss: 0.0483


  token_tensors = [torch.tensor(seq, dtype=torch.long) for seq in raw_seqs]


[Meta Training] Fold 3/10


                                                          

Epoch 1 — Train Loss: 0.1617


                                                          

Epoch 2 — Train Loss: 0.0888


                                                          

Epoch 3 — Train Loss: 0.0601


                                                          

Epoch 4 — Train Loss: 0.0454


                                                           

Epoch 1 — Train Loss: 0.2052


                                                          

Epoch 2 — Train Loss: 0.1121


                                                          

Epoch 3 — Train Loss: 0.0624


                                                          

Epoch 4 — Train Loss: 0.0375


                                                          

Epoch 5 — Train Loss: 0.0213


                                                           

Epoch 1 — Train Loss: 0.1784


                                               

Epoch 2 — Train Loss: 0.0968


                                               

Epoch 3 — Train Loss: 0.0745


                                               

Epoch 4 — Train Loss: 0.0589


                                               

Epoch 5 — Train Loss: 0.0529


                                               

Epoch 6 — Train Loss: 0.0407


  token_tensors = [torch.tensor(seq, dtype=torch.long) for seq in raw_seqs]


[Meta Training] Fold 4/10


                                                          

Epoch 1 — Train Loss: 0.1835


                                                          

Epoch 2 — Train Loss: 0.0965


                                                          

Epoch 3 — Train Loss: 0.0633


                                                          

Epoch 4 — Train Loss: 0.0447


                                                           

Epoch 1 — Train Loss: 0.2022


                                                          

Epoch 2 — Train Loss: 0.1027


                                                          

Epoch 3 — Train Loss: 0.0542


                                                          

Epoch 4 — Train Loss: 0.0298


                                                          

Epoch 5 — Train Loss: 0.0234


                                                           

Epoch 1 — Train Loss: 0.1782


                                               

Epoch 2 — Train Loss: 0.1030


                                               

Epoch 3 — Train Loss: 0.0773


                                               

Epoch 4 — Train Loss: 0.0599


                                               

Epoch 5 — Train Loss: 0.0520


                                               

Epoch 6 — Train Loss: 0.0426


  token_tensors = [torch.tensor(seq, dtype=torch.long) for seq in raw_seqs]


[Meta Training] Fold 5/10


                                                          

Epoch 1 — Train Loss: 0.1786


                                                          

Epoch 2 — Train Loss: 0.0955


                                                          

Epoch 3 — Train Loss: 0.0659


                                                          

Epoch 4 — Train Loss: 0.0495


                                                           

Epoch 1 — Train Loss: 0.1937


                                                          

Epoch 2 — Train Loss: 0.0833


                                                          

Epoch 3 — Train Loss: 0.0417


                                                          

Epoch 4 — Train Loss: 0.0235


                                                          

Epoch 5 — Train Loss: 0.0093


                                                           

Epoch 1 — Train Loss: 0.1807


                                               

Epoch 2 — Train Loss: 0.1020


                                               

Epoch 3 — Train Loss: 0.0765


                                               

Epoch 4 — Train Loss: 0.0609


                                               

Epoch 5 — Train Loss: 0.0530


                                               

Epoch 6 — Train Loss: 0.0454


  token_tensors = [torch.tensor(seq, dtype=torch.long) for seq in raw_seqs]


[Meta Training] Fold 6/10


                                                          

Epoch 1 — Train Loss: 0.1807


                                                          

Epoch 2 — Train Loss: 0.0932


                                                          

Epoch 3 — Train Loss: 0.0673


                                                          

Epoch 4 — Train Loss: 0.0448


                                                           

Epoch 1 — Train Loss: 0.2024


                                                          

Epoch 2 — Train Loss: 0.1037


                                                          

Epoch 3 — Train Loss: 0.0603


                                                          

Epoch 4 — Train Loss: 0.0300


                                                          

Epoch 5 — Train Loss: 0.0314


                                                           

Epoch 1 — Train Loss: 0.1835


                                               

Epoch 2 — Train Loss: 0.0994


                                               

Epoch 3 — Train Loss: 0.0762


                                               

Epoch 4 — Train Loss: 0.0628


                                               

Epoch 5 — Train Loss: 0.0525


                                               

Epoch 6 — Train Loss: 0.0450


  token_tensors = [torch.tensor(seq, dtype=torch.long) for seq in raw_seqs]


[Meta Training] Fold 7/10


                                                          

Epoch 1 — Train Loss: 0.1729


                                                          

Epoch 2 — Train Loss: 0.0946


                                                          

Epoch 3 — Train Loss: 0.0626


                                                          

Epoch 4 — Train Loss: 0.0506


                                                           

Epoch 1 — Train Loss: 0.1923


                                                          

Epoch 2 — Train Loss: 0.1112


                                                          

Epoch 3 — Train Loss: 0.0579


                                                          

Epoch 4 — Train Loss: 0.0321


                                                          

Epoch 5 — Train Loss: 0.0187


                                                           

Epoch 1 — Train Loss: 0.1848


                                               

Epoch 2 — Train Loss: 0.1050


                                               

Epoch 3 — Train Loss: 0.0734


                                               

Epoch 4 — Train Loss: 0.0599


                                               

Epoch 5 — Train Loss: 0.0534


                                               

Epoch 6 — Train Loss: 0.0443


  token_tensors = [torch.tensor(seq, dtype=torch.long) for seq in raw_seqs]


[Meta Training] Fold 8/10


                                                          

Epoch 1 — Train Loss: 0.1546


                                                          

Epoch 2 — Train Loss: 0.0843


                                                          

Epoch 3 — Train Loss: 0.0566


                                                          

Epoch 4 — Train Loss: 0.0434


                                                           

Epoch 1 — Train Loss: 0.2023


                                                          

Epoch 2 — Train Loss: 0.1114


                                                          

Epoch 3 — Train Loss: 0.0557


                                                          

Epoch 4 — Train Loss: 0.0348


                                                          

Epoch 5 — Train Loss: 0.0220


                                                           

Epoch 1 — Train Loss: 0.1736


                                               

Epoch 2 — Train Loss: 0.1018


                                               

Epoch 3 — Train Loss: 0.0757


                                               

Epoch 4 — Train Loss: 0.0631


                                               

Epoch 5 — Train Loss: 0.0539


                                               

Epoch 6 — Train Loss: 0.0445


  token_tensors = [torch.tensor(seq, dtype=torch.long) for seq in raw_seqs]


[Meta Training] Fold 9/10


                                                          

Epoch 1 — Train Loss: 0.1819


                                                          

Epoch 2 — Train Loss: 0.0967


                                                          

Epoch 3 — Train Loss: 0.0668


                                                          

Epoch 4 — Train Loss: 0.0465


                                                           

Epoch 1 — Train Loss: 0.2078


                                                          

Epoch 2 — Train Loss: 0.1089


                                                          

Epoch 3 — Train Loss: 0.0561


                                                          

Epoch 4 — Train Loss: 0.0413


                                                          

Epoch 5 — Train Loss: 0.0225


                                                           

Epoch 1 — Train Loss: 0.1749


                                               

Epoch 2 — Train Loss: 0.0982


                                               

Epoch 3 — Train Loss: 0.0755


                                               

Epoch 4 — Train Loss: 0.0621


                                               

Epoch 5 — Train Loss: 0.0538


                                               

Epoch 6 — Train Loss: 0.0427


  token_tensors = [torch.tensor(seq, dtype=torch.long) for seq in raw_seqs]


[Meta Training] Fold 10/10


                                                          

Epoch 1 — Train Loss: 0.2102


                                                          

Epoch 2 — Train Loss: 0.1126


                                                          

Epoch 3 — Train Loss: 0.0736


                                                          

Epoch 4 — Train Loss: 0.0526


                                                           

Epoch 1 — Train Loss: 0.2102


                                                          

Epoch 2 — Train Loss: 0.1054


                                                          

Epoch 3 — Train Loss: 0.0535


                                                          

Epoch 4 — Train Loss: 0.0254


                                                          

Epoch 5 — Train Loss: 0.0309


                                                           

Epoch 1 — Train Loss: 0.1805


                                               

Epoch 2 — Train Loss: 0.1020


                                               

Epoch 3 — Train Loss: 0.0764


                                               

Epoch 4 — Train Loss: 0.0653


                                               

Epoch 5 — Train Loss: 0.0558


                                               

Epoch 6 — Train Loss: 0.0473


                                                  

In [17]:
# === Step 7a: Train and Validate MetaNN ===
from sklearn.metrics import accuracy_score

# Create stratified split for meta-training
meta_domains = [domains[i] for _, val_idx in fold_indices for i in val_idx]
meta_texts = list(X_meta_val)
meta_labels = list(y_meta_val)

meta_train_texts, meta_train_labels, meta_train_domains, meta_val_texts, meta_val_labels, meta_val_domains = stratified_train_val_split(
    meta_texts, meta_labels, meta_domains, val_size_per_group=120, random_state=42
)
meta_val_domains = np.array(meta_val_domains)

X_meta_train_tensor = torch.tensor(meta_train_texts, dtype=torch.float32).to(device)
y_meta_train_tensor = torch.tensor(meta_train_labels, dtype=torch.float32).to(device)
X_meta_valid_tensor = torch.tensor(meta_val_texts, dtype=torch.float32).to(device)
y_meta_valid_tensor = torch.tensor(meta_val_labels, dtype=torch.float32).to(device)

# Define MetaNN
meta_nn = MetaNN().to(device)
optimizer_meta = torch.optim.Adam(meta_nn.parameters(), lr=1e-3)
pos_weight = torch.tensor([(y_meta_train_tensor == 0).sum() / (y_meta_train_tensor == 1).sum()], device=device)
criterion_meta = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

#meta_sampler = create_weighted_sampler(meta_train_domains, meta_train_labels)
#meta_loader = DataLoader(
 #   TensorDataset(X_meta_train_tensor, y_meta_train_tensor),
  #  batch_size=64,
   # sampler=meta_sampler
#)

meta_loader = DataLoader(TensorDataset(X_meta_train_tensor, y_meta_train_tensor), batch_size=64, shuffle=True)

# Early stopping
best_val_acc = 0
patience = 5
patience_counter = 0

for epoch in range(1, 51):
    meta_nn.train()
    total_loss = 0
    for X_batch, y_batch in meta_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer_meta.zero_grad()
        logits = meta_nn(X_batch)
        loss = criterion_meta(logits, y_batch)
        loss.backward()
        optimizer_meta.step()
        total_loss += loss.item()

    meta_nn.eval()
    with torch.no_grad():
        val_logits = meta_nn(X_meta_valid_tensor)
        val_probs = torch.sigmoid(val_logits).cpu().numpy()
        val_preds = (val_probs >= 0.5).astype(int)
        val_acc = accuracy_score(y_meta_valid_tensor.cpu().numpy(), val_preds)

    print(f"[MetaNN] Epoch {epoch} — Train Loss: {total_loss / len(meta_loader):.4f} — Val Acc: {val_acc:.4f}")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_state = meta_nn.state_dict()
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping.")
            break

meta_nn.load_state_dict(best_state)

# Evaluate MetaNN against early stopping validation split
meta_nn.eval()
with torch.no_grad():
    val_logits = meta_nn(X_meta_valid_tensor)
    val_probs = torch.sigmoid(val_logits).cpu().numpy()
    val_preds = (val_probs >= 0.5).astype(int)

meta_val_acc = accuracy_score(np.array(meta_val_labels), val_preds)
print(f"\nMetaNN Overall Validation Accuracy: {meta_val_acc:.4f}")

print("\nMetaNN Validation Accuracy by Domain:")
for domain_id in [0, 1]:
    mask = (meta_val_domains == domain_id)
    domain_acc = accuracy_score(np.array(meta_val_labels)[mask], val_preds[mask])
    print(f"  Domain {domain_id}: {domain_acc:.4f}")

# Evaluate base models by domain using full out-of-fold meta-validation set
cnn_val_preds = np.concatenate(cnn_val_preds_all)
mlp_val_preds = np.concatenate(mlp_val_preds_all)
bilstm_val_preds = np.concatenate(bilstm_val_preds_all)
full_meta_val_domains = np.array([domains[i] for _, val_idx in fold_indices for i in val_idx])
y_meta_val = np.array(meta_labels)

print("\nBase Model Overall Validation Accuracy:")
print("  CNN Accuracy:", accuracy_score(y_meta_val, cnn_val_preds))
print("  BiLSTM Accuracy:", accuracy_score(y_meta_val, bilstm_val_preds))
print("  MLP Accuracy:", accuracy_score(y_meta_val, mlp_val_preds))

print("\nBase Model Validation Accuracy by Domain:")
for domain_id in [0, 1]:
    mask = (full_meta_val_domains == domain_id)
    print(f"  Domain {domain_id}:")
    print("    CNN Accuracy:", accuracy_score(y_meta_val[mask], cnn_val_preds[mask]))
    print("    BiLSTM Accuracy:", accuracy_score(y_meta_val[mask], bilstm_val_preds[mask]))
    print("    MLP Accuracy:", accuracy_score(y_meta_val[mask], mlp_val_preds[mask]))

  X_meta_train_tensor = torch.tensor(meta_train_texts, dtype=torch.float32).to(device)


[MetaNN] Epoch 1 — Train Loss: 0.1189 — Val Acc: 0.8396
[MetaNN] Epoch 2 — Train Loss: 0.0886 — Val Acc: 0.8562
[MetaNN] Epoch 3 — Train Loss: 0.0638 — Val Acc: 0.9146
[MetaNN] Epoch 4 — Train Loss: 0.0532 — Val Acc: 0.9146
[MetaNN] Epoch 5 — Train Loss: 0.0475 — Val Acc: 0.9167
[MetaNN] Epoch 6 — Train Loss: 0.0463 — Val Acc: 0.9167
[MetaNN] Epoch 7 — Train Loss: 0.0474 — Val Acc: 0.9167
[MetaNN] Epoch 8 — Train Loss: 0.0466 — Val Acc: 0.9167
[MetaNN] Epoch 9 — Train Loss: 0.0451 — Val Acc: 0.9187
[MetaNN] Epoch 10 — Train Loss: 0.0468 — Val Acc: 0.9187
[MetaNN] Epoch 11 — Train Loss: 0.0455 — Val Acc: 0.9187
[MetaNN] Epoch 12 — Train Loss: 0.0485 — Val Acc: 0.9187
[MetaNN] Epoch 13 — Train Loss: 0.0471 — Val Acc: 0.9187
[MetaNN] Epoch 14 — Train Loss: 0.0470 — Val Acc: 0.9187
Early stopping.

MetaNN Overall Validation Accuracy: 0.9187

MetaNN Validation Accuracy by Domain:
  Domain 0: 0.9167
  Domain 1: 0.9208

Base Model Overall Validation Accuracy:
  CNN Accuracy: 0.873333333333333

In [18]:
# === Step 7: Retrain Models on Full Data for Final Ensemble ===

# === Train MetaNN on Full Meta-Training Set ===
meta_nn = MetaNN().to(device)
optimizer_meta = torch.optim.Adam(meta_nn.parameters(), lr=1e-3)

pos_weight = torch.tensor([
    (y_meta_val == 0).sum() / (y_meta_val == 1).sum()
], device=device)
criterion_meta = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

X_meta_val_tensor = torch.tensor(X_meta_val, dtype=torch.float32).to(device)
y_meta_val_tensor = torch.tensor(y_meta_val, dtype=torch.float32).to(device)

#meta_sampler = create_weighted_sampler(meta_domains, y_meta_val)
#meta_loader = DataLoader(
   # TensorDataset(X_meta_val_tensor, y_meta_val_tensor),
  #  batch_size=64,
 #   sampler=meta_sampler
#)

meta_loader = DataLoader(TensorDataset(X_meta_train_tensor, y_meta_train_tensor), batch_size=64, shuffle=True)


for epoch in range(1, 11):
    total_loss = 0.0
    meta_nn.train()
    for X_batch, y_batch in meta_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer_meta.zero_grad()
        logits = meta_nn(X_batch)
        loss = criterion_meta(logits, y_batch)
        loss.backward()
        optimizer_meta.step()
        total_loss += loss.item()
    print(f"[MetaNN] Epoch {epoch} — Train Loss: {total_loss / len(meta_loader):.4f}")

full_texts = texts
full_labels = labels
full_domains = domains

# === Full MLP Training ===
full_text_strings = tokens_to_text(full_texts)
X_full_tfidf = tfidf_vectorizer.fit_transform(full_text_strings).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_text_strings).toarray()

X_full_svd = svd.fit_transform(X_full_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

X_full_raw = np.hstack([X_full_svd])
X_test_raw = np.hstack([X_test_svd])

X_full_std = scaler.fit_transform(X_full_raw)
X_test_std = scaler.transform(X_test_raw)

X_full_tensor = torch.tensor(X_full_std, dtype=torch.float32)
y_full_tensor = torch.tensor(full_labels, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_std, dtype=torch.float32)

full_dataset_mlp = TensorDataset(X_full_tensor, y_full_tensor)
full_loader_mlp = DataLoader(full_dataset_mlp, batch_size=64, sampler=create_weighted_sampler(full_domains, full_labels))
test_loader_mlp = DataLoader(TensorDataset(X_test_tensor), batch_size=64)

mlp_model = MLPClassifier(input_dim=X_full_tensor.shape[1], hidden_dim=256, dropout=0.5).to(device)
train(mlp_model, full_loader_mlp, epochs=6, lr=1e-3, device=device)

# === Full BiLSTM Training ===
full_texts_padded = prepare_tensor_sequences(full_texts, MAX_LEN)
full_dataset_bilstm = TensorDataset(full_texts_padded, torch.tensor(full_labels, dtype=torch.float32))
full_loader_bilstm = DataLoader(full_dataset_bilstm, batch_size=32, sampler=create_weighted_sampler(full_domains, full_labels))

bilstm_model = BiLSTMClassifier(
    vocab_size=17120,
    hidden_dim=128,
    num_layers=1,
    dropout=0.5
).to(device)
train(bilstm_model, full_loader_bilstm, epochs=5, lr=0.0005, device=device)

# === Full CNN Training ===
full_texts_padded = prepare_tensor_sequences(full_texts, MAX_LEN)
full_dataset_cnn = TensorDataset(full_texts_padded, torch.tensor(full_labels, dtype=torch.float32))
full_loader_cnn = DataLoader(full_dataset_cnn, batch_size=32, sampler=create_weighted_sampler(full_domains, full_labels))

cnn_model = CNNTextClassifier(
    vocab_size=17120,
    embedding_dim=128,
    num_filters=50,
    filter_sizes=(2, 3, 4),
    dropout=0.5
).to(device)
train(cnn_model, full_loader_cnn, epochs=4, lr=0.001, device=device)

test_dataset_cnn = TensorDataset(test_texts)
test_loader_cnn = DataLoader(test_dataset_cnn, batch_size=32)

# === Step 8: Predict on Test and Use Meta-Classifier ===
bilstm_test_preds, bilstm_test_probs = predict_with_confidence(bilstm_model, test_loader_cnn, device=device)
mlp_test_preds, mlp_test_probs = predict_with_confidence(mlp_model, test_loader_mlp, device=device)
cnn_test_preds, cnn_test_probs = predict_with_confidence(cnn_model, test_loader_cnn, device=device)

cnn_test_probs = np.array(cnn_test_probs)
mlp_test_probs = np.array(mlp_test_probs)
bilstm_test_probs = np.array(bilstm_test_probs)

pd.DataFrame({
    "id": list(range(len(cnn_test_preds))),
    "label": cnn_test_preds
}).to_csv("cnn_submission.csv", index=False)

pd.DataFrame({
    "id": list(range(len(mlp_test_preds))),
    "label": mlp_test_preds
}).to_csv("mlp_submission.csv", index=False)

pd.DataFrame({
    "id": list(range(len(bilstm_test_preds))),
    "label": bilstm_test_preds
}).to_csv("bilstm_submission.csv", index=False)

X_meta_test = np.vstack([
        cnn_test_probs,
        (cnn_test_probs >= 0.5).astype(float),
        np.abs(cnn_test_probs - 0.5),
        mlp_test_probs,
        (mlp_test_probs >= 0.5).astype(float),
        np.abs(mlp_test_probs - 0.5),
        bilstm_test_probs,
        (bilstm_test_probs >= 0.5).astype(float),
        np.abs(bilstm_test_probs - 0.5)
    ]).T

X_meta_test_tensor = torch.tensor(X_meta_test, dtype=torch.float32).to(device)
meta_nn.eval()
with torch.no_grad():
    logits = meta_nn(X_meta_test_tensor)
    probs = torch.sigmoid(logits).cpu().numpy()
ensemble_probs = probs
ensemble_preds = (probs >= 0.5).astype(int)

results_df = pd.DataFrame({
    "id": list(range(len(ensemble_preds))),
    "label": ensemble_preds.astype(int),
    "confidence": ensemble_probs.flatten()
})

# === Step 9: Save Predictions ===
results_df.to_csv("ensemble_submission_final.csv", index=False)
results_df[["id", "label"]].to_csv("ensemble_labels_final.csv", index=False)
print("Final ensemble predictions saved.")


[MetaNN] Epoch 1 — Train Loss: 0.1514
[MetaNN] Epoch 2 — Train Loss: 0.1123
[MetaNN] Epoch 3 — Train Loss: 0.0859
[MetaNN] Epoch 4 — Train Loss: 0.0718
[MetaNN] Epoch 5 — Train Loss: 0.0638
[MetaNN] Epoch 6 — Train Loss: 0.0598
[MetaNN] Epoch 7 — Train Loss: 0.0586
[MetaNN] Epoch 8 — Train Loss: 0.0556
[MetaNN] Epoch 9 — Train Loss: 0.0556
[MetaNN] Epoch 10 — Train Loss: 0.0550


NameError: name 'tfidf_vectorizer' is not defined