# Ensemble Model

In [1]:
import json
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

import json
import random
from collections import defaultdict, Counter
import itertools

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, WeightedRandomSampler
import torch.nn.functional as F

import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import List


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
class MLPClassifier(nn.Module):
    """Single-hidden-layer perceptron producing one logit per input row.

    Layers: Linear(input_dim -> hidden_dim) -> ReLU -> Dropout ->
    Linear(hidden_dim -> 1). The output is a raw logit (no sigmoid).

    Args:
        input_dim: Number of features per input row.
        hidden_dim: Width of the hidden layer.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, input_dim, hidden_dim=256, dropout=0.5):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
        )

    def forward(self, x):
        """Return logits with the trailing size-1 dimension removed.

        Only the last dimension is squeezed. A bare ``squeeze()`` would also
        drop the batch dimension for a batch of one, producing a 0-d tensor
        that no longer matches a ``(1,)``-shaped target in the loss.
        """
        return self.model(x).squeeze(-1)

In [3]:
class CNNTextClassifier(nn.Module):
    """1-D convolutional text classifier producing one logit per sequence.

    Token ids are embedded, passed through one Conv1d per filter size,
    max-pooled over the sequence axis, concatenated, passed through dropout
    and projected to a single raw logit (no sigmoid).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        num_filters: Output channels of every convolution.
        filter_sizes: Kernel widths, one convolution per entry.
        dropout: Dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, embedding_dim=128, num_filters=100, filter_sizes=(3, 4, 5), dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=num_filters,
                      kernel_size=fs)
            for fs in filter_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_filters * len(filter_sizes), 1)
        # Shortest sequence every convolution can process without error.
        self.min_seq_len = max(filter_sizes)

    def forward(self, x):
        """Return logits of shape ``(batch_size,)`` for ``(batch_size, seq_len)`` token ids."""
        # Conv1d raises when the input is shorter than its kernel, so right-pad
        # short batches with the padding index (0) up to the widest kernel.
        shortfall = self.min_seq_len - x.size(1)
        if shortfall > 0:
            x = torch.cat([x, x.new_zeros((x.size(0), shortfall))], dim=1)

        x = self.embedding(x)           # (batch_size, seq_len, emb_dim)
        x = x.permute(0, 2, 1)          # (batch_size, emb_dim, seq_len)
        convs = [F.relu(conv(x)) for conv in self.convs]  # list of (batch_size, num_filters, *)
        pooled = [F.max_pool1d(c, kernel_size=c.size(2)).squeeze(2) for c in convs]
        cat = torch.cat(pooled, dim=1)  # (batch_size, num_filters * len(filter_sizes))
        dropped = self.dropout(cat)
        logits = self.fc(dropped)       # (batch_size, 1)
        # Squeeze only the trailing dimension so a batch of one stays 1-D.
        return logits.squeeze(-1)

In [4]:
def tokens_to_text(token_lists: List[List[int]]) -> List[str]:
    """Render each token-id sequence as a space-separated string.

    Args:
        token_lists: Sequences of integer token ids. Each sequence may be a
            plain list or any object exposing ``tolist()`` (e.g. a tensor or
            NumPy array).

    Returns:
        One string per input sequence, e.g. ``[4, 15]`` -> ``"4 15"``.
    """
    texts = []
    for tokens in token_lists:
        # Iterating a tensor yields 0-d tensors whose str() is "tensor(5)",
        # not "5"; convert to plain Python ints first.
        if hasattr(tokens, "tolist"):
            tokens = tokens.tolist()
        texts.append(" ".join(str(tok) for tok in tokens))
    return texts

In [5]:
def load_json_lines(path):
    """Load labelled examples from a JSON-lines file.

    Each non-blank line must be a JSON object with a ``"text"`` entry (list of
    token ids) and a ``"label"`` entry.

    Args:
        path: Path of the JSON-lines file.

    Returns:
        ``(texts, labels)`` where ``texts`` is a list of 1-D ``torch.long``
        tensors and ``labels`` is the list of raw label values, in file order.
    """
    texts, labels = [], []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines would make json.loads raise.
                continue
            ex = json.loads(line)
            texts.append(torch.tensor(ex['text'], dtype=torch.long))
            labels.append(ex['label'])
    return texts, labels

def load_test_json(path):
    """Load unlabelled examples from a JSON-lines file.

    Each non-blank line must be a JSON object with a ``"text"`` entry (list of
    token ids).

    Args:
        path: Path of the JSON-lines file.

    Returns:
        List of 1-D ``torch.long`` tensors, one per example, in file order.
    """
    texts = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines would make json.loads raise.
                continue
            ex = json.loads(line)
            texts.append(torch.tensor(ex['text'], dtype=torch.long))
    return texts


In [6]:
def compute_embed_stats(token_lists: List[List[int]], embedding_matrix: np.ndarray) -> np.ndarray:
    """Summarise each token sequence by the mean and std of its embeddings.

    Args:
        token_lists: Sequences of integer token ids (lists, arrays or CPU
            tensors).
        embedding_matrix: 2-D array of shape ``(vocab_size, embed_dim)``.

    Returns:
        Array of shape ``(len(token_lists), 2 * embed_dim)``; each row is the
        per-dimension mean followed by the per-dimension standard deviation
        of the sequence's embeddings. Ids outside ``[0, vocab_size)`` are
        ignored, and a sequence with no valid ids yields an all-zero row.
    """
    vocab_size, embed_dim = embedding_matrix.shape
    stats = []
    for tokens in token_lists:
        ids = np.asarray(tokens, dtype=np.int64)
        # Drop out-of-vocabulary ids. Negative ids are dropped too, since they
        # would otherwise wrap around and pick rows from the end of the table.
        ids = ids[(ids >= 0) & (ids < vocab_size)]
        if ids.size == 0:
            mean = np.zeros(embed_dim)
            std = np.zeros(embed_dim)
        else:
            embedded = embedding_matrix[ids]  # one vectorised lookup per sequence
            mean = embedded.mean(axis=0)
            std = embedded.std(axis=0)
        stats.append(np.concatenate([mean, std]))
    if not stats:
        # np.vstack rejects an empty list; return a correctly shaped empty array.
        return np.empty((0, 2 * embed_dim))
    return np.vstack(stats)
from typing import List

In [7]:
def create_weighted_sampler(domains, labels):
    """Build a sampler that draws every (domain, label) group equally often.

    Each example is weighted by the inverse of the size of its
    (domain, label) group. The sampler draws ``len(labels)`` examples per
    epoch, with replacement.

    Args:
        domains: Domain id of every example.
        labels: Label of every example, aligned with ``domains``.

    Returns:
        A ``WeightedRandomSampler`` over the examples.
    """
    from collections import Counter
    from torch.utils.data import WeightedRandomSampler

    group_of_example = list(zip(domains, labels))
    group_sizes = Counter(group_of_example)
    per_example = torch.DoubleTensor(
        [1.0 / group_sizes[group] for group in group_of_example]
    )
    return WeightedRandomSampler(per_example, len(per_example), replacement=True)

In [8]:
def stratified_train_val_split(texts, labels, domains, val_size_per_group=60, random_state=42):
    """Split examples into train/validation sets, stratified by (domain, label).

    Examples are bucketed by their (domain, label) pair. From every bucket up
    to ``val_size_per_group`` randomly chosen examples go to validation and the
    remainder to training; both splits are then shuffled.

    Note:
        This seeds Python's global ``random`` module with ``random_state``.

    Args:
        texts: Example inputs.
        labels: Label of every example, aligned with ``texts``.
        domains: Domain id of every example, aligned with ``texts``.
        val_size_per_group: Maximum number of validation examples taken from
            each (domain, label) bucket.
        random_state: Seed for the shuffles.

    Returns:
        ``(train_texts, train_labels, train_domains,
        val_texts, val_labels, val_domains)`` as lists. A split that receives
        no examples is returned as three empty lists.
    """
    random.seed(random_state)
    from collections import defaultdict
    buckets = defaultdict(list)

    for x, y, d in zip(texts, labels, domains):
        buckets[(d, y)].append((x, y, d))

    train, val = [], []
    for group in buckets.values():
        random.shuffle(group)
        n_val = min(val_size_per_group, len(group))
        val.extend(group[:n_val])
        train.extend(group[n_val:])

    random.shuffle(train)
    random.shuffle(val)

    # zip(*rows) yields nothing for an empty split, which would break the
    # three-way unpacking; fall back to three empty columns instead.
    tx, ty, td = (list(col) for col in zip(*train)) if train else ([], [], [])
    vx, vy, vd = (list(col) for col in zip(*val)) if val else ([], [], [])
    return tx, ty, td, vx, vy, vd

In [9]:
def train(model, train_loader, epochs, lr, device="cpu"):
    """Train a binary classifier with Adam and class-weighted BCE-with-logits.

    The positive-class weight is ``#negatives / #positives``, counted over the
    label tensor of ``train_loader.dataset`` (which must therefore expose
    ``.tensors`` with labels at index 1, as ``TensorDataset`` does). The
    average batch loss is printed after every epoch.

    Args:
        model: Module mapping a feature batch to one logit per example.
        train_loader: DataLoader yielding ``(features, labels)`` batches.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        device: Device the model and batches are moved to.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Count the classes with tensor ops instead of iterating element by element.
    all_labels = train_loader.dataset.tensors[1]
    n_pos = int((all_labels == 1).sum())
    n_neg = int((all_labels == 0).sum())
    # Without positives the ratio is undefined; fall back to a neutral weight.
    ratio = n_neg / n_pos if n_pos > 0 else 1.0
    pos_weight = torch.tensor([ratio], device=device)
    criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0.0
        for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch}", leave=False):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            # Flatten both sides so a 0-d or (batch, 1) model output still
            # matches the target shape the loss requires.
            logits = model(X_batch).reshape(-1)
            loss = criterion(logits, y_batch.reshape(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # max(..., 1) avoids dividing by zero when the loader has no batches.
        print(f"Epoch {epoch} — Train Loss: {total_loss / max(len(train_loader), 1):.4f}")

In [10]:
def predict_with_confidence(model, dataloader, device="cpu"):
    """Run the model over a dataloader and collect labels and probabilities.

    Args:
        model: Module mapping a feature batch to logits.
        dataloader: Yields batches whose first element is the feature tensor;
            a second element (labels), if present, is ignored.
        device: Device each feature batch is moved to before the forward pass.

    Returns:
        ``(preds, probs)``: ``probs`` holds the sigmoid of each logit and
        ``preds`` is 1 where the probability is at least 0.5, else 0.
    """
    model.eval()
    all_preds = []
    all_probs = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting", leave=False):
            if len(batch) == 2:
                inputs, _ = batch  # labels are not needed for inference
            else:
                inputs = batch[0]

            scores = torch.sigmoid(model(inputs.to(device))).squeeze()
            # A single-example batch squeezes down to a 0-d tensor; restore
            # the batch axis so tolist() returns a list.
            if scores.ndim == 0:
                scores = scores.unsqueeze(0)

            all_probs.extend(scores.cpu().tolist())
            all_preds.extend((scores >= 0.5).int().tolist())

    return all_preds, all_probs

In [17]:
# === Step 0: Load and Split Data ===
d1_texts, d1_labels = load_json_lines("data/domain1_train_data.json")
d2_texts, d2_labels = load_json_lines("data/domain2_train_data.json")
texts = d1_texts + d2_texts
labels = d1_labels + d2_labels
# Domain id per example: 0 for domain1 rows, 1 for domain2 rows.
domains = [0] * len(d1_labels) + [1] * len(d2_labels)

test_texts_raw = load_test_json("data/test_data.json")
test_texts = pad_sequence(test_texts_raw, batch_first=True, padding_value=0)

# The padded test tokens never change, so the CNN test loader is built once.
test_dataset_cnn = TensorDataset(test_texts)
test_loader_cnn = DataLoader(test_dataset_cnn, batch_size=32)

# Stratified validation split: up to 60 examples from each (domain, label) group
train_texts, train_labels, train_domains, val_texts, val_labels, val_domains = stratified_train_val_split(
    texts, labels, domains, val_size_per_group=60
)

# Pad after splitting
train_texts_padded = pad_sequence(train_texts, batch_first=True, padding_value=0)
val_texts_padded = pad_sequence(val_texts, batch_first=True, padding_value=0)

# === Step 1: Token Conversion and Text Processing ===
train_text_strings = tokens_to_text(train_texts)
val_text_strings = tokens_to_text(val_texts)
test_text_strings = tokens_to_text(test_texts_raw)

# === Step 2: TF-IDF + SVD for MLP ===
# Fit on the training split only; validation and test are transformed with the
# fitted vocabulary so no validation/test statistics leak into the features.
# NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so single-digit token ids are dropped — confirm
# this is intended.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_text_strings).toarray()
X_val_tfidf = tfidf_vectorizer.transform(val_text_strings).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_text_strings).toarray()

svd = TruncatedSVD(n_components=100, random_state=42)
X_train_svd = svd.fit_transform(X_train_tfidf)
X_val_svd = svd.transform(X_val_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

# Freshly constructed embedding table that is never trained: the mean/std
# features derived from it depend on its random initialisation.
embedding_layer = nn.Embedding(17120, 128, padding_idx=0)
embedding_matrix = embedding_layer.weight.detach().cpu().numpy()

train_embed_stats = compute_embed_stats(train_texts, embedding_matrix)
val_embed_stats = compute_embed_stats(val_texts, embedding_matrix)
test_embed_stats = compute_embed_stats(test_texts_raw, embedding_matrix)

# MLP features = SVD components followed by embedding mean/std statistics.
X_train_raw = np.hstack([X_train_svd, train_embed_stats])
X_val_raw = np.hstack([X_val_svd, val_embed_stats])
X_test_raw = np.hstack([X_test_svd, test_embed_stats])

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train_raw)
X_val_std = scaler.transform(X_val_raw)
X_test_std = scaler.transform(X_test_raw)

# === Step 3: Prepare MLP Datasets ===
X_train_tensor = torch.tensor(X_train_std, dtype=torch.float32)
y_train_tensor = torch.tensor(train_labels, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_std, dtype=torch.float32)
y_val_tensor = torch.tensor(val_labels, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_std, dtype=torch.float32)

train_dataset_mlp = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset_mlp = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset_mlp = TensorDataset(X_test_tensor)

train_loader_mlp = DataLoader(train_dataset_mlp, batch_size=64, sampler=create_weighted_sampler(train_domains, train_labels))
val_loader_mlp = DataLoader(val_dataset_mlp, batch_size=64)
test_loader_mlp = DataLoader(test_dataset_mlp, batch_size=64)

# === Step 4: Train MLP on Train Set ===
mlp_model = MLPClassifier(input_dim=X_train_tensor.shape[1], hidden_dim=256, dropout=0.5).to(device)
train(mlp_model, train_loader_mlp, epochs=6, lr=1e-3, device=device)

# === Step 5: Train CNN on Train Set ===
train_dataset_cnn = TensorDataset(train_texts_padded, torch.tensor(train_labels, dtype=torch.float32))
val_dataset_cnn = TensorDataset(val_texts_padded, torch.tensor(val_labels, dtype=torch.float32))

train_loader_cnn = DataLoader(train_dataset_cnn, batch_size=32, sampler=create_weighted_sampler(train_domains, train_labels))
val_loader_cnn = DataLoader(val_dataset_cnn, batch_size=32)

cnn_model = CNNTextClassifier(
    vocab_size=17120,
    embedding_dim=128,
    num_filters=50,
    filter_sizes=(2, 3, 4),
    dropout=0.5
).to(device)

train(cnn_model, train_loader_cnn, epochs=4, lr=0.001, device=device)

# === Step 6: Train Meta-Classifier on Validation Predictions ===
mlp_val_preds, mlp_val_probs = predict_with_confidence(mlp_model, val_loader_mlp, device=device)
cnn_val_preds, cnn_val_probs = predict_with_confidence(cnn_model, val_loader_cnn, device=device)

# Meta-features, one row per validation example: [CNN probability, MLP probability].
X_meta_val = np.vstack([cnn_val_probs, mlp_val_probs]).T
y_meta_val = np.array(val_labels)

meta_clf = LogisticRegression()
meta_clf.fit(X_meta_val, y_meta_val)

# === Step 7: Recompute Features and Retrain CNN and MLP on Full Set ===
# Recreate datasets first
full_texts_raw = train_texts + val_texts
full_labels = train_labels + val_labels
full_domains = train_domains + val_domains
full_text_strings = tokens_to_text(full_texts_raw)

# Refit TF-IDF and SVD
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_full_tfidf = tfidf_vectorizer.fit_transform(full_text_strings).toarray()
X_test_tfidf = tfidf_vectorizer.transform(test_text_strings).toarray()

svd = TruncatedSVD(n_components=100, random_state=42)
X_full_svd = svd.fit_transform(X_full_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

full_embed_stats = compute_embed_stats(full_texts_raw, embedding_matrix)
# Use the unpadded test sequences, exactly like the training side: the padded
# tensor would mix padding tokens into every mean/std and skew the features.
test_embed_stats = compute_embed_stats(test_texts_raw, embedding_matrix)

X_full_raw = np.hstack([X_full_svd, full_embed_stats])
X_test_raw = np.hstack([X_test_svd, test_embed_stats])

scaler = StandardScaler()
X_full_std = scaler.fit_transform(X_full_raw)
X_test_std = scaler.transform(X_test_raw)

X_full_tensor = torch.tensor(X_full_std, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_std, dtype=torch.float32)

# Rebuild the MLP test loader from the refit features. The earlier loader
# still holds test features produced by the train-only TF-IDF/SVD/scaler,
# which do not match what the retrained MLP sees during training.
test_dataset_mlp = TensorDataset(X_test_tensor)
test_loader_mlp = DataLoader(test_dataset_mlp, batch_size=64)

# Recreate padded dataset for CNN
full_texts_padded = pad_sequence(full_texts_raw, batch_first=True, padding_value=0)

# X_full_tensor already set above with recomputed features
y_full_tensor = torch.tensor(full_labels, dtype=torch.float32)
full_dataset_mlp = TensorDataset(X_full_tensor, y_full_tensor)
full_loader_mlp = DataLoader(full_dataset_mlp, batch_size=64, sampler=create_weighted_sampler(full_domains, full_labels))

mlp_model = MLPClassifier(input_dim=X_full_tensor.shape[1], hidden_dim=256, dropout=0.5).to(device)
train(mlp_model, full_loader_mlp, epochs=6, lr=1e-3, device=device)


full_dataset_cnn = TensorDataset(full_texts_padded, torch.tensor(full_labels, dtype=torch.float32))
full_loader_cnn = DataLoader(full_dataset_cnn, batch_size=32, sampler=create_weighted_sampler(full_domains, full_labels))

cnn_model = CNNTextClassifier(
    vocab_size=17120,
    embedding_dim=128,
    num_filters=50,
    filter_sizes=(2, 3, 4),
    dropout=0.5
).to(device)
train(cnn_model, full_loader_cnn, epochs=4, lr=0.001, device=device)

# === Step 8: Predict on Test and Use Meta-Classifier ===
mlp_test_preds, mlp_test_probs = predict_with_confidence(
    mlp_model, test_loader_mlp, device=device
)
cnn_test_preds, cnn_test_probs = predict_with_confidence(
    cnn_model, test_loader_cnn, device=device
)

# One row per test example, columns ordered [CNN probability, MLP probability].
X_meta_test = np.column_stack([cnn_test_probs, mlp_test_probs])
ensemble_preds = meta_clf.predict(X_meta_test)
# Column 1 of predict_proba is used as the confidence score.
ensemble_probs = meta_clf.predict_proba(X_meta_test)[:, 1]

# === Step 9: Save Predictions ===
results_df = pd.DataFrame(
    {
        "id": list(range(len(ensemble_preds))),
        "label": ensemble_preds.astype(int),
        "confidence": ensemble_probs,
    }
)
# Full submission (with confidence) plus a labels-only variant.
results_df.to_csv("ensemble_submission_final.csv", index=False)
results_df[["id", "label"]].to_csv("ensemble_labels_final.csv", index=False)
print("✅ Final ensemble predictions saved.")

KeyboardInterrupt: 