<a href="https://colab.research.google.com/github/alisa7979/20252R0136DATA30400/blob/main/v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import random
import time
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import f1_score
from collections import defaultdict
import networkx as nx
import openai

# ==========================================
# 1. Configuration
# ==========================================
# Shared hyper-parameter table read by the cells below.
CONFIG = dict(
    seed=42,                     # passed to set_seed() for reproducibility
    max_len=128,                 # token budget; NOTE(review): ReviewDataset callers rely on its own default of 128
    batch_size=16,               # phase-1 training batch size
    epochs=4,                    # phase-1 epoch count (also sizes the LR schedule)
    lr=2e-5,                     # phase-1 AdamW learning rate
    silver_threshold=1,          # NOTE(review): not read by any code visible in this file
    pseudo_conf_threshold=0.70,  # sigmoid cut-off for accepting a pseudo-label
    consistency_weight=1.0,      # multiplier on the consistency (MSE) loss term
    num_classes=531,             # size of the label space
    model_name='bert-base-uncased',
    # Prefer the GPU whenever one is visible to PyTorch.
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    When a CUDA device is available, every CUDA generator is seeded too.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every RNG from the shared config, then report which device was selected.
set_seed(CONFIG['seed'])
print(f"Device: {CONFIG['device']}")

Device: cuda


In [None]:
# ==========================================
# 2. Data Loading
# ==========================================
def load_data():
    """Load taxonomy, hierarchy, keyword and corpus files from the working directory.

    Files read (all tab-separated unless noted):
      * ``classes.txt``                -- ``id<TAB>name``
      * ``class_hierarchy.txt``        -- ``parent<TAB>child``
      * ``class_related_keywords.txt`` -- ``name:kw1,kw2,...`` (one class per line)
      * ``train_corpus.txt`` / ``test_corpus.txt`` -- ``id<TAB>text``; malformed
        rows are skipped.

    Returns:
        tuple: ``(classes_df, id2name, name2id, hierarchy_df, class_keywords,
        train_df, test_df)``. ``class_keywords`` maps a class id to its list of
        non-empty, whitespace-stripped keywords; classes without any usable
        keyword, or whose name is not in ``classes.txt``, are omitted.
    """
    print("Loading data files...")
    classes_df = pd.read_csv('classes.txt', sep='\t', header=None, names=['id', 'name'])
    id2name = dict(zip(classes_df['id'], classes_df['name']))
    name2id = dict(zip(classes_df['name'], classes_df['id']))

    hierarchy_df = pd.read_csv('class_hierarchy.txt', sep='\t', header=None, names=['parent', 'child'])

    class_keywords = {}
    with open('class_related_keywords.txt', 'r', encoding='utf-8') as f:
        for line in f:
            # Split on the FIRST colon only: a keyword may itself contain ':'
            # and a plain split(':') would then fail to unpack into two parts.
            cls_name, sep, raw_keywords = line.strip().partition(':')
            if not sep or cls_name not in name2id:
                continue
            # Drop surrounding whitespace and empty entries (trailing commas,
            # "name:" lines); an empty keyword would later compile into a
            # regex alternative that matches everywhere.
            keywords = [kw.strip() for kw in raw_keywords.split(',')]
            keywords = [kw for kw in keywords if kw]
            if keywords:
                class_keywords[name2id[cls_name]] = keywords

    train_df = pd.read_csv('train_corpus.txt', sep='\t', header=None, names=['id', 'text'], on_bad_lines='skip')
    test_df = pd.read_csv('test_corpus.txt', sep='\t', header=None, names=['id', 'text'], on_bad_lines='skip')

    return classes_df, id2name, name2id, hierarchy_df, class_keywords, train_df, test_df

In [None]:
# ==========================================
# 3. Silver Labels (Keyword Regex + Hierarchy Expansion)
# ==========================================
def generate_silver_labels_with_hierarchy(train_df, class_keywords, hierarchy_df, threshold=1, max_labels=8):
    """Create weak ("silver") multi-label samples by keyword matching.

    Each training text is lower-cased and matched against one compiled regex
    per class. Every matched class is then expanded with all of its ancestors
    in the hierarchy graph. Texts with no label, or with more than
    ``max_labels`` labels after expansion, are dropped as noise.

    Args:
        train_df: DataFrame with a ``text`` column.
        class_keywords: mapping ``class_id -> list of keyword strings``.
        hierarchy_df: DataFrame with ``parent`` and ``child`` columns.
        threshold: currently unused; kept for interface compatibility.
        max_labels: maximum number of labels (after ancestor expansion) a
            sample may carry to be kept.

    Returns:
        list[dict]: items of the form
        ``{'text': <original text>, 'labels': [class ids], 'is_pseudo': False}``.
    """
    print("Generating Silver Labels with Hierarchy Expansion...")

    # 1. Build the parent -> child graph and pre-compute ancestors once.
    G = nx.DiGraph()
    G.add_edges_from(zip(hierarchy_df['parent'], hierarchy_df['child']))
    node_ancestors = {node: nx.ancestors(G, node) for node in G.nodes()}

    # 2. Compile one alternation regex per class.
    keyword_patterns = {}
    for cls_id, keywords in class_keywords.items():
        # The text is lower-cased before matching, so keywords must be too;
        # blank keywords are dropped because an empty alternative would make
        # the pattern match at almost any position.
        cleaned = [k.strip().lower() for k in keywords if k and k.strip()]
        if not cleaned:
            continue
        keyword_patterns[cls_id] = re.compile(
            r'\b(' + '|'.join(re.escape(k) for k in cleaned) + r')\b'
        )

    silver_data = []
    # Iterate the text column directly; building a Series per row via
    # iterrows() is needlessly slow and only the text is used.
    for raw_text in tqdm(train_df['text'], total=len(train_df)):
        text = str(raw_text).lower()

        # Direct keyword hits.
        labels = {cls_id for cls_id, pattern in keyword_patterns.items() if pattern.search(text)}

        # EXPANSION: add every ancestor of every matched class.
        expanded = set(labels)
        for label in labels:
            expanded.update(node_ancestors.get(label, ()))

        # Filter noise: keep only labelled samples with a bounded label count.
        if expanded and len(expanded) <= max_labels:
            silver_data.append({
                'text': raw_text,
                'labels': sorted(expanded),
                'is_pseudo': False
            })

    print(f"Generated {len(silver_data)} rich silver samples.")
    return silver_data

In [None]:
# ==========================================
# 4. Model & Graph Definitions
# ==========================================
def build_adjacency_matrix(num_classes, hierarchy_df):
    """Return the symmetrically normalised class graph ``D^-1/2 (A + I) D^-1/2``.

    ``A`` holds an undirected edge for every ``parent``/``child`` row of
    ``hierarchy_df`` whose two ids are both below ``num_classes``; the
    identity adds self-loops. The result is moved to ``CONFIG['device']``.
    """
    graph = torch.eye(num_classes)
    for parent, child in zip(hierarchy_df['parent'], hierarchy_df['child']):
        if not (parent < num_classes and child < num_classes):
            continue
        graph[parent, child] = 1.0
        graph[child, parent] = 1.0
    # Self-loops guarantee every degree is at least 1.
    inv_sqrt_degree = torch.diag(graph.sum(dim=1).pow(-0.5))
    normalised = inv_sqrt_degree @ graph @ inv_sqrt_degree
    return normalised.to(CONFIG['device'])

class ReviewDataset(Dataset):
    """Dataset that tokenises texts and, unless ``is_test``, builds multi-hot targets.

    ``data`` items are either dicts with ``'text'`` (and ``'labels'`` when
    training) or plain objects that are converted with ``str``.
    """

    def __init__(self, data, tokenizer, num_classes, max_len=128, is_test=False):
        self.data, self.tokenizer = data, tokenizer
        self.num_classes, self.max_len = num_classes, max_len
        self.is_test = is_test

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        text = str(record['text'] if isinstance(record, dict) else record)
        encoded = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer output is flattened to 1-D per sample.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        if self.is_test:
            return sample

        # Multi-hot target; label ids at or above num_classes are ignored.
        target = torch.zeros(self.num_classes)
        for label in record['labels']:
            if label < self.num_classes:
                target[int(label)] = 1.0
        sample['labels'] = target
        return sample

class BertGNNClassifier(nn.Module):
    """BERT encoder scored against graph-propagated class embeddings.

    Args:
        num_classes: number of output classes (rows of the class embedding).
        adj_matrix: ``(num_classes, num_classes)`` tensor used to mix class
            embeddings with their graph neighbours.
        model_name: checkpoint name passed to ``BertModel.from_pretrained``.
    """

    def __init__(self, num_classes, adj_matrix, model_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # Take the width from the loaded encoder instead of assuming 768, so
        # checkpoints with a different hidden size work too.
        hidden = self.bert.config.hidden_size
        self.drop = nn.Dropout(0.3)
        # Registered as a non-persistent buffer: it follows the module through
        # .to()/.cuda() but is not written to the state_dict.
        self.register_buffer('adj', adj_matrix, persistent=False)
        self.class_emb = nn.Parameter(torch.empty(num_classes, hidden))
        nn.init.xavier_uniform_(self.class_emb)
        self.gnn_weight = nn.Parameter(torch.empty(hidden, hidden))
        nn.init.xavier_uniform_(self.gnn_weight)

    def forward(self, input_ids, attention_mask):
        """Return raw logits of shape ``(batch, num_classes)``."""
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # First-token hidden state with dropout applied: (batch, hidden).
        pooled = self.drop(out.last_hidden_state[:, 0, :])
        # GNN step: aggregate neighbour info, project, squash -> (num_classes, hidden).
        class_feat = torch.tanh(torch.mm(torch.mm(self.adj, self.class_emb), self.gnn_weight))
        # Dot product between each text vector and each class vector.
        logits = torch.mm(pooled, class_feat.t())
        return logits

In [None]:
# ==========================================
# 5. LLM Labeling
# ==========================================
# SECURITY: never hard-code credentials in a notebook. The key is read from the
# OPENAI_API_KEY environment variable instead. Any key that was previously
# committed to this file must be treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

def generate_llm_labels_active(train_df, silver_data, id2name, sample_size=500, num_classes=531):
    """Label a sample of "hard" training texts with an LLM.

    Hard examples are rows of ``train_df`` whose text is not among the texts
    in ``silver_data``. Up to ``sample_size`` of them are sent, one request
    each, to the chat-completions API together with the full taxonomy.

    Args:
        train_df: DataFrame with ``id`` and ``text`` columns.
        silver_data: list of dicts with a ``'text'`` key (already labelled).
        id2name: mapping ``class_id -> class name`` used to render the taxonomy.
        sample_size: maximum number of hard examples to query.
        num_classes: label ids outside ``[0, num_classes)`` are discarded.

    Returns:
        list[dict]: ``{'text', 'labels', 'is_pseudo': False}`` for every text
        that received at least one valid label.

    Side effects:
        Writes prompts with their outputs (or the error on failure) to
        ``llm_logs.json``.
    """
    # Identify "Hard" examples (rows that Regex failed to label)
    silver_texts = {d['text'] for d in silver_data}
    hard_df = train_df[~train_df['text'].isin(silver_texts)]

    print(f"Found {len(hard_df)} hard examples. Sampling {sample_size} for LLM...")
    if len(hard_df) > sample_size:
        subset = hard_df.sample(n=sample_size, random_state=42)
    else:
        subset = hard_df

    taxonomy_str = "\n".join([f"{i}: {name}" for i, name in id2name.items()])
    llm_data = []
    logs = []

    for _, row in tqdm(subset.iterrows(), total=len(subset)):
        text = str(row['text'])[:1000]
        user_msg = f"""Classify into 2-3 categories. JSON format: {{"labels": [12, 45]}}\nReview: "{text}"\nTaxonomy:\n{taxonomy_str}"""

        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "system", "content": "JSON only."}, {"role": "user", "content": user_msg}],
                response_format={"type": "json_object"}, temperature=0, max_tokens=100
            )
            content = response.choices[0].message.content
        except Exception as err:
            # Best-effort labelling: record the failure, back off briefly and
            # move on. Unlike a bare ``except:``, this lets KeyboardInterrupt
            # and SystemExit propagate.
            logs.append({"id": row['id'], "prompt": user_msg, "error": repr(err)})
            time.sleep(1)
            continue

        logs.append({"id": row['id'], "prompt": user_msg, "output": content})

        try:
            data = json.loads(content)
            candidates = [int(l) for l in data.get("labels", [])]
        except (AttributeError, TypeError, ValueError):
            # Malformed or non-integer answer: nothing usable, no need to sleep.
            continue

        # Keep in-range ids only (negatives would wrap around when indexing)
        # and drop duplicates while preserving the model's order.
        labels = list(dict.fromkeys(l for l in candidates if 0 <= l < num_classes))
        if labels:
            llm_data.append({'text': row['text'], 'labels': labels, 'is_pseudo': False})

    with open("llm_logs.json", "w", encoding="utf-8") as f:
        json.dump(logs, f)
    return llm_data

In [None]:
# Load every input file, then derive the normalised class graph from the hierarchy.
(classes_df, id2name, name2id, hierarchy_df,
 class_keywords, train_df, test_df) = load_data()
adj_matrix = build_adjacency_matrix(CONFIG['num_classes'], hierarchy_df)

# Keyword-based silver labels first.
silver_data = generate_silver_labels_with_hierarchy(train_df, class_keywords, hierarchy_df)
print(f"Silver Label Training Samples: {len(silver_data)}")

# Then LLM labels for texts the keyword pass missed, appended in place.
llm_data = generate_llm_labels_active(train_df, silver_data, id2name, sample_size=500)
silver_data.extend(llm_data)
print(f"Silver + LLM Training Samples: {len(silver_data)}")

Loading data files...
Generating Silver Labels with Hierarchy Expansion...


100%|██████████| 29487/29487 [03:20<00:00, 146.75it/s]


Generated 14477 rich silver samples.
Silver Label Training Samples: 14477
Found 15010 hard examples. Sampling 500 for LLM...


100%|██████████| 500/500 [09:06<00:00,  1.09s/it]

Silver + LLM Training Samples: 14919





In [None]:
# ==========================================
# 6. Master Training with Over-Sampling (WeightedRandomSampler)
# ==========================================
from torch.utils.data import WeightedRandomSampler
from collections import Counter

# 1. Per-class positive weights for the loss function.
# Count how many training samples carry each label.
label_counts = torch.zeros(CONFIG['num_classes'])
for label in (int(l) for item in silver_data for l in item['labels']):
    label_counts[label] += 1

# Floor at 1 so classes that never occur do not divide by zero.
label_counts = label_counts.clamp(min=1)
# Inverse-frequency weight: rare classes get a larger weight, capped at 20.
pos_weights = (len(silver_data) / (CONFIG['num_classes'] * label_counts)).clamp(max=20.0)
pos_weights = pos_weights.to(CONFIG['device'])

In [None]:
# 2. Per-sample weights for the data loader (over-sampling).
# A sample's weight is the inverse count of its rarest label, so samples that
# carry rare classes are drawn more often; samples without labels get weight 0.
sample_weights = [
    1.0 / min(label_counts[int(l)].item() for l in item['labels']) if item['labels'] else 0
    for item in silver_data
]

# Convert to a tensor and build the sampler. replacement=True allows the same
# rare row to be drawn several times within one epoch.
sample_weights = torch.DoubleTensor(sample_weights)
sampler = WeightedRandomSampler(
    weights=sample_weights,
    num_samples=len(sample_weights),
    replacement=True,
)

print("Weighted Sampler created. Rare classes will now be over-sampled.")

Weighted Sampler created. Rare classes will now be over-sampled.


In [None]:
# 3. Setup Model & Loader
tokenizer = BertTokenizer.from_pretrained(CONFIG['model_name'])
# Pass the configured checkpoint explicitly so the encoder always matches the
# tokenizer; relying on the class default would silently diverge if
# CONFIG['model_name'] were changed.
model = BertGNNClassifier(
    CONFIG['num_classes'], adj_matrix, model_name=CONFIG['model_name']
).to(CONFIG['device'])
optimizer = AdamW(model.parameters(), lr=CONFIG['lr'])

train_dataset = ReviewDataset(silver_data, tokenizer, CONFIG['num_classes'])

# The weighted sampler decides the draw order, so shuffle stays off.
train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], sampler=sampler, shuffle=False)

# Linear decay over all phase-1 optimisation steps, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, 0, len(train_loader)*CONFIG['epochs'])

# Use Weighted BCE Loss
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weights)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Phase 1: supervised training on the silver + LLM labelled samples, drawn
# through the weighted sampler configured on train_loader.
print("\n--- Phase 1: Supervised Training (With Over-Sampling) ---")
for epoch in range(CONFIG['epochs']):
    model.train()
    losses = []  # per-batch total loss, averaged for the epoch report

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        ids = batch['input_ids'].to(CONFIG['device'])
        mask = batch['attention_mask'].to(CONFIG['device'])
        targets = batch['labels'].to(CONFIG['device'])

        # Consistency Reg: Two forward passes
        # The model is in train mode, so its dropout layers are active and the
        # two passes over the same batch generally produce different logits.
        logits_1 = model(ids, mask)
        logits_2 = model(ids, mask)

        # Classification loss uses only the first pass; the consistency term
        # penalises disagreement between the two passes' probabilities.
        cls_loss = criterion(logits_1, targets)
        cons_loss = F.mse_loss(torch.sigmoid(logits_1), torch.sigmoid(logits_2))
        loss = cls_loss + (CONFIG['consistency_weight'] * cons_loss)

        loss.backward()
        optimizer.step()
        scheduler.step()  # the LR schedule advances once per batch
        optimizer.zero_grad()
        losses.append(loss.item())

    print(f"Epoch {epoch+1} Loss: {np.mean(losses):.4f}")


--- Phase 1: Supervised Training (With Over-Sampling) ---


Epoch 1: 100%|██████████| 933/933 [11:14<00:00,  1.38it/s]


Epoch 1 Loss: 0.0819


Epoch 2: 100%|██████████| 933/933 [11:11<00:00,  1.39it/s]


Epoch 2 Loss: 0.0221


Epoch 3: 100%|██████████| 933/933 [11:11<00:00,  1.39it/s]


Epoch 3 Loss: 0.0159


Epoch 4: 100%|██████████| 933/933 [11:11<00:00,  1.39it/s]

Epoch 4 Loss: 0.0135





In [None]:
# 4. Phase 2: Self-Training
# Pseudo-label the whole training corpus with the phase-1 model, then fine-tune
# on the original samples plus the confident pseudo-labelled ones.
print("\n--- Phase 2: Self-Training ---")
model.eval()
unlabeled_texts = train_df['text'].tolist()
unlabeled_dataset = ReviewDataset(unlabeled_texts, tokenizer, CONFIG['num_classes'], is_test=True)
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

pseudo_data = []
# Running position of the current batch's first text. Tracking it explicitly
# (instead of assuming a literal batch size of 16) keeps texts and predictions
# aligned whatever batch size the loader uses.
offset = 0
with torch.no_grad():
    for batch in tqdm(unlabeled_loader, desc="Pseudo-labeling"):
        ids = batch['input_ids'].to(CONFIG['device'])
        mask = batch['attention_mask'].to(CONFIG['device'])
        probs = torch.sigmoid(model(ids, mask)).cpu().numpy()

        for j, p in enumerate(probs):
            indices = np.where(p > CONFIG['pseudo_conf_threshold'])[0]
            # Keep a text only when at least two classes clear the threshold.
            if len(indices) >= 2:
                pseudo_data.append({
                    'text': unlabeled_texts[offset + j],
                    'labels': indices.tolist(),
                    # Same schema as the silver/LLM samples, flagged as pseudo.
                    'is_pseudo': True,
                })
        offset += len(probs)

print(f"Retraining on {len(silver_data) + len(pseudo_data)} samples...")

# For Phase 2, we usually stick to standard shuffling or re-calculate weights.
# For simplicity, we use standard shuffle here to avoid overfitting noise.
combined_loader = DataLoader(
    ReviewDataset(silver_data + pseudo_data, tokenizer, CONFIG['num_classes']),
    batch_size=CONFIG['batch_size'],
    shuffle=True,
)
# Fresh optimizer with a smaller learning rate; no LR schedule in this phase.
optimizer = AdamW(model.parameters(), lr=1e-5)

for epoch in range(2):
    model.train()
    for batch in tqdm(combined_loader, desc=f"Self-Train Epoch {epoch+1}"):
        ids, mask, targets = batch['input_ids'].to(CONFIG['device']), batch['attention_mask'].to(CONFIG['device']), batch['labels'].to(CONFIG['device'])
        loss = criterion(model(ids, mask), targets)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


--- Phase 2: Self-Training ---


Pseudo-labeling: 100%|██████████| 1843/1843 [04:06<00:00,  7.46it/s]


Retraining on 14991 samples...


Self-Train Epoch 1: 100%|██████████| 937/937 [05:36<00:00,  2.78it/s]
Self-Train Epoch 2: 100%|██████████| 937/937 [05:36<00:00,  2.78it/s]


In [None]:
# ==========================================
# 7. Final Inference
# ==========================================
print("\n--- Inference on Test Set ---")
test_dataset = ReviewDataset(test_df['text'].tolist(), tokenizer, CONFIG['num_classes'], is_test=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
model.eval()
preds = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        ids = batch['input_ids'].to(CONFIG['device'])
        mask = batch['attention_mask'].to(CONFIG['device'])
        probs = torch.sigmoid(model(ids, mask)).cpu().numpy()

        for p in probs:
            # Class indices ranked from most to least probable.
            ranked = p.argsort()[::-1]
            second, third = ranked[1], ranked[2]

            # The two best classes are always emitted.
            chosen = [ranked[0], second]

            # The third is added when it is confident on its own (> 0.4) or
            # close to the runner-up (> 70% of its probability), which helps
            # recall on multi-label cases.
            if p[third] > 0.4 or p[third] > 0.7 * p[second]:
                chosen.append(third)

            preds.append(",".join(map(str, sorted(chosen))))

submission = pd.DataFrame({'id': test_df['id'], 'label': preds})
submission.to_csv('submission.csv', index=False)
print("Saved submission.csv")


--- Inference on Test Set ---


100%|██████████| 1229/1229 [02:45<00:00,  7.42it/s]

Saved submission.csv



