In [4]:
!pip install openai



In [5]:
import os
import re
import random
import time
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import f1_score
import openai

# ==========================================
# 1. Configuration
# ==========================================
CONFIG = {
    'seed': 42,
    'max_len': 128,
    'batch_size': 16,
    'epochs': 4,              
    'lr': 2e-5,
    'silver_threshold': 1,
    'pseudo_conf_threshold': 0.70, 
    'consistency_weight': 1.0,     
    'num_classes': 531,
    'model_name': 'bert-base-uncased',
    'device': torch.device("cuda" if torch.cuda.is_available() else "cpu")
}

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(CONFIG['seed'])
print(f"Device: {CONFIG['device']}")

# ==========================================
# 2. Focal Loss (The Macro-F1 Fixer)
# ==========================================
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, logits=True, reduce=True):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.logits = logits
        self.reduce = reduce

    def forward(self, inputs, targets):
        if self.logits:
            BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        else:
            BCE_loss = F.binary_cross_entropy(inputs, targets, reduction='none')

        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        if self.reduce:
            return torch.mean(F_loss)
        else:
            return F_loss

Device: cuda


In [6]:
# ==========================================
# 3. Data Loading & Silver Labels (Regex)
# ==========================================
def load_data():
    print("Loading data files from 'Amazon_products' directory structure...")
    
    # Define paths
    base_dir = 'Amazon_products'
    train_dir = os.path.join(base_dir, 'train')
    test_dir = os.path.join(base_dir, 'test')

    # Load Classes 
    classes_df = pd.read_csv(os.path.join(base_dir, 'classes.txt'), sep='\t', header=None, names=['id', 'name'])
    id2name = dict(zip(classes_df['id'], classes_df['name']))
    name2id = dict(zip(classes_df['name'], classes_df['id']))

    # Load Hierarchy
    hierarchy_df = pd.read_csv(os.path.join(base_dir, 'class_hierarchy.txt'), sep='\t', header=None, names=['parent', 'child'])

    # Load Keywords
    class_keywords = {}
    with open(os.path.join(base_dir, 'class_related_keywords.txt'), 'r', encoding='utf-8') as f:
        for line in f:
            if ':' in line:
                cls_name, keywords = line.strip().split(':')
                if cls_name in name2id:
                    class_keywords[name2id[cls_name]] = keywords.split(',')

    # Load Corpora from their specific subdirectories
    train_df = pd.read_csv(os.path.join(train_dir, 'train_corpus.txt'), sep='\t', header=None, names=['id', 'text'], on_bad_lines='skip')
    test_df = pd.read_csv(os.path.join(test_dir, 'test_corpus.txt'), sep='\t', header=None, names=['id', 'text'], on_bad_lines='skip')

    return classes_df, id2name, name2id, hierarchy_df, class_keywords, train_df, test_df

def generate_silver_labels(train_df, class_keywords, threshold=1):
    print("Generating Silver Labels via Regex...")
    silver_data = []

    # Pre-compile regex for speed and accuracy 
    keyword_patterns = {}
    for cls_id, keywords in class_keywords.items():
        pattern = re.compile(r'\b(' + '|'.join([re.escape(k) for k in keywords]) + r')\b')
        keyword_patterns[cls_id] = pattern

    for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
        text = str(row['text']).lower()
        labels = []
        for cls_id, pattern in keyword_patterns.items():
            if pattern.search(text):
                labels.append(cls_id)

        # Filter noise: samples with >5 labels are likely generic/spam
        if labels and len(labels) <= 5:
            silver_data.append({'text': row['text'], 'labels': labels, 'is_pseudo': False})

    print(f"Generated {len(silver_data)} CLEAN silver samples.")
    return silver_data

In [7]:
# ==========================================
# 4. LLM Labeling
# ==========================================
openai.api_key = "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"

def generate_llm_labels(train_df, id2name, sample_size=500):
    print(f"--- Starting LLM Labeling for {sample_size} samples ---")
    taxonomy_str = "\n".join([f"{i}: {name}" for i, name in id2name.items()])
    subset = train_df.sample(n=sample_size, random_state=42)
    llm_data = []
    logs = []

    for idx, row in tqdm(subset.iterrows(), total=len(subset)):
        text = str(row['text'])[:1000]
        user_msg = f"""Classify review into 2-3 categories. Output JSON: {{"labels": [12, 45]}}\nReview: "{text}"\nTaxonomy:\n{taxonomy_str}"""

        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "system", "content": "JSON only."}, {"role": "user", "content": user_msg}],
                response_format={"type": "json_object"}, temperature=0, max_tokens=100
            )
            content = response.choices[0].message.content
            logs.append({"id": row['id'], "prompt": user_msg, "output": content})

            data = json.loads(content)
            labels = [int(l) for l in data.get("labels", []) if int(l) < 531]
            if labels:
                llm_data.append({'text': row['text'], 'labels': labels, 'is_pseudo': False})
        except:
            time.sleep(1)

    with open("llm_logs.json", "w") as f: json.dump(logs, f)
    return llm_data

In [8]:
# ==========================================
# 5. Model & Graph Definitions
# ==========================================
def build_adjacency_matrix(num_classes, hierarchy_df):
    adj = torch.eye(num_classes)
    for _, row in hierarchy_df.iterrows():
        p, c = row['parent'], row['child']
        if p < num_classes and c < num_classes:
            adj[p, c] = 1.0
            adj[c, p] = 1.0
    deg = torch.sum(adj, dim=1)
    d_inv_sqrt = torch.diag(torch.pow(deg, -0.5))
    return torch.mm(torch.mm(d_inv_sqrt, adj), d_inv_sqrt).to(CONFIG['device'])

class ReviewDataset(Dataset):
    def __init__(self, data, tokenizer, num_classes, max_len=128, is_test=False):
        self.data = data
        self.tokenizer = tokenizer
        self.num_classes = num_classes
        self.max_len = max_len
        self.is_test = is_test

    def __len__(self): return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = str(item['text']) if isinstance(item, dict) else str(item)
        enc = self.tokenizer.encode_plus(text, max_length=self.max_len, padding='max_length', truncation=True, return_tensors='pt')
        out = {'input_ids': enc['input_ids'].flatten(), 'attention_mask': enc['attention_mask'].flatten()}
        if not self.is_test:
            lbl = torch.zeros(self.num_classes)
            for l in item['labels']:
                if l < self.num_classes: lbl[int(l)] = 1.0
            out['labels'] = lbl
        return out

class BertGNNClassifier(nn.Module):
    def __init__(self, num_classes, adj_matrix, model_name='bert-base-uncased'):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        self.drop = nn.Dropout(0.3)
        self.adj = adj_matrix
        self.class_emb = nn.Parameter(torch.Tensor(num_classes, 768))
        nn.init.xavier_uniform_(self.class_emb)
        self.gnn_weight = nn.Parameter(torch.Tensor(768, 768))
        nn.init.xavier_uniform_(self.gnn_weight)

    def forward(self, input_ids, attention_mask):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.drop(out.last_hidden_state[:, 0, :])
        # GNN Step: Aggregate neighbor info
        class_feat = torch.tanh(torch.mm(torch.mm(self.adj, self.class_emb), self.gnn_weight))
        logits = torch.mm(pooled, class_feat.t())
        return logits

In [9]:
# ==========================================
# 6. Master Training Pipeline
# ==========================================

# 1. Setup Data
classes_df, id2name, name2id, hierarchy_df, class_keywords, train_df, test_df = load_data()
adj_matrix = build_adjacency_matrix(CONFIG['num_classes'], hierarchy_df)
silver_data = generate_silver_labels(train_df, class_keywords, CONFIG['silver_threshold'])

llm_data = generate_llm_labels(train_df, id2name)
silver_data += llm_data

tokenizer = BertTokenizer.from_pretrained(CONFIG['model_name'])
model = BertGNNClassifier(CONFIG['num_classes'], adj_matrix).to(CONFIG['device'])

# 2. Training Setup
train_dataset = ReviewDataset(silver_data, tokenizer, CONFIG['num_classes'])
train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
optimizer = AdamW(model.parameters(), lr=CONFIG['lr'])
scheduler = get_linear_schedule_with_warmup(optimizer, 0, len(train_loader)*CONFIG['epochs'])

criterion = FocalLoss(gamma=2)

Loading data files from 'Amazon_products' directory structure...
Generating Silver Labels via Regex...


100%|██████████| 29487/29487 [02:04<00:00, 237.49it/s]


Generated 17225 CLEAN silver samples.
--- Starting LLM Labeling for 500 samples ---


100%|██████████| 500/500 [07:16<00:00,  1.15it/s]


In [10]:
print("\n--- Phase 1: Supervised Training (Silver + Consistency) ---")
for epoch in range(CONFIG['epochs']):
    model.train()
    losses = []

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        ids = batch['input_ids'].to(CONFIG['device'])
        mask = batch['attention_mask'].to(CONFIG['device'])
        targets = batch['labels'].to(CONFIG['device'])

        # --- Consistency Regularization Step ---
        # Forward Pass 1 (Standard Dropout)
        logits_1 = model(ids, mask)

        # Forward Pass 2 (Standard Dropout again - random noise differs)
        logits_2 = model(ids, mask)

        # Main Classification Loss
        cls_loss = criterion(logits_1, targets)

        # Consistency Loss (MSE between the two passes)
        # Forces model to be stable regardless of dropout noise
        cons_loss = F.mse_loss(torch.sigmoid(logits_1), torch.sigmoid(logits_2))

        # Total Loss
        loss = cls_loss + (CONFIG['consistency_weight'] * cons_loss)

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        losses.append(loss.item())

    print(f"Epoch {epoch+1} Loss: {np.mean(losses):.4f}")


--- Phase 1: Supervised Training (Silver + Consistency) ---


Epoch 1: 100%|██████████| 1107/1107 [05:52<00:00,  3.14it/s]


Epoch 1 Loss: 0.0128


Epoch 2: 100%|██████████| 1107/1107 [06:00<00:00,  3.07it/s]


Epoch 2 Loss: 0.0049


Epoch 3: 100%|██████████| 1107/1107 [06:00<00:00,  3.07it/s]


Epoch 3 Loss: 0.0036


Epoch 4: 100%|██████████| 1107/1107 [05:59<00:00,  3.08it/s]

Epoch 4 Loss: 0.0031





In [11]:
# 3. Phase 2: Self-Training
print("\n--- Phase 2: Self-Training (Pseudo-Labeling) ---")
model.eval()
unlabeled_dataset = ReviewDataset(train_df['text'].tolist(), tokenizer, CONFIG['num_classes'], is_test=True)
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

pseudo_data = []
with torch.no_grad():
    for i, batch in enumerate(tqdm(unlabeled_loader, desc="Generating Pseudo Labels")):
        ids = batch['input_ids'].to(CONFIG['device'])
        mask = batch['attention_mask'].to(CONFIG['device'])
        probs = torch.sigmoid(model(ids, mask)).cpu().numpy()

        for j, p in enumerate(probs):
            # Dynamic check: Pick confident classes
            indices = np.where(p > CONFIG['pseudo_conf_threshold'])[0]
            if len(indices) >= 2:
                 pseudo_data.append({'text': train_df.iloc[i*16+j]['text'], 'labels': indices.tolist()})

print(f"Generated {len(pseudo_data)} pseudo-labels. Retraining...")
combined_data = silver_data + pseudo_data
train_loader = DataLoader(ReviewDataset(combined_data, tokenizer, CONFIG['num_classes']), batch_size=16, shuffle=True)
optimizer = AdamW(model.parameters(), lr=1e-5) 

for epoch in range(2):
    model.train()
    for batch in tqdm(train_loader, desc=f"Self-Train Epoch {epoch+1}"):
        ids, mask, targets = batch['input_ids'].to(CONFIG['device']), batch['attention_mask'].to(CONFIG['device']), batch['labels'].to(CONFIG['device'])
        loss = criterion(model(ids, mask), targets)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


--- Phase 2: Self-Training (Pseudo-Labeling) ---


Generating Pseudo Labels: 100%|██████████| 1843/1843 [01:47<00:00, 17.22it/s]


Generated 3996 pseudo-labels. Retraining...


Self-Train Epoch 1: 100%|██████████| 1357/1357 [03:54<00:00,  5.79it/s]
Self-Train Epoch 2: 100%|██████████| 1357/1357 [03:55<00:00,  5.77it/s]


In [12]:
# ==========================================
# 7. Final Inference
# ==========================================
print("\n--- Inference on Test Set ---")
test_loader = DataLoader(ReviewDataset(test_df['text'].tolist(), tokenizer, CONFIG['num_classes'], is_test=True), batch_size=16, shuffle=False)
model.eval()
preds = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        ids = batch['input_ids'].to(CONFIG['device'])
        mask = batch['attention_mask'].to(CONFIG['device'])
        probs = torch.sigmoid(model(ids, mask)).cpu().numpy()

        for p in probs:
            # Sort descending
            top = p.argsort()[::-1]

            # MANDATORY: Take Top 2
            final = [top[0], top[1]]

            # CONDITIONAL: Take 3rd if confident (> 0.4) OR relatively close to 2nd
            # This helps recall for multi-label cases
            if p[top[2]] > 0.4 or (p[top[2]] > 0.7 * p[top[1]]):
                final.append(top[2])

            preds.append(",".join(str(x) for x in sorted(final)))

pd.DataFrame({'id': test_df['id'], 'label': preds}).to_csv('submission.csv', index=False)
print("Saved submission.csv")


--- Inference on Test Set ---


100%|██████████| 1229/1229 [01:11<00:00, 17.21it/s]

Saved submission.csv



