In [2]:
import zipfile
import os

# Path to Outbrain data
data_dir = "ctr_project/data/outbrain"

# Find all zip files
zip_files = [f for f in os.listdir(data_dir) if f.endswith('.zip')]
print(f"Found {len(zip_files)} zip files:")
for f in zip_files:
    print(f"  - {f}")

# Unzip each file
for zip_file in zip_files:
    zip_path = os.path.join(data_dir, zip_file)
    print(f"\nUnzipping {zip_file}...")
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(data_dir)
    
    print(f"  ✓ Extracted {zip_file}")

# List all extracted CSV files
print("\n" + "="*60)
print("Extracted files:")
csv_files = [f for f in os.listdir(data_dir) if f.endswith('.csv')]
for f in csv_files:
    size_mb = os.path.getsize(os.path.join(data_dir, f)) / 1e6
    print(f"  - {f} ({size_mb:.1f} MB)")

print("\n✅ All files unzipped!")

Found 6 zip files:
  - documents_topics.csv.zip
  - documents_categories.csv.zip
  - events.csv.zip
  - promoted_content.csv.zip
  - clicks_train.csv.zip
  - documents_meta.csv.zip

Unzipping documents_topics.csv.zip...
  ✓ Extracted documents_topics.csv.zip

Unzipping documents_categories.csv.zip...
  ✓ Extracted documents_categories.csv.zip

Unzipping events.csv.zip...
  ✓ Extracted events.csv.zip

Unzipping promoted_content.csv.zip...
  ✓ Extracted promoted_content.csv.zip

Unzipping clicks_train.csv.zip...
  ✓ Extracted clicks_train.csv.zip

Unzipping documents_meta.csv.zip...
  ✓ Extracted documents_meta.csv.zip

Extracted files:
  - documents_topics.csv (339.5 MB)
  - documents_categories.csv (118.0 MB)
  - events.csv (1208.5 MB)
  - promoted_content.csv (13.9 MB)
  - clicks_train.csv (1486.7 MB)
  - documents_meta.csv (89.4 MB)

✅ All files unzipped!


In [3]:
!pip uninstall -y pandas numpy
!pip install --no-cache-dir -U "numpy<2.0" "pandas==2.1.4"


Found existing installation: pandas 2.0.3
Uninstalling pandas-2.0.3:
  Successfully uninstalled pandas-2.0.3
Found existing installation: numpy 2.2.6
Uninstalling numpy-2.2.6:
  Successfully uninstalled numpy-2.2.6
Defaulting to user installation because normal site-packages is not writeable
Collecting numpy<2.0
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m152.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pandas==2.1.4
  Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m144.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: numpy, pandas
Successfully installed numpy-1.26.4 pandas-2.1.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m

In [1]:
import numpy as np, pandas as pd
print(np.__version__, pd.__version__)


1.26.4 2.1.4


In [5]:
# ============================================================
# OUTBRAIN FT-AFM PIPELINE
# Content recommendation CTR prediction
# ============================================================
import os, json, math, random, numpy as np, pandas as pd, torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import KFold, train_test_split
from torch.amp import GradScaler, autocast
from datetime import datetime

# ============== Setup ==============
def set_seed(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
set_seed(42)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_GPUS = torch.cuda.device_count()

# ============== Models (Copy from previous scripts) ==============
class ImprovedAFM(nn.Module):
    """AFM with larger attention dimension and dropout"""
    def __init__(self, d, attn_dim=64, dropout=0.1):
        super().__init__()
        self.W = nn.Linear(d, attn_dim, bias=False)
        self.h = nn.Linear(attn_dim, 1, bias=False)
        self.dropout = nn.Dropout(dropout)
    def forward(self, E):
        B,F,d = E.shape; pairs=[]
        for i in range(F):
            for j in range(i+1, F):
                pairs.append(E[:,i]*E[:,j])
        P = torch.stack(pairs, dim=1)
        P = self.dropout(P)
        A = torch.softmax(self.h(torch.tanh(self.W(P))), dim=1)
        return (A * P).sum(dim=1)

class FeatureTokenizer(nn.Module):
    def __init__(self, cat_cardinalities, n_num, d_model):
        super().__init__()
        self.cat_embs = nn.ModuleList([nn.Embedding(card, d_model) for card in cat_cardinalities])
        self.num_proj = nn.ModuleList([nn.Linear(1, d_model) for _ in range(n_num)])
        self.cls = nn.Parameter(torch.zeros(1,1,d_model)); nn.init.trunc_normal_(self.cls, std=0.02)
    def forward(self, x_cat, x_num):
        B = x_cat.size(0)
        cat_tokens = [emb(x_cat[:, i]) for i, emb in enumerate(self.cat_embs)]
        num_tokens = [proj(x_num[:, i:i+1]) for i, proj in enumerate(self.num_proj)]
        field_embs = torch.stack(cat_tokens + num_tokens, dim=1)
        cls = self.cls.expand(B, -1, -1)
        tokens = torch.cat([cls, field_embs], dim=1)
        return tokens, field_embs

class FTTransformer(nn.Module):
    def __init__(self, d_model=192, nhead=8, ff=512, n_layers=3, dropout=0.15):
        super().__init__()
        enc = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=ff,
                                         dropout=dropout, batch_first=True, activation="gelu", norm_first=True)
        self.encoder = nn.TransformerEncoder(enc, num_layers=n_layers)
    def forward(self, tokens): return self.encoder(tokens)

class ImprovedFTAFM(nn.Module):
    """Improved FT+AFM with larger capacity and better fusion"""
    def __init__(self, cat_cards, n_num, d_model=192, nhead=8, ff=512, n_layers=3, dropout=0.15, afm_attn_dim=64):
        super().__init__()
        self.tok = FeatureTokenizer(cat_cards, n_num, d_model)
        self.backbone = FTTransformer(d_model, nhead, ff, n_layers, dropout)
        self.afm = ImprovedAFM(d_model, afm_attn_dim, dropout=dropout)
        
        # Larger head
        fusion_dim = d_model + d_model
        self.head = nn.Sequential(
            nn.Linear(fusion_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1)
        )
        
    def forward(self, x_cat, x_num):
        tokens, field_embs = self.tok(x_cat, x_num)
        H = self.backbone(tokens); h_cls = H[:,0,:]
        v_afm = self.afm(field_embs)
        z = torch.cat([h_cls, v_afm], dim=1)
        return self.head(z).squeeze(1)

# ============== Utils ==============
def ensure_dir(p): os.makedirs(p, exist_ok=True)

def save_json(obj, path):
    with open(path, "w") as f:
        json.dump(obj, f, indent=2)

def kfold_target_encode(df_tr, df_va, col, yname, n_splits=5, min_samples=50):
    prior = float(df_tr[yname].mean())
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    te_tr = pd.Series(np.zeros(len(df_tr), dtype="float32"), index=df_tr.index)
    for tr_idx, hold_idx in kf.split(df_tr):
        cur = df_tr.iloc[tr_idx]
        means = cur.groupby(col)[yname].mean()
        cnts  = cur.groupby(col)[yname].size()
        m = (means*cnts + prior*min_samples) / (cnts + min_samples)
        te_tr.iloc[hold_idx] = df_tr.iloc[hold_idx][col].map(m).fillna(prior).astype("float32")
    means = df_tr.groupby(col)[yname].mean()
    cnts  = df_tr.groupby(col)[yname].size()
    mfull = (means*cnts + prior*min_samples) / (cnts + min_samples)
    te_va = df_va[col].map(mfull).fillna(prior).astype("float32")
    return te_tr, te_va

# ============== Outbrain Preprocessing ==============
def preprocess_outbrain(
    data_dir="/ctr_project/data/outbrain",
    output_dir="outbrain_preprocessed",
    sample_size=None  # Set to e.g., 10M for faster processing
):
    """
    Preprocess Outbrain click prediction data
    
    Files needed in data_dir:
    - clicks_train.csv: (display_id, ad_id, clicked)
    - events.csv: (display_id, uuid, timestamp, document_id, platform, geo_location)
    - promoted_content.csv: (ad_id, document_id, campaign_id, advertiser_id)
    
    Optional (for richer features):
    - documents_meta.csv
    - documents_categories.csv
    - documents_topics.csv
    """
    
    print("\n" + "="*80)
    print("OUTBRAIN PREPROCESSING")
    print("="*80)
    
    ensure_dir(output_dir)
    
    # Load clicks
    print("Loading clicks_train.csv...")
    clicks = pd.read_csv(f"{data_dir}/clicks_train.csv")
    print(f"  Loaded {len(clicks):,} clicks")
    
    if sample_size:
        print(f"  Sampling {sample_size:,} rows...")
        clicks = clicks.sample(n=min(sample_size, len(clicks)), random_state=42)
    
    # Load events
    print("Loading events.csv...")
    events = pd.read_csv(f"{data_dir}/events.csv")
    print(f"  Loaded {len(events):,} events")
    
    # Load promoted content
    print("Loading promoted_content.csv...")
    promoted = pd.read_csv(f"{data_dir}/promoted_content.csv")
    print(f"  Loaded {len(promoted):,} promoted content")
    
    # Merge data
    print("Merging datasets...")
    df = clicks.merge(events, on='display_id', how='left')
    df = df.merge(promoted, on='ad_id', how='left')
    
    print(f"  Merged dataset: {len(df):,} rows")
    print(f"  CTR: {df['clicked'].mean():.4f}")
    
    # Extract temporal features
    print("Extracting temporal features...")
    df['timestamp'] = pd.to_numeric(df['timestamp'], errors='coerce')
    df['hour'] = ((df['timestamp'] / 1000 / 3600) % 24).astype('int32')
    df['day_of_week'] = ((df['timestamp'] / 1000 / 86400) % 7).astype('int32')
    
    # Handle missing values
    print("Handling missing values...")
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].fillna('missing').astype(str)
        else:
            df[col] = df[col].fillna(0)
    
    # Select features
    label_col = 'clicked'
    
    # Categorical features
    cat_cols = ['uuid', 'document_id', 'platform', 'geo_location', 
                'ad_id', 'document_id_y', 'campaign_id', 'advertiser_id']
    cat_cols = [c for c in cat_cols if c in df.columns]
    
    # Numerical features
    num_cols = ['hour', 'day_of_week', 'timestamp']
    
    # Parse geo_location into country and state (format: country>state)
    if 'geo_location' in df.columns:
        print("Parsing geo_location...")
        df['country'] = df['geo_location'].str.split('>').str[0]
        df['state'] = df['geo_location'].str.split('>').str[1].fillna('unknown')
        cat_cols.extend(['country', 'state'])
    
    # 8:1:1 temporal split (Outbrain has temporal ordering)
    print("Splitting data temporally (8:1:1)...")
    df = df.sort_values('timestamp').reset_index(drop=True)
    n = len(df)
    n_tr = int(0.8 * n)
    n_va = int(0.1 * n)
    
    df_tr = df.iloc[:n_tr].copy()
    df_va = df.iloc[n_tr:n_tr+n_va].copy()
    df_te = df.iloc[n_tr+n_va:].copy()
    
    print(f"Train: {len(df_tr):,}, Val: {len(df_va):,}, Test: {len(df_te):,}")
    print(f"Train CTR: {df_tr[label_col].mean():.4f}")
    print(f"Val CTR: {df_va[label_col].mean():.4f}")
    print(f"Test CTR: {df_te[label_col].mean():.4f}")
    
    # Categorical encoding
    print("Encoding categorical features...")
    cat_cards = []
    MIN_FREQ = 10
    
    for c in cat_cols:
        vc = df_tr[c].value_counts()
        frequent = vc[vc >= MIN_FREQ].index
        mapping = {v:i for i,v in enumerate(frequent)}
        unk_id = len(mapping)
        
        df_tr[c] = df_tr[c].map(mapping).fillna(unk_id).astype('int64')
        df_va[c] = df_va[c].map(mapping).fillna(unk_id).astype('int64')
        df_te[c] = df_te[c].map(mapping).fillna(unk_id).astype('int64')
        
        cat_cards.append(unk_id + 1)
        print(f"  {c}: {unk_id+1} categories")
    
    # Frequency features
    print("Engineering frequency features...")
    freq_candidates = ['ad_id', 'document_id', 'campaign_id', 'advertiser_id']
    freq_candidates = [c for c in freq_candidates if c in cat_cols]
    
    for c in freq_candidates:
        vc = df_tr[c].value_counts()
        df_tr[f"{c}_freq"] = df_tr[c].map(vc).astype('float32')
        df_va[f"{c}_freq"] = df_va[c].map(vc).fillna(0).astype('float32')
        df_te[f"{c}_freq"] = df_te[c].map(vc).fillna(0).astype('float32')
        num_cols.append(f"{c}_freq")
    
    # Target encoding
    print("Target encoding...")
    te_candidates = ['ad_id', 'document_id', 'campaign_id']
    te_candidates = [c for c in te_candidates if c in cat_cols]
    
    for c in te_candidates:
        te_tr, te_va = kfold_target_encode(df_tr, df_va, c, label_col, n_splits=5, min_samples=50)
        prior = float(df_tr[label_col].mean())
        means = df_tr.groupby(c)[label_col].mean()
        cnts  = df_tr.groupby(c)[label_col].size()
        mfull = (means*cnts + prior*50) / (cnts + 50)
        te_te = df_te[c].map(mfull).fillna(prior).astype('float32')
        
        df_tr[f"{c}_te"] = te_tr.astype('float32')
        df_va[f"{c}_te"] = te_va.astype('float32')
        df_te[f"{c}_te"] = te_te.astype('float32')
        num_cols.append(f"{c}_te")
    
    # Log1p frequency features
    freq_cols = [c for c in num_cols if c.endswith('_freq')]
    for c in freq_cols:
        for d in (df_tr, df_va, df_te):
            d[c] = np.log1p(d[c].clip(lower=0))
    
    # Standardize numerics
    print("Standardizing numerics...")
    num_means = {c: float(df_tr[c].mean()) for c in num_cols}
    num_stds  = {c: float(df_tr[c].std()) for c in num_cols}
    
    for c in num_cols:
        mu, sd = num_means[c], (num_stds[c] if num_stds[c] > 1e-8 else 1.0)
        for d in (df_tr, df_va, df_te):
            d[c] = ((d[c] - mu)/sd).clip(-5, 5).astype('float32')
    
    # Save preprocessed data
    print("Saving preprocessed data...")
    np.save(f'{output_dir}/Xc_train.npy', df_tr[cat_cols].to_numpy())
    np.save(f'{output_dir}/Xn_train.npy', df_tr[num_cols].to_numpy().astype('float32'))
    np.save(f'{output_dir}/y_train.npy', df_tr[label_col].to_numpy().astype('float32'))
    
    np.save(f'{output_dir}/Xc_val.npy', df_va[cat_cols].to_numpy())
    np.save(f'{output_dir}/Xn_val.npy', df_va[num_cols].to_numpy().astype('float32'))
    np.save(f'{output_dir}/y_val.npy', df_va[label_col].to_numpy().astype('float32'))
    
    np.save(f'{output_dir}/Xc_test.npy', df_te[cat_cols].to_numpy())
    np.save(f'{output_dir}/Xn_test.npy', df_te[num_cols].to_numpy().astype('float32'))
    np.save(f'{output_dir}/y_test.npy', df_te[label_col].to_numpy().astype('float32'))
    
    schema = {
        'cat_cards': cat_cards,
        'num_cols': num_cols,
        'cat_cols': cat_cols,
        'train_ctr': float(df_tr[label_col].mean()),
        'val_ctr': float(df_va[label_col].mean()),
        'test_ctr': float(df_te[label_col].mean())
    }
    save_json(schema, f'{output_dir}/schema.json')
    
    print(f"✅ Preprocessing complete!")
    print(f"   Categorical features: {len(cat_cols)}")
    print(f"   Numerical features: {len(num_cols)}")
    print(f"   Saved to: {output_dir}/")
    
    return cat_cards, num_cols

# ============== Training ==============
class CTRDataset(Dataset):
    def __init__(self, Xc, Xn, y):
        self.Xc = torch.as_tensor(Xc, dtype=torch.long)
        self.Xn = torch.as_tensor(Xn, dtype=torch.float32)
        self.y  = torch.as_tensor(y,  dtype=torch.float32)
    def __len__(self): return len(self.y)
    def __getitem__(self, i): return self.Xc[i], self.Xn[i], self.y[i].unsqueeze(-1)

def evaluate(model, dl):
    model.eval(); ys, ps = [], []
    with torch.no_grad():
        for Xc, Xn, yb in dl:
            batch_probs = []
            for i in range(0, Xc.size(0), 2048):
                Xc_chunk = Xc[i:i+2048].to(DEVICE)
                Xn_chunk = Xn[i:i+2048].to(DEVICE)
                logits = model(Xc_chunk, Xn_chunk)
                if logits.dim()==1: logits = logits.unsqueeze(1)
                probs = torch.sigmoid(logits).squeeze(-1).cpu()
                batch_probs.append(probs)
                del Xc_chunk, Xn_chunk, logits, probs
                torch.cuda.empty_cache()
            probs = torch.cat(batch_probs)
            ys.append(yb.squeeze(-1).numpy())
            ps.append(probs.numpy())
    y_true = np.concatenate(ys); y_prob = np.clip(np.concatenate(ps), 1e-7, 1-1e-7)
    return roc_auc_score(y_true, y_prob), log_loss(y_true, y_prob)

def train_outbrain_ft_afm(data_dir, output_dir, max_epochs=15, patience=5):
    """Train FT-AFM on preprocessed Outbrain data"""
    print("\n" + "="*80)
    print("TRAINING FT-AFM ON OUTBRAIN")
    print("="*80)
    
    ensure_dir(output_dir)
    
    # Load data
    print("Loading preprocessed data...")
    Xc_tr = np.load(f'{data_dir}/Xc_train.npy')
    Xn_tr = np.load(f'{data_dir}/Xn_train.npy')
    y_tr = np.load(f'{data_dir}/y_train.npy')
    
    Xc_va = np.load(f'{data_dir}/Xc_val.npy')
    Xn_va = np.load(f'{data_dir}/Xn_val.npy')
    y_va = np.load(f'{data_dir}/y_val.npy')
    
    Xc_te = np.load(f'{data_dir}/Xc_test.npy')
    Xn_te = np.load(f'{data_dir}/Xn_test.npy')
    y_te = np.load(f'{data_dir}/y_test.npy')
    
    with open(f'{data_dir}/schema.json') as f:
        schema = json.load(f)
    
    cat_cards = schema['cat_cards']
    n_num = len(schema['num_cols'])
    
    print(f"Train: {len(y_tr):,}, Val: {len(y_va):,}, Test: {len(y_te):,}")
    print(f"Features: {len(cat_cards)} cat + {n_num} num")
    
    # Data loaders
    batch_size = 2048 * NUM_GPUS if NUM_GPUS > 1 else 2048
    tr_dl = DataLoader(CTRDataset(Xc_tr, Xn_tr, y_tr), batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
    va_dl = DataLoader(CTRDataset(Xc_va, Xn_va, y_va), batch_size=batch_size*2, shuffle=False, num_workers=4, pin_memory=True)
    te_dl = DataLoader(CTRDataset(Xc_te, Xn_te, y_te), batch_size=batch_size*2, shuffle=False, num_workers=4, pin_memory=True)
    
    # Build model
    model = ImprovedFTAFM(cat_cards, n_num, d_model=192, nhead=8, ff=512, n_layers=3, dropout=0.15, afm_attn_dim=64).to(DEVICE)
    
    if NUM_GPUS > 1:
        model = nn.DataParallel(model)
    
    # Initialize bias
    base_ctr = float(y_tr.mean())
    last_linear = model.module.head[-1] if isinstance(model, nn.DataParallel) else model.head[-1]
    with torch.no_grad():
        last_linear.bias.fill_(math.log(base_ctr/(1.0-base_ctr)))
    
    # Training
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=5e-5)
    scaler = GradScaler(device='cuda', enabled=torch.cuda.is_available())
    best_ll, best_state, stale = float("inf"), None, 0
    history = []
    
    for ep in range(1, max_epochs+1):
        model.train(); run = 0.0
        for Xc, Xn, yb in tr_dl:
            Xc, Xn, yb = Xc.to(DEVICE), Xn.to(DEVICE), yb.to(DEVICE)
            opt.zero_grad(set_to_none=True)
            with autocast(device_type='cuda', enabled=torch.cuda.is_available()):
                logits = model(Xc, Xn)
                if logits.dim()==1: logits = logits.unsqueeze(1)
                loss = F.binary_cross_entropy_with_logits(logits, yb)
            scaler.scale(loss).backward()
            scaler.step(opt); scaler.update()
            run += loss.item() * yb.size(0)
        
        val_auc, val_ll = evaluate(model, va_dl)
        train_ll = run / len(tr_dl.dataset)
        improved = val_ll < best_ll
        
        if improved:
            best_ll = val_ll; stale = 0
            model_to_save = model.module if isinstance(model, nn.DataParallel) else model
            best_state = {k: v.cpu().clone() for k,v in model_to_save.state_dict().items()}
        else:
            stale += 1
        
        history.append({"epoch": ep, "train_ll": train_ll, "val_ll": val_ll, "val_auc": val_auc})
        print(f"Epoch {ep:02d} | train_ll={train_ll:.4f} val_ll={val_ll:.4f} val_auc={val_auc:.4f} {'*BEST*' if improved else f'stale {stale}/{patience}'}")
        
        if stale >= patience:
            print("Early stopped.")
            break
    
    # Restore best
    if best_state:
        model_to_load = model.module if isinstance(model, nn.DataParallel) else model
        model_to_load.load_state_dict(best_state)
        torch.save(best_state, f'{output_dir}/outbrain_ft_afm_best.pth')
    
    pd.DataFrame(history).to_csv(f'{output_dir}/outbrain_history.csv', index=False)
    
    # Final evaluation
    print("\nEvaluating on test set...")
    test_auc, test_ll = evaluate(model, te_dl)
    
    results = {
        "test_auc": float(test_auc),
        "test_logloss": float(test_ll),
        "val_auc": float(max([h['val_auc'] for h in history])),
        "val_logloss": float(min([h['val_ll'] for h in history]))
    }
    save_json(results, f'{output_dir}/outbrain_results.json')
    
    print("\n" + "="*80)
    print("OUTBRAIN FT-AFM RESULTS")
    print("="*80)
    print(f"Test AUC:     {test_auc:.4f}")
    print(f"Test LogLoss: {test_ll:.4f}")
    print(f"Saved to: {output_dir}/")
    
    return test_auc, test_ll

# ============== MAIN RUNNER ==============
def run_outbrain_complete(
    data_dir="/ctr_project/data/outbrain",
    preprocess_dir="outbrain_preprocessed",
    output_dir="outbrain_ft_afm_results",
    sample_size=None  # Set to e.g., 10000000 for 10M samples
):
    """Complete Outbrain pipeline"""
    
    # Step 1: Preprocess
    print("STEP 1: Preprocessing Outbrain...")
    cat_cards, num_cols = preprocess_outbrain(data_dir, preprocess_dir, sample_size)
    
    # Step 2: Train FT-AFM
    print("\nSTEP 2: Training FT-AFM...")
    test_auc, test_ll = train_outbrain_ft_afm(preprocess_dir, output_dir)
    
    print("\n✅ OUTBRAIN PIPELINE COMPLETE!")
    return test_auc, test_ll

# ============== USAGE ==============

# Full dataset (adjust sample_size based on your needs)
run_outbrain_complete(
    data_dir="ctr_project/data/outbrain",
    preprocess_dir="outbrain_preprocessed",
    output_dir="outbrain_ft_afm_results",
    sample_size=20000000  # Use 20M samples for reasonable training time
)
"""
# Test on smaller sample first
run_outbrain_complete(
    data_dir="ctr_project/data/outbrain",
    preprocess_dir="outbrain_preprocessed_1m",
    output_dir="outbrain_ft_afm_results_1m",
    sample_size=1000000
)
"""
print("="*80)
print("OUTBRAIN PIPELINE READY")
print("="*80)
print("\nDownload Outbrain from:")
print("  https://www.kaggle.com/c/outbrain-click-prediction/data")
print("\nFiles needed:")
print("  - clicks_train.csv")
print("  - events.csv")
print("  - promoted_content.csv")
print("\nPlace in: /ctr_project/data/outbrain/")
print("\nThen run: run_outbrain_complete()")

STEP 1: Preprocessing Outbrain...

OUTBRAIN PREPROCESSING
Loading clicks_train.csv...
  Loaded 87,141,731 clicks
  Sampling 20,000,000 rows...
Loading events.csv...


  events = pd.read_csv(f"{data_dir}/events.csv")


  Loaded 23,120,126 events
Loading promoted_content.csv...
  Loaded 559,583 promoted content
Merging datasets...
  Merged dataset: 20,000,000 rows
  CTR: 0.1937
Extracting temporal features...
Handling missing values...
Parsing geo_location...
Splitting data temporally (8:1:1)...
Train: 16,000,000, Val: 2,000,000, Test: 2,000,000
Train CTR: 0.1938
Val CTR: 0.1923
Test CTR: 0.1945
Encoding categorical features...
  uuid: 11930 categories
  platform: 4 categories
  geo_location: 2145 categories
  ad_id: 62260 categories
  document_id_y: 33741 categories
  campaign_id: 20441 categories
  advertiser_id: 3351 categories
  country: 220 categories
  state: 394 categories
Engineering frequency features...
Target encoding...
Standardizing numerics...
Saving preprocessed data...
✅ Preprocessing complete!
   Categorical features: 9
   Numerical features: 8
   Saved to: outbrain_preprocessed/

STEP 2: Training FT-AFM...

TRAINING FT-AFM ON OUTBRAIN
Loading preprocessed data...
Train: 16,000,000, V



Epoch 01 | train_ll=0.4430 val_ll=0.4453 val_auc=0.7092 *BEST*
Epoch 02 | train_ll=0.4415 val_ll=0.4445 val_auc=0.7112 *BEST*
Epoch 03 | train_ll=0.4402 val_ll=0.4447 val_auc=0.7116 stale 1/5
Epoch 04 | train_ll=0.4389 val_ll=0.4439 val_auc=0.7124 *BEST*
Epoch 05 | train_ll=0.4375 val_ll=0.4440 val_auc=0.7118 stale 1/5
Epoch 06 | train_ll=0.4359 val_ll=0.4456 val_auc=0.7116 stale 2/5
Epoch 07 | train_ll=0.4340 val_ll=0.4461 val_auc=0.7104 stale 3/5
Epoch 08 | train_ll=0.4320 val_ll=0.4477 val_auc=0.7090 stale 4/5
Epoch 09 | train_ll=0.4300 val_ll=0.4494 val_auc=0.7072 stale 5/5
Early stopped.

Evaluating on test set...

OUTBRAIN FT-AFM RESULTS
Test AUC:     0.6986
Test LogLoss: 0.4530
Saved to: outbrain_ft_afm_results/

✅ OUTBRAIN PIPELINE COMPLETE!
OUTBRAIN PIPELINE READY

Download Outbrain from:
  https://www.kaggle.com/c/outbrain-click-prediction/data

Files needed:
  - clicks_train.csv
  - events.csv
  - promoted_content.csv

Place in: /ctr_project/data/outbrain/

Then run: run_out

In [8]:
test_auc, test_ll = run_outbrain_complete(
    data_dir="ctr_project/data/outbrain",
    preprocess_dir="outbrain_preprocessed_40m",
    output_dir="outbrain_ft_afm_results_40m",
    sample_size=40_000_000
)

print(test_auc, test_ll)


STEP 1: Preprocessing Outbrain...

OUTBRAIN PREPROCESSING
Loading clicks_train.csv...
  Loaded 87,141,731 clicks
  Sampling 40,000,000 rows...
Loading events.csv...


  events = pd.read_csv(f"{data_dir}/events.csv")


  Loaded 23,120,126 events
Loading promoted_content.csv...
  Loaded 559,583 promoted content
Merging datasets...
  Merged dataset: 40,000,000 rows
  CTR: 0.1937
Extracting temporal features...
Handling missing values...
Parsing geo_location...
Splitting data temporally (8:1:1)...
Train: 32,000,000, Val: 4,000,000, Test: 4,000,000
Train CTR: 0.1938
Val CTR: 0.1922
Test CTR: 0.1945
Encoding categorical features...
  uuid: 118343 categories
  platform: 5 categories
  geo_location: 2400 categories
  ad_id: 97742 categories
  document_id_y: 46594 categories
  campaign_id: 23087 categories
  advertiser_id: 3495 categories
  country: 226 categories
  state: 397 categories
Engineering frequency features...
Target encoding...
Standardizing numerics...
Saving preprocessed data...
✅ Preprocessing complete!
   Categorical features: 9
   Numerical features: 8
   Saved to: outbrain_preprocessed_40m/

STEP 2: Training FT-AFM...

TRAINING FT-AFM ON OUTBRAIN
Loading preprocessed data...
Train: 32,000,0



Epoch 01 | train_ll=0.4418 val_ll=0.4450 val_auc=0.7103 *BEST*
Epoch 02 | train_ll=0.4399 val_ll=0.4444 val_auc=0.7133 *BEST*
Epoch 03 | train_ll=0.4386 val_ll=0.4428 val_auc=0.7146 *BEST*
Epoch 04 | train_ll=0.4373 val_ll=0.4425 val_auc=0.7148 *BEST*
Epoch 05 | train_ll=0.4358 val_ll=0.4427 val_auc=0.7144 stale 1/5
Epoch 06 | train_ll=0.4339 val_ll=0.4431 val_auc=0.7141 stale 2/5
Epoch 07 | train_ll=0.4316 val_ll=0.4442 val_auc=0.7127 stale 3/5
Epoch 08 | train_ll=0.4293 val_ll=0.4461 val_auc=0.7128 stale 4/5
Epoch 09 | train_ll=0.4268 val_ll=0.4467 val_auc=0.7114 stale 5/5
Early stopped.

Evaluating on test set...

OUTBRAIN FT-AFM RESULTS
Test AUC:     0.7003
Test LogLoss: 0.4516
Saved to: outbrain_ft_afm_results_40m/

✅ OUTBRAIN PIPELINE COMPLETE!
0.7003085858210122 0.45155954123671616
