In [None]:

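# 03_text_embeddings.ipynb — mean-pooled transformer embeddings for the LendingClub descriptions and optional auxiliary text corpora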
%pip install -q torch transformers datasets tqdm pandas

import torch, pandas as pd, numpy as np
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
from tqdm.auto import tqdm

# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
MODEL = "xlm-roberta-base"  # checkpoint name handed to both from_pretrained calls
BATCH = 16                  # number of texts sent through the encoder per call

# Input locations and the output directory for the saved .npy matrices.
RAW = Path("data/raw")
PROC = Path("data/processed")
EMB = Path("data/embeddings")
EMB.mkdir(parents=True, exist_ok=True)

# Tokenizer and encoder are loaded once and shared by the embedding helpers.
tok = AutoTokenizer.from_pretrained(MODEL)
enc = AutoModel.from_pretrained(MODEL)
enc = enc.to(DEVICE)
enc.eval()  # evaluation mode: this notebook only runs forward passes

def embed_texts(texts):
    """Return one mean-pooled encoder embedding per input text.

    The texts are tokenized together (padded to a common length, truncated
    to 256 tokens), run through ``enc`` without gradient tracking, and the
    last hidden states are averaged over the positions the attention mask
    marks as real tokens.

    Args:
        texts: Batch of strings to embed.

    Returns:
        A NumPy array with one row per input text.
    """
    batch = tok(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
    ).to(DEVICE)
    with torch.no_grad():
        hidden = enc(**batch).last_hidden_state
        # Trailing axis lets the mask broadcast across the hidden dimension.
        keep = batch.attention_mask.unsqueeze(-1)
        summed = (hidden * keep).sum(dim=1)
        # Clamp so a row with no unmasked tokens cannot divide by zero.
        counts = keep.sum(dim=1).clamp(min=1)
        pooled = summed / counts
    return pooled.detach().cpu().numpy()

def embed_df_texts(df, col, out_path):
    """Embed one column of ``df`` in batches of ``BATCH`` and save the result.

    Args:
        df: DataFrame holding the texts.
        col: Name of the column to embed; values are converted to ``str``.
        out_path: Destination handed to ``np.save``.

    The saved matrix has one row per row of ``df``, in ``df`` order. Missing
    values are embedded as the empty string. An empty ``df`` produces a
    ``(0, hidden_size)`` array.
    """
    column = df[col]
    # A bare astype(str) renders missing values as the literal words
    # "nan"/"None" (or leaves them missing, depending on the pandas version);
    # blank them out so they are not embedded as if they were real text.
    texts = column.astype(str).mask(column.isna(), "").tolist()
    arrs = [
        embed_texts(texts[start:start + BATCH])
        for start in tqdm(range(0, len(texts), BATCH))
    ]
    if arrs:
        embs = np.vstack(arrs)
    else:
        # Take the width from the loaded encoder so the empty case stays
        # consistent with non-empty outputs if MODEL is changed.
        embs = np.zeros((0, enc.config.hidden_size))
    np.save(out_path, embs)
    print("Saved:", out_path, embs.shape)

# LendingClub splits: read all three parquet files first, then embed the
# "description" column of each one into its own .npy file.
lc_train, lc_valid, lc_test = (
    pd.read_parquet(PROC / fname)
    for fname in ("lc_train.parquet", "lc_valid.parquet", "lc_test.parquet")
)
for name, df in zip(("train", "valid", "test"), (lc_train, lc_valid, lc_test)):
    embed_df_texts(df, "description", EMB / f"lc_{name}_emb.npy")

# Auxiliary corpora (optional enrichment): files missing on disk are skipped.
aux_paths = [
    RAW / fname
    for fname in (
        "multifin_text.csv",
        "multifinben_en_text.csv",
        "multifinben_es_text.csv",
        "maec_transcripts.csv",
    )
]
for p in aux_paths:
    if not p.exists():
        continue
    df = pd.read_csv(p)
    # Prefer a column named "text"; otherwise fall back to the first column.
    if "text" in df.columns:
        col = "text"
    else:
        col = df.columns[0]
    # Cap each corpus at 50,000 rows, drawn with a fixed seed.
    df = df.sample(n=min(len(df), 50000), random_state=42)
    embed_df_texts(df, col, EMB / f"{p.stem}_emb.npy")

print("Embedding generation complete.")
