In [31]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader


from transformers import AutoModel
from transformers import AutoTokenizer

import os
from huggingface_hub import hf_hub_download
from huggingface_hub import HfApi

import math


In [33]:
# Backend selection, in order of preference: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Hugging Face Hub dataset repositories: the first is passed to the download
# helper, the second to the shard upload helper.
repo_id_download = "Arthurmaffre34/pre-dataset"
repo_id_upload = "Gill-Hack-25-UdeM/Text_Embedding_Dataset"

# Sub-batch size setting (chunks per forward pass).
sub_batch_size = 10

In [None]:
def download_dataset(repo_id_download: str, filename: str = "pre_dataset.parquet", private: bool = False):
    """
    Fetch a single parquet file from a Hugging Face dataset repo and load it with pandas.

    Args:
        repo_id_download (str): Hugging Face repo ID (e.g., "Arthurmaffre34/pre-dataset").
        filename (str): Name of the file inside the repo (default: "pre_dataset.parquet").
        private (bool): When True, the value of the HF_TOKEN environment variable is
            passed as the access token; when False, no token is passed explicitly.

    Returns:
        pd.DataFrame: Contents of the downloaded parquet file.
    """
    # Only look at the environment when the caller says the repo is private.
    hub_token = None
    if private:
        hub_token = os.getenv("HF_TOKEN")

    local_path = hf_hub_download(
        repo_id=repo_id_download,
        filename=filename,
        repo_type="dataset",
        token=hub_token,
    )
    dataset_frame = pd.read_parquet(local_path)
    return dataset_frame

def reduce_dataset(df: pd.DataFrame, frac: float = None, n: int = None, seed: int = 42) -> pd.DataFrame:
    """
    Reduce the dataset by sampling either a fraction or a fixed number of rows.

    Exactly one of `frac` / `n` is expected. If both are given, `frac` takes
    precedence (the historical behaviour) and a warning is emitted so the
    ignored `n` does not go unnoticed.

    Args:
        df (pd.DataFrame): Input dataset.
        frac (float, optional): Fraction of the dataset to sample (e.g., 0.01 for 1%).
        n (int, optional): Exact number of rows to sample.
        seed (int): Random seed for reproducibility (default: 42).

    Returns:
        pd.DataFrame: Sampled rows with a fresh 0..k-1 index.

    Raises:
        ValueError: If neither `frac` nor `n` is provided.
    """
    # Local import: this helper may be executed before the cell that imports `warnings`.
    import warnings

    if frac is None and n is None:
        raise ValueError("You must specify either `frac` or `n`.")

    if frac is not None and n is not None:
        warnings.warn(
            f"Both `frac`={frac} and `n`={n} were given; `n` is ignored and `frac` is used.",
            UserWarning,
            stacklevel=2,
        )
        n = None  # pandas itself rejects receiving both arguments

    return df.sample(n=n, frac=frac, random_state=seed).reset_index(drop=True)

# Download the full table, then keep a 2-row sample for a quick end-to-end run.
df = download_dataset(repo_id_download=repo_id_download)
df_small = reduce_dataset(df=df, n=2)

print(
    f"Dataset complet: {len(df)} lignes",
    f"Dataset réduit : {len(df_small)} lignes",
    sep="\n",
)

Dataset complet: 9516 lignes
Dataset réduit : 2 lignes


In [4]:
# Tokenizer and encoder come from the same checkpoint; the encoder is put in
# eval mode and moved to the selected device.
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-pretrain")
model = AutoModel.from_pretrained("yiyanghkust/finbert-pretrain").eval().to(device)

In [26]:
import torch
from torch.utils.data import Dataset
import warnings

class FinBertEmbeddingDataset(Dataset):
    """
    Map-style dataset that encodes the "rf" and "mgmt" text of each row into
    pooled model embeddings, one vector per fixed-length token chunk.

    Each text is tokenized, split into chunks of at most `max_length` tokens,
    truncated/padded to exactly `n_chunks` chunks and run through `model` in
    sub-batches of `sub_batch_size` chunks. `__getitem__` returns both
    embedding stacks together with the row's "return" value as the label.

    Args:
        df (pd.DataFrame): Rows with "rf", "mgmt" and "return" columns.
        tokenizer: Hugging Face tokenizer matching `model`.
        model: Hugging Face encoder exposing `pooler_output`.
        max_length (int): Tokens per chunk, special tokens included (≤ 512).
        n_chunks (int): Number of chunks kept per text (shorter texts are padded).
        sub_batch_size (int): Chunks per forward pass (≤ n_chunks).
        device: Device the model and the chunk tensors are moved to.

    Raises:
        ValueError: On an out-of-range `max_length`, `n_chunks` or `sub_batch_size`.
    """

    def __init__(self, df, tokenizer, model, max_length=512, n_chunks=100, sub_batch_size=100, device="cpu"):
        # --- Error checks ---
        if max_length > 512:
            raise ValueError(f"❌ max_length={max_length} is not allowed. FinBERT only supports max_length ≤ 512.")
        if sub_batch_size > n_chunks:
            raise ValueError(f"❌ sub_batch_size={sub_batch_size} cannot be greater than n_chunks={n_chunks}.")
        if n_chunks < 1 or sub_batch_size < 1:
            raise ValueError(
                f"❌ n_chunks={n_chunks} and sub_batch_size={sub_batch_size} must both be at least 1."
            )

        # --- Tokenizer check ---
        expected_model_name = "yiyanghkust/finbert-pretrain"
        # Read the attribute once, defensively: a tokenizer without `name_or_path`
        # must trigger the warning, not an AttributeError while formatting it.
        tokenizer_name = getattr(tokenizer, "name_or_path", None)
        if tokenizer_name != expected_model_name:
            warnings.warn(
                f"⚠️ Tokenizer `{tokenizer_name}` does not match expected `{expected_model_name}`. "
                "This may cause mismatches between embeddings and vocabulary.",
                UserWarning,
                stacklevel=2,
            )

        # Special tokens wrapped around every chunk (see `chunk_text`).
        cls_id = getattr(tokenizer, "cls_token_id", None)
        sep_id = getattr(tokenizer, "sep_token_id", None)
        self._prefix = [] if cls_id is None else [cls_id]
        self._suffix = [] if sep_id is None else [sep_id]
        if max_length <= len(self._prefix) + len(self._suffix):
            raise ValueError(
                f"❌ max_length={max_length} leaves no room for text once special tokens are added."
            )

        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.model = model.to(device)
        self.max_length = max_length
        self.n_chunks = n_chunks
        self.pad_id = tokenizer.pad_token_id
        self.sub_batch_size = sub_batch_size
        self.device = device
        # Optional progress bar; callers may assign a tqdm-like object with `.update(int)`.
        self.pbar = None
        self.model.eval()

    def chunk_text(self, text):
        """
        Tokenize `text` and return `(ids, mask)` tensors of shape [n_chunks, max_length].

        The text is encoded WITHOUT special tokens and each chunk is then framed
        with its own [CLS] ... [SEP]. Encoding once with special tokens and
        slicing afterwards would leave every chunk but the first without a
        leading [CLS], which is the position BERT-style poolers read.
        Chunks beyond `n_chunks` are dropped; missing chunks are all-padding
        rows with an all-zero attention mask.
        """
        tokens = self.tokenizer.encode(text, add_special_tokens=False)
        body_len = self.max_length - len(self._prefix) - len(self._suffix)
        # `or [[]]`: an empty text still yields one [CLS][SEP] chunk.
        pieces = [tokens[i:i + body_len] for i in range(0, len(tokens), body_len)] or [[]]

        ids, masks = [], []
        for piece in pieces[:self.n_chunks]:
            chunk = self._prefix + list(piece) + self._suffix
            pad_len = self.max_length - len(chunk)
            ids.append(chunk + [self.pad_id] * pad_len)
            masks.append([1] * len(chunk) + [0] * pad_len)

        # Pad the document up to exactly n_chunks rows.
        while len(ids) < self.n_chunks:
            ids.append([self.pad_id] * self.max_length)
            masks.append([0] * self.max_length)

        return torch.tensor(ids, dtype=torch.long), torch.tensor(masks, dtype=torch.long)

    def encode_chunks(self, ids, mask, pbar=None):
        """
        Run all chunk rows through the model and return their `pooler_output`
        stacked on CPU, shape [ids.size(0), hidden_dim].

        Rows are processed `sub_batch_size` at a time without gradient tracking.
        NOTE: all-padding rows are encoded like any other row, so their
        embeddings are model output for a fully masked input, not zeros.
        """
        outputs = []
        with torch.no_grad():
            for i in range(0, ids.size(0), self.sub_batch_size):
                ids_sub = ids[i:i + self.sub_batch_size].to(self.device)
                mask_sub = mask[i:i + self.sub_batch_size].to(self.device)
                out = self.model(ids_sub, attention_mask=mask_sub).pooler_output
                outputs.append(out.cpu())

                if pbar is not None:
                    pbar.update(ids_sub.size(0))  # global increment
        return torch.cat(outputs, dim=0)

    def __getitem__(self, idx):
        """
        Encode row `idx` and return
        `{"embeddings": [2, n_chunks, hidden_dim], "labels": scalar float tensor}`,
        where index 0 of the first axis is the "rf" text and index 1 is "mgmt".
        """
        row = self.df.iloc[idx]

        rf_ids, rf_mask = self.chunk_text(str(row["rf"]))
        mgmt_ids, mgmt_mask = self.chunk_text(str(row["mgmt"]))

        rf_emb = self.encode_chunks(rf_ids, rf_mask, pbar=self.pbar)
        mgmt_emb = self.encode_chunks(mgmt_ids, mgmt_mask, pbar=self.pbar)

        stacked = torch.stack([rf_emb, mgmt_emb], dim=0)

        return {
            "embeddings": stacked,
            "labels": torch.tensor(row["return"], dtype=torch.float)
        }

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.df)

In [24]:
def prepare_dataset(df, tokenizer, model, sub_batch_size=32, device="cpu", n_chunks=100, batch_size=1):
    """
    Turn every row of `df` into FinBERT chunk embeddings, showing one global progress bar.

    Args:
        df (pd.DataFrame): Input dataframe with "rf", "mgmt", and "return".
        tokenizer: Hugging Face tokenizer (e.g., FinBERT tokenizer).
        model: Hugging Face model (e.g., FinBERT model).
        sub_batch_size (int): Number of chunks processed at once inside FinBERT.
        device: Device forwarded to the embedding dataset.
        n_chunks (int): Number of chunks per document.
        batch_size (int): DataLoader batch size (default=1).

    Returns:
        (torch.Tensor, torch.Tensor): embeddings [num_samples, 2, N, hidden_dim], labels [num_samples]
    """
    encoder_dataset = FinBertEmbeddingDataset(
        df,
        tokenizer,
        model,
        n_chunks=n_chunks,
        sub_batch_size=sub_batch_size,
        device=device,
    )
    batches = DataLoader(encoder_dataset, batch_size=batch_size, shuffle=False)

    # Two texts per row (RF + MGMT), each contributing n_chunks progress ticks.
    expected_ticks = 2 * encoder_dataset.n_chunks * len(encoder_dataset)
    embedding_batches = []
    label_batches = []

    with tqdm(total=expected_ticks, desc="Encoding all chunks") as progress:
        # The dataset reports progress through this attribute while items are encoded.
        encoder_dataset.pbar = progress
        for item in batches:
            embedding_batches.append(item["embeddings"])
            label_batches.append(item["labels"])
        encoder_dataset.pbar = None  # detach the bar before it is closed

    # Concatenate along the sample axis: [num_samples, 2, N, hidden_dim] and [num_samples].
    return torch.cat(embedding_batches), torch.cat(label_batches)

# Encode the sampled rows: 100 chunks per text, 16 chunks per forward pass.
embeddings, labels = prepare_dataset(
    df_small,
    tokenizer,
    model,
    sub_batch_size=16,
    device=device,
    n_chunks=100,
    batch_size=1,
)

print(
    f"✅ Embeddings shape: {embeddings.shape}",
    f"✅ Labels shape: {labels.shape}",
    sep="\n",
)

Encoding all chunks: 100%|██████████| 400/400 [00:22<00:00, 17.74it/s]

✅ Embeddings shape: torch.Size([2, 2, 100, 768])
✅ Labels shape: torch.Size([2])





In [28]:
# Sanity report on the encoded tensors: shapes, dtypes and one example of each.
print(
    "\n=== Vérification dataset ===",
    f"Embeddings shape : {embeddings.shape}",
    f"Labels shape     : {labels.shape}",
    f"Type embeddings  : {embeddings.dtype}",
    f"Type labels      : {labels.dtype}",
    f"Exemple label    : {labels[0].item()}",
    f"Exemple batch    : {embeddings[0].shape}",  # [2, N, hidden_dim]
    sep="\n",
)


=== Vérification dataset ===
Embeddings shape : torch.Size([2, 2, 100, 768])
Labels shape     : torch.Size([2])
Type embeddings  : torch.float32
Type labels      : torch.float32
Exemple label    : 0.06242643669247627
Exemple batch    : torch.Size([2, 100, 768])


In [None]:
def save_sharded_dataset(embeddings, labels, max_file_size_gb=1, prefix="finbert_embeddings_part"):
    """
    Save embeddings + labels into multiple shards based on target file size.

    Shard `i` is written to `f"{prefix}{i}.pt"` as a dict with keys
    "embeddings" and "labels" holding the corresponding slice of samples.

    Args:
        embeddings (torch.Tensor): Tensor of shape [N, ...].
        labels (torch.Tensor): Tensor whose first dimension is N.
        max_file_size_gb (int | float): Target maximum shard size in gigabytes (default=1 GB).
            At least one sample is always written per shard.
        prefix (str): Prefix (optionally including a directory) for saved shard files.

    Returns:
        int: Number of shards created (0 when there are no samples).

    Raises:
        ValueError: If `embeddings` and `labels` do not contain the same number of samples.
    """
    n_samples = len(embeddings)
    if len(labels) != n_samples:
        raise ValueError(
            f"embeddings has {n_samples} samples but labels has {len(labels)}; they must match."
        )
    if n_samples == 0:
        # Nothing to index into for the size estimate, and nothing to write.
        return 0

    # Estimate bytes per sample (labels may carry more than one value per sample)
    bytes_per_sample = max(
        1,
        embeddings[0].element_size() * embeddings[0].numel() +
        labels[0].element_size() * labels[0].numel(),
    )
    shard_size = max(1, int((max_file_size_gb * (1024**3)) // bytes_per_sample))
    n_shards = math.ceil(n_samples / shard_size)

    print(f"📦 Target max file size: {max_file_size_gb} GB")
    print(f"⚖️  Estimated {bytes_per_sample/1024:.2f} KB per sample")
    print(f"➡️  {shard_size} samples per shard → {n_shards} shards total")

    for i in range(n_shards):
        start = i * shard_size
        end = min(start + shard_size, n_samples)
        filename = f"{prefix}{i}.pt"
        # A slice is a view on the full tensor's storage, and torch.save writes the
        # whole storage behind a view. Clone so each file contains only its own
        # samples; otherwise every shard would be as large as the entire dataset.
        shard = {
            "embeddings": embeddings[start:end].clone(),
            "labels": labels[start:end].clone(),
        }
        torch.save(shard, filename)
        file_size_mb = os.path.getsize(filename) / (1024**2)
        print(f"✅ Saved {filename} ({file_size_mb:.2f} MB, {end-start} samples)")

    return n_shards

# Write the encoded tensors to disk, aiming for shards of roughly 500 MB
n_shards = save_sharded_dataset(embeddings=embeddings, labels=labels, max_file_size_gb=0.5)

print("✅ Dataset saved in", n_shards, "shards")

📦 Target max file size: 0.5 GB
⚖️  Estimated 600.00 KB per sample
➡️  873 samples per shard → 1 shards total
✅ Saved finbert_embeddings_part0.pt (1.17 MB, 2 samples)
✅ Dataset saved in 1 shards


In [32]:
def upload_dataset_shards_to_hf(repo_id_upload, n_shards, prefix="finbert_embeddings_part", repo_type="dataset"):
    """
    Upload the shard files `f"{prefix}{i}.pt"` (i in 0..n_shards-1) to a Hugging Face
    repo, grouped in a single pull request.

    The first shard is uploaded with `create_pr=True`, which opens the pull
    request; every following shard is committed to that pull request's
    revision. Passing `create_pr=True` on every call would open one separate
    pull request per shard.

    Args:
        repo_id_upload (str): Target Hugging Face repo ID.
        n_shards (int): Number of shard files to upload.
        prefix (str): Shard filename prefix; also used as the path inside the repo.
        repo_type (str): Hub repo type (default: "dataset").
    """
    api = HfApi()
    pr_revision = None  # e.g. "refs/pr/3" once the first upload has opened the PR
    for i in range(n_shards):
        filename = f"{prefix}{i}.pt"
        commit = api.upload_file(
            path_or_fileobj=filename,
            path_in_repo=filename,
            repo_id=repo_id_upload,
            repo_type=repo_type,
            revision=pr_revision,
            create_pr=pr_revision is None,
        )
        if pr_revision is None:
            # If the client does not report a PR revision, this stays None and
            # the next upload opens its own PR (the previous behaviour).
            pr_revision = getattr(commit, "pr_revision", None)
        print(f"🔼 Upload {filename} vers HF terminé")
# Push every shard written above to the upload repository.
upload_dataset_shards_to_hf(repo_id_upload=repo_id_upload, n_shards=n_shards)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  finbert_embeddings_part0.pt           : 100%|##########| 1.23MB / 1.23MB            

🔼 Upload finbert_embeddings_part0.pt vers HF terminé
