In [1]:
!pip install antiberty
!pip install pandas torch

Looking in links: /path/to/your/local/wheel/directory
Looking in links: /path/to/your/local/wheel/directory


In [4]:
import os

# Folder the per-file CDR3 tables are listed and read from, and folder the
# embedded tables are written to. Both are relative to the working directory.
input_dir = "../cdr3_outputs/h/"
output_dir = "../cdr3_outputs/embedded/"

# Make sure the output folder exists; an already-existing folder is not an error.
os.makedirs(name=output_dir, exist_ok=True)


In [5]:
import pandas as pd

# Configure the CUDA caching allocator *before* torch is imported, so the
# setting is guaranteed to be in place whenever torch first reads it.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
from antiberty import AntiBERTyRunner
# tqdm.auto picks the notebook widget bar when ipywidgets is usable and falls
# back to a plain text bar otherwise (tqdm.notebook alone fails without widgets).
from tqdm.auto import tqdm

# Build the AntiBERTy runner once; every later cell reuses this instance.
runner = AntiBERTyRunner()
# Inference only: put the underlying model in evaluation mode.
runner.model.eval()
# Move the model to the GPU when one is available; otherwise it stays where
# AntiBERTyRunner placed it.
if torch.cuda.is_available():
    runner.model.cuda()


def embed_unique_cdr3s(cdr3_list, batch_size=1500):
    """Embed each distinct CDR3 sequence once and mean-pool it to a vector.

    Sequences are sent to the module-level ``runner`` in batches. For every
    sequence the first and last positions of the returned representation are
    dropped and the remaining positions are averaged.

    Args:
        cdr3_list: Iterable of CDR3 amino-acid strings. Duplicates are
            tolerated; each distinct sequence is embedded only once.
        batch_size: Number of sequences per ``runner.embed`` call. The original
            author's estimate is that GPU memory grows roughly with
            ``batch_size ** 2`` KB (unverified), so lower this on
            out-of-memory errors.

    Returns:
        dict mapping each distinct sequence to its pooled embedding as a
        NumPy array.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # De-duplicate up front (first occurrence wins, order preserved) so no
    # sequence is sent to the model twice, even inside a single batch.
    unique_cdr3s = list(dict.fromkeys(cdr3_list))
    total_batches = (len(unique_cdr3s) + batch_size - 1) // batch_size

    embeddings = {}
    with tqdm(total=total_batches, desc="🧬 Embedding unique CDR3s") as pbar:
        for start in range(0, len(unique_cdr3s), batch_size):
            batch = unique_cdr3s[start:start + batch_size]

            # No gradients are needed for inference.
            with torch.no_grad():
                reps = runner.embed(batch)

            for cdr3, rep in zip(batch, reps):
                # rep[1:-1] drops the first and last positions -- presumably the
                # start/end tokens AntiBERTy adds around the sequence (TODO
                # confirm) -- then the remaining positions are averaged. Moving
                # the result to CPU/NumPy avoids holding GPU tensors in the dict.
                embeddings[cdr3] = rep[1:-1].mean(dim=0).cpu().numpy()
            pbar.update(1)
    return embeddings


In [None]:
# CDR3 -> pooled embedding, shared across every input file so that a sequence
# occurring in several files is only embedded once.
embedding_cache = {}
print(os.getcwd())


# Despite the "TSV" naming, the inputs selected here are .csv files.
# Sorted so the files are processed in a reproducible order.
tsv_files = sorted(f for f in os.listdir(input_dir) if f.endswith(".csv"))
for filename in tqdm(tsv_files, total=len(tsv_files), desc="📄 Processing TSV files"):
    file_path = os.path.join(input_dir, filename)
    print(f"Processing: {filename}")

    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        print(f"Failed to read {filename}: {e}")
        continue

    if 'cdr3_aa' not in df.columns or 'stage' not in df.columns:
        print(f"[!] Skipping {filename} - missing required columns.")
        continue

    # Distinct, non-missing CDR3s of this file (first-seen order).
    unique_cdr3s = list(dict.fromkeys(df['cdr3_aa'].dropna()))

    # Only sequences not already embedded for an earlier file go to the model.
    new_cdr3s = [c for c in unique_cdr3s if c not in embedding_cache]
    if new_cdr3s:
        new_embs = embed_unique_cdr3s(new_cdr3s)
        embedding_cache.update(new_embs)

    # One row per distinct CDR3: the sequence followed by its embedding values.
    emb_rows = [[c] + embedding_cache[c].tolist() for c in unique_cdr3s if c in embedding_cache]
    if not emb_rows:
        # Without this guard, building the embedding table below would raise
        # for a file whose cdr3_aa column is empty or entirely missing.
        print(f"[!] Skipping {filename} - no CDR3 sequences to embed.")
        continue

    # Take the embedding width from the data instead of assuming 512.
    n_dims = len(emb_rows[0]) - 1
    emb_df = pd.DataFrame(emb_rows, columns=['cdr3_aa'] + [f'dim{i}' for i in range(n_dims)])

    # Left merge keeps every input row (and its order); rows whose CDR3 is
    # missing get NaN in the embedding columns.
    merged = pd.merge(df[['cdr3_aa', 'stage']], emb_df, on='cdr3_aa', how='left')

    # Append the suffix to the file stem. The previous `.replace(".tsv", ...)`
    # inserted it in the middle of names such as "X.tsv_cdr3_only.csv" and left
    # names without ".tsv" unchanged.
    # NOTE(review): this changes output file names -- update any downstream
    # code that looks for the old names.
    stem, _ = os.path.splitext(filename)
    output_path = os.path.join(output_dir, f"{stem}_with_antiberty.csv")
    merged.to_csv(output_path, index=False)

    print(f"Saved: {output_path}")


/home/fhici/documents/stats_and_R/subset_changeo_to_single_clones_v2/scripts


📄 Processing TSV files:   0%|          | 0/6 [00:00<?, ?it/s]

Processing: D_H_T.tsv_cdr3_only.csv


🧬 Embedding unique CDR3s:   0%|          | 0/459 [00:00<?, ?it/s]

In [12]:
import gc
import torch

# Manual cleanup cell. Nothing is explicitly deleted here: gc.collect() only
# reclaims objects that are already unreachable.
gc.collect()

# Ask PyTorch to release the cached CUDA memory it is not currently using.
torch.cuda.empty_cache()
# Send signal 9 to this kernel's own process, which terminates it immediately.
# WARNING: the kernel dies here, so this cell must not be part of a "Run All".
os.kill(os.getpid(), 9)



: 

In [14]:
!jupyter nbextension enable --py widgetsnbextension


usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: dejavu events execute kernel kernelspec lab
labextension labhub migrate nbconvert notebook run server troubleshoot trust

Jupyter command `jupyter-nbextension` not found.


In [8]:
rm -rf /content/tsvs/

In [None]:
import os
# Send signal 9 to the current process, i.e. terminate this kernel immediately.
# WARNING: every variable in memory is lost; do not include in a "Run All".
os.kill(os.getpid(), 9)
