In [1]:
from pathlib import Path

import pandas as pd

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

In [2]:
def write_rf_hchain_to_fasta(df, fp):
    """Write the `NucSeqVH` sequences of `df` to a FASTA file.

    One record is emitted per usable row, in row order. The record id is
    ``"<CloneID>_heavy"`` and the description is left empty so the FASTA
    header contains the id only.

    Rows are skipped when `NucSeqVH` is
      * not a string -- e.g. the float NaN that pandas produces for an empty
        cell, which would otherwise make ``len()`` raise ``TypeError``;
      * a string of length 0 or 1 (treated as a placeholder, not a sequence).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"CloneID"`` and ``"NucSeqVH"``.
    fp : path or file handle
        Destination, passed unchanged to ``Bio.SeqIO.write``.

    Raises
    ------
    KeyError
        If either required column is missing from `df`.
    """
    records = []
    # zip over the two columns avoids building a Series per row (iterrows).
    for clone_id, hchain_seq in zip(df["CloneID"], df["NucSeqVH"]):
        # Missing values arrive as float NaN, not as "", so check the type
        # before measuring the length.
        if not isinstance(hchain_seq, str) or len(hchain_seq) <= 1:
            continue
        records.append(SeqRecord(
            id=f"{clone_id}_heavy",
            seq=Seq(hchain_seq),
            description="",
        ))
    SeqIO.write(records, fp, "fasta")

In [3]:
# Input sheet and the files derived from it all live in one directory.
public_rfs_dir = Path("/data/samples/AIRR-Seq/OURS/public-rfs")
dataset_path = public_rfs_dir.joinpath("Falkenburg_database_rf_sequences_sheet1.tsv")
hchain_fasta_path = public_rfs_dir.joinpath("Falkenburg_hchains.fasta")
hchain_airr_path = public_rfs_dir.joinpath("Falkenburg_hchains_igblast_airr.tsv")

# Load the tab-separated sheet, then export its heavy-chain column as FASTA.
df_falk = pd.read_table(dataset_path)
write_rf_hchain_to_fasta(df_falk, hchain_fasta_path)

In [4]:
# Shell escape: run the `igblast` subcommand of AssignGenes.py on the heavy-chain
# FASTA. IPython substitutes {hchain_fasta_path} and {hchain_airr_path} with the
# values of the notebook variables of the same name before the shell sees them.
#   -s  input FASTA, -o  output TSV (matches FILE>/OUTPUT> in the tool's log)
#   -b  /usr/local/share/igblast -- presumably the IgBLAST data directory; confirm
#   --format airr with --organism human, --loci ig, and 8 worker processes
# NOTE(review): the paths are interpolated unquoted; fine for the current
# directory, but a path containing spaces would break the command.
!AssignGenes.py igblast \
    -s {hchain_fasta_path} \
    -o {hchain_airr_path} \
    -b /usr/local/share/igblast \
    --organism human \
    --loci ig \
    --format airr \
    --nproc 8

   START> AssignGenes
 COMMAND> igblast
 VERSION> 1.17.0
    FILE> Falkenburg_hchains.fasta
ORGANISM> human
    LOCI> ig
   NPROC> 8

PROGRESS> 11:41:50 |Done                     | 0.0 min

OUTPUT> Falkenburg_hchains_igblast_airr.tsv
   END> AssignGenes

