# Inference Data Pre-processing

separate dataset B into unmutated (germline) and mutated sequences

note: this notebook requires test-set annotations (for mutation counts) and the paired test set with CDR masks (for subsequent grouping of predictions by antibody region in `2-per_position_inference`)

In [1]:
import pandas as pd

In [2]:
%%bash

# Stop at the first failing command so a failed download is never unpacked,
# deleted, or reported as a successful cell.
set -euo pipefail

# `tar -C` requires the target directory to already exist
mkdir -p ../data

# download test set (CDR masks) if file doesn't already exist (downloads all pre-training data, since files are zipped together on Zenodo)
if [ ! -e "../data/B_test.csv" ]; then
    # --fail: exit non-zero on an HTTP error instead of saving the error page as the archive
    curl --fail -o 'train-eval-test_cdr-mask.tar.gz' -L 'https://zenodo.org/records/14019655/files/train-eval-test_cdr-mask.tar.gz?download=1'
    tar xzvf 'train-eval-test_cdr-mask.tar.gz' -C ../data
    rm 'train-eval-test_cdr-mask.tar.gz'
fi

# download test set annotations if they don't already exist
if [ ! -e "../data/B_test-set_annotations.csv" ]; then
    curl --fail -o 'test-set_annotations.tar.gz' -L 'https://zenodo.org/records/14019655/files/test-set_annotations.tar.gz?download=1'
    tar xzvf 'test-set_annotations.tar.gz' -C ../data
    rm 'test-set_annotations.tar.gz'
fi

In [3]:
# Load the annotated test set (one row per chain; a `locus` column tells
# heavy from light) and show its dimensions.
annotations_path = "../data/B_test-set_annotations.csv"
df = pd.read_csv(annotations_path)
df.shape

(129814, 22)

In [4]:
# list the annotation columns available for pairing and mutation counting
df.columns

Index(['sequence_id', 'sequence_aa', 'v_gene', 'd_gene', 'j_gene',
       'junction_aa', 'fwr1_aa', 'cdr1_aa', 'fwr2_aa', 'cdr2_aa', 'fwr3_aa',
       'cdr3_aa', 'fwr4_aa', 'v_identity', 'v_identity_aa', 'v_mutations',
       'v_mutations_aa', 'v_insertions', 'v_deletions', 'isotype', 'locus',
       'sequence'],
      dtype='object')

## re-pair the data and count mutations

re-pair heavy and light chains using `sequence_id`, count mutations using `v_mutations_aa` (could also use `v_identity` for similar results)

In [5]:
# count mutations
def _count_aa_mutations(annotation):
    """Return the number of ":" characters in one `v_mutations_aa` entry.

    The count of ":" is used as the mutation count (presumably one ":" per
    listed mutation -- the annotation format itself is not shown here).
    Non-string entries, such as the NaN pandas stores for an empty field,
    count as 0.
    """
    return annotation.count(":") if isinstance(annotation, str) else 0


# `.map` returns a Series that carries df's own index, so every count lands on
# the row it was computed from even if df no longer has the default RangeIndex
# (a freshly built pd.Series would be aligned by label instead).
df["v_mutation_count_aa"] = df["v_mutations_aa"].map(_count_aa_mutations)

In [6]:
# Split the annotation table by chain, keeping only the id, the amino-acid
# sequence, and the mutation count, and give the kept columns chain-specific
# names so the two tables can be merged side by side.
columns = ["sequence_id", "sequence_aa", "v_mutation_count_aa"]
is_heavy = df["locus"] == "IGH"

# heavy chains: rows whose locus is IGH
h = df.loc[is_heavy, columns].rename(
    columns={
        "sequence_aa": "heavy_chain",
        "v_mutation_count_aa": "v_mutation_count_heavy",
    }
)

# light chains: every remaining row (any locus other than IGH)
l = df.loc[~is_heavy, columns].rename(
    columns={
        "sequence_aa": "light_chain",
        "v_mutation_count_aa": "v_mutation_count_light",
    }
)

In [7]:
# Pair the sequences: inner-join heavy and light rows that share a
# sequence_id, then derive the concatenated pair string and the combined
# heavy + light v-gene mutation count.
paired = h.merge(l, on="sequence_id").assign(
    sequence_aa=lambda p: p["heavy_chain"] + "<cls><cls>" + p["light_chain"],
    v_all_mutation_count=lambda p: (
        p["v_mutation_count_heavy"] + p["v_mutation_count_light"]
    ),
)

paired.head(5)

Unnamed: 0,sequence_id,heavy_chain,v_mutation_count_heavy,light_chain,v_mutation_count_light,sequence_aa,v_all_mutation_count
0,00042cf7-cc92-aa6f-ca45-fe5c83987ffb,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLE...,0,QSVLTQPPSASGTPGQRVTISCSGSSSNIGSNYVYWYQQLPGTAPK...,0,QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLE...,0
1,00044d0a-d75a-3f0e-292c-00328712873e,QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLE...,0,DIQMTQSPSSLSASVGDRVTITCRASQGISNYLAWYQQKPGKVPKL...,0,QVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLE...,0
2,000d3462-cfa3-b4bd-6ad1-25e25a5af361,QVQLVESGGGVVQPGGSLRLSCAASGFTFSSYGMHWVRQAPGKGLE...,0,SSELTQDPAVSVALGQTVRITCQGDSLRSYYASWYQQKPGQAPVLV...,0,QVQLVESGGGVVQPGGSLRLSCAASGFTFSSYGMHWVRQAPGKGLE...,0
3,000d5166-f467-05bb-ab9d-56d67a24a207,QVQLVQSGPEVKESGASVRVSCKASGLSFTSFGFSWVRQAPGQGLE...,17,QSALTQPASVSGSPGQSITISCTGTSSDVGGYKYVSWYQQNPGKAP...,5,QVQLVQSGPEVKESGASVRVSCKASGLSFTSFGFSWVRQAPGQGLE...,22
4,00105f68-19a9-e309-39fb-43bc468e72ab,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYAMHWVRQAPGKGLE...,0,DIQMTQSPSSVSASVGDRVTITCRASQGISSWLAWYQQKPGKAPKL...,0,QVQLVESGGGVVQPGRSLRLSCAASGFTFSSYAMHWVRQAPGKGLE...,0


## select germline and mutated sequences, join to df with cdr masks

In [8]:
# Load the paired test set that carries the CDR masks; it is joined to the
# selections below on its "text" column.
test_set_path = "../data/B_test.csv"
test_df = pd.read_csv(test_set_path)
test_df.shape

(64907, 2)

In [9]:
# Germline pairs: no v-gene amino-acid mutations in either chain
# (combined count is exactly 0).
is_germline = paired["v_all_mutation_count"] == 0

# Rebuild the "heavy<cls><cls>light" string used as the join key in test_df.
germline_text = (
    paired.loc[is_germline, "heavy_chain"]
    + "<cls><cls>"
    + paired.loc[is_germline, "light_chain"]
)

# Add cdr masks: inner-join on "text" so only the key and test_df's columns remain.
germline = germline_text.to_frame(name="text").merge(test_df, on="text")
germline.shape

(38670, 2)

In [10]:
# Mutated pairs: combined v-gene mutation count is non-zero, i.e. at least one
# of the two chains carries a mutation (this is the complement of the germline
# selection, not "both chains mutated").
is_mutated = paired["v_all_mutation_count"] != 0

# Rebuild the "heavy<cls><cls>light" string used as the join key in test_df.
mutated_text = (
    paired.loc[is_mutated, "heavy_chain"]
    + "<cls><cls>"
    + paired.loc[is_mutated, "light_chain"]
)

# Add cdr masks: inner-join on "text" so only the key and test_df's columns remain.
mutated = mutated_text.to_frame(name="text").merge(test_df, on="text")
mutated.shape

(26237, 2)

## save dfs

for inference separated by germline and mutated sequences

In [11]:
# Write each subset to its own CSV in the notebook's directory, leaving out
# the DataFrame index column.
outputs = {
    "./B_germline_annotated_test.csv": germline,
    "./B_mutated_annotated_test.csv": mutated,
}
for out_path, subset in outputs.items():
    subset.to_csv(out_path, index=False)