In [1]:
from collections import Counter

import pandas as pd
import polars as pl

import abutils
import abstar

Import and Align SARS-CoV-2 mAbs

In [2]:
# Load the antibody table and keep only the entries whose identifier starts with "TXG".
# na=False makes rows with a missing Identifier count as non-matching; without it the
# mask contains NaN and pandas refuses to use it for boolean indexing.
df = pd.read_csv('../TXG-20220218.csv')
txg_df = df[df["Identifier"].str.startswith("TXG", na=False)]

# Wrap both chain columns as abutils Sequences. The two chains of one antibody share
# the same Identifier as their sequence id.
# NOTE(review): presumably vj_seq1 is the heavy chain and vj_seq2 the light chain
# (going by the variable names) -- confirm against the CSV schema.
heavies = [
    abutils.Sequence(seq, id=name)
    for name, seq in zip(txg_df["Identifier"], txg_df["vj_seq1"])
]
lights = [
    abutils.Sequence(seq, id=name)
    for name, seq in zip(txg_df["Identifier"], txg_df["vj_seq2"])
]

# Annotate every chain in a single abstar run (AIRR output), then let abutils
# assign the annotated sequences to pairs.
seqs = abstar.run(heavies + lights, output_type="airr")
pairs = abutils.core.pair.assign_pairs(seqs)


Running abstar...
(1/1) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%

478 sequences contained an identifiable rearrangement
abstar completed in 10.98 seconds



In [24]:
# Load the per-position BALM predictions and derive the lookup columns used later.
# 'mAbs_pos' values look like "TXG-0001_pos2": the text before the first '_' is the
# antibody alias and the text after 'pos' is the integer position.
balm = (
    pd.read_csv("./TXG_mAbs_BALM_maxmask2wt.csv")
    .iloc[:, 1:]  # drop the first CSV column
    .assign(
        pos=lambda frame: frame['mAbs_pos'].str.split('pos').str[1].astype(int),
        alias=lambda frame: frame['mAbs_pos'].str.split('_').str[0],
    )
    # keep only positions where the top prediction is more probable than the wild type
    .loc[lambda frame: frame['max/wt_ratio'] > 1]
)

In [25]:
# Preview the BALM predictions that remain after the max/wt_ratio > 1 filter.
# Predictions equal to the wild-type residue (wt == max_prob_aa) are still present
# here; that filter is applied in the germline-vs-mature comparison cell.
balm

Unnamed: 0,mAbs_pos,wt,max_prob_aa,wt_prob,max_prob,max/wt_ratio,pos,alias
2,TXG-0001_pos2,Q,<mask>,0.000280,0.000884,3.156083,2,TXG-0001
3,TXG-0001_pos3,L,<mask>,0.000313,0.002279,7.273050,3,TXG-0001
17,TXG-0001_pos17,L,<unk>,0.001024,0.003675,3.588861,17,TXG-0001
19,TXG-0001_pos19,L,<s>,0.000524,0.002967,5.658811,19,TXG-0001
26,TXG-0001_pos26,F,<s>,0.003599,0.003817,1.060580,26,TXG-0001
...,...,...,...,...,...,...,...,...
55194,TXG-0239_pos234,Q,<mask>,0.000066,0.003878,58.993024,234,TXG-0239
55195,TXG-0239_pos235,G,</s>,0.000530,0.011608,21.922151,235,TXG-0239
55196,TXG-0239_pos236,T,<pad>,0.003367,0.003978,1.181368,236,TXG-0239
55197,TXG-0239_pos237,K,<pad>,0.001343,0.001758,1.308926,237,TXG-0239


Compare Germline and Matured Abs For Accurate Predictions

In [26]:
# For every antibody pair, compare the translated germline and mature ("non-germline")
# sequences at each BALM-predicted position. A prediction is kept when the germline
# residue at that position equals the prediction's `wt` residue and the mature sequence
# carries a different residue there.
#   rows      -> the matching BALM prediction rows (as dicts)
#   true_muts -> the mature residue observed at each of those positions (same order)
alii = balm['alias'].to_list()
known_aliases = set(alii)  # constant-time membership test inside the loop

# Only predictions that differ from the wild-type residue are of interest.
balm = balm[balm['wt'] != balm['max_prob_aa']]

true_muts = []
rows = []
for p in pairs:
    # Both chains are needed to build the concatenated sequence; skip incomplete pairs.
    if p.heavy is None or p.light is None:
        continue

    _id = p.heavy.annotations['sequence_id']
    if _id not in known_aliases:
        continue

    sub_balm = balm[balm['alias'] == _id]
    if sub_balm.empty:
        continue  # nothing to check, so skip the translation work

    # Heavy and light chains are joined with the same '<cls><cls>' separator for both
    # the mature and the germline sequence, so string indices line up between the two.
    # NOTE(review): '<cls><cls>' occupies 10 characters in these strings. If `pos` is a
    # token index (each separator token counting as one position), light-chain positions
    # would be offset here -- confirm how `pos` was generated.
    non_germ_seq = f'{abutils.tl.translate(p.heavy["sequence_alignment"])}<cls><cls>{abutils.tl.translate(p.light["sequence_alignment"])}'
    germ_seq = f'{abutils.tl.translate(p.heavy["germline_alignment"])}<cls><cls>{abutils.tl.translate(p.light["germline_alignment"])}'
    comparable_len = min(len(germ_seq), len(non_germ_seq))

    for row in range(len(sub_balm.index)):
        prediction = sub_balm.iloc[row]
        pos = int(prediction['pos'])

        # Index the predicted position directly instead of scanning the whole sequence;
        # positions outside either sequence cannot be compared.
        if not 0 <= pos < comparable_len:
            continue

        germ_aa = germ_seq[pos]
        mature_aa = non_germ_seq[pos]
        if germ_aa == prediction['wt'] and mature_aa != germ_aa:
            true_muts.append(mature_aa)
            rows.append(dict(prediction))

In [31]:
# Assemble the matched prediction rows into a frame, attach the residue actually
# observed in the mature sequence, and keep only predictions that differ from the
# wild type before writing the position-level matches to disk.
df = (
    pd.DataFrame(rows)
    .assign(true_mutation=true_muts)
    .loc[lambda frame: frame['wt'] != frame['max_prob_aa']]
)
df.to_csv('./TXG_mAbs_BALM_truepredictions_pos.csv')

In [32]:
# Amino acids grouped by side-chain chemistry
# (in order: nonpolar/aliphatic, polar, acidic, basic, aromatic).
aa_chem_l = [['A', 'G', 'I', 'L', 'M', 'V'], ['C', 'S', 'T', 'P', 'N', 'Q'],
             ['D', 'E'], ['K', 'R', 'H'], ['F', 'Y', 'W']]

# Map every residue to the index of its group. Anything that is not one of the
# residues above (e.g. special tokens such as '<mask>') maps to NaN and never matches.
aa_to_group = {aa: idx for idx, group in enumerate(aa_chem_l) for aa in group}
predicted_group = df['max_prob_aa'].map(aa_to_group)
observed_group = df['true_mutation'].map(aa_to_group)

# Keep predictions whose predicted residue falls in the same chemistry group as the
# residue actually observed in the mature sequence. Filtering the frame directly
# (rather than rebuilding it row by row) preserves dtypes and the column layout even
# when no row qualifies.
df2 = df[predicted_group.notna() & (predicted_group == observed_group)]
df2.to_csv('./TXG_mAbs_BALM_truepredictions_chem.csv')

In [33]:
# Exact hits: the model's top predicted residue is the very residue found in the
# mature sequence. Written out as the strictest of the three prediction tables.
df3 = df.loc[df['max_prob_aa'].eq(df['true_mutation'])]
df3.to_csv('./TXG_mAbs_BALM_truepredictions.csv')