In [1]:
from collections import Counter

import pandas as pd
import polars as pl

import abutils
import abstar

Import and Align SARS-CoV-2 mAbs

In [2]:
# Read the antibody table and keep only rows whose identifier starts with "TXG".
df = pd.read_csv('../TXG-20220218.csv')
is_txg = df["Identifier"].str.startswith("TXG")
txg_df = df.loc[is_txg]

# Both chain lists reuse the antibody identifier as the sequence id:
# `heavies` is built from the vj_seq1 column, `lights` from vj_seq2.
txg_ids = txg_df["Identifier"].tolist()
heavies = [abutils.Sequence(seq, id=name) for name, seq in zip(txg_ids, txg_df["vj_seq1"])]
lights = [abutils.Sequence(seq, id=name) for name, seq in zip(txg_ids, txg_df["vj_seq2"])]

# Run abstar over all chains at once, requesting AIRR-style output, then let
# abutils group the results into pairs (presumably by the shared id - confirm
# against the abutils documentation).
seqs = abstar.run([*heavies, *lights], output_type="airr")
pairs = abutils.core.pair.assign_pairs(seqs)


Running abstar...
(1/1) ||||||||||||||||||||||||||||||||||||||||||||||||||||  100%

478 sequences contained an identifiable rearrangement
abstar completed in 19.58 seconds



In [17]:
# Load the BALM per-position predictions, drop the first column of the file,
# and derive two helper columns from the "<alias>_pos<N>" label in `mAbs_pos`:
#   pos   - the integer N after "pos"
#   alias - the text before the first underscore
# Finally keep only rows whose max/wt probability ratio exceeds 1.
balm = (
    pd.read_csv("./TXG_mAbs_BALMMoE_maxmask2wt.csv")
    .iloc[:, 1:]
    .assign(
        pos=lambda table: table['mAbs_pos'].str.split('pos').str[1].astype(int),
        alias=lambda table: table['mAbs_pos'].str.split('_').str[0],
    )
    .loc[lambda table: table['max/wt_ratio'] > 1]
)

In [18]:
# Preview the predictions kept so far (rows with max/wt_ratio > 1).
# The wt != max_prob_aa filter is not applied at this point; the comparison
# cell further down applies it before matching against the real sequences.
balm

Unnamed: 0,mAbs_pos,wt,max_prob_aa,wt_prob,max_prob,max/wt_ratio,pos,alias
0,TXG-0001_pos0,Q,<pad>,2.329855e-04,0.000827,3.551209,0,TXG-0001
2,TXG-0001_pos2,Q,<cls>,1.061625e-03,0.001390,1.309399,2,TXG-0001
3,TXG-0001_pos3,L,-,6.574083e-04,0.001202,1.827829,3,TXG-0001
6,TXG-0001_pos6,S,<pad>,4.413233e-06,0.000360,81.525866,6,TXG-0001
12,TXG-0001_pos12,Q,<pad>,1.723004e-04,0.000758,4.396997,12,TXG-0001
...,...,...,...,...,...,...,...,...
54979,TXG-0239_pos236,K,B,2.354008e-07,0.000031,131.238602,236,TXG-0239
54980,TXG-0239_pos237,L,<pad>,2.189739e-07,0.000081,371.451718,237,TXG-0239
54981,TXG-0239_pos238,E,<pad>,2.689628e-07,0.000040,150.198451,238,TXG-0239
54982,TXG-0239_pos239,I,<pad>,1.762243e-07,0.000066,373.783042,239,TXG-0239


Compare Germline and Matured Abs For Accurate Predictions

In [19]:
# For every antibody pair, find the BALM predictions that sit on a position
# which really changed between the germline and the observed sequence.
#
# Outputs (module-level, consumed by the following cells):
#   rows      - one dict per matching BALM row (same keys as the balm columns)
#   true_muts - the observed residue at that position, parallel to `rows`
alii = balm['alias'].to_list()
true_muts = []
# Keep only rows where the top-scoring token differs from the input residue.
balm = balm[balm['wt'] != balm['max_prob_aa']]
rows = []

# Split the table per antibody once, rather than searching the alias list and
# re-filtering the whole table for every pair. Row order inside each group
# follows the original table order.
balm_by_alias = {alias: group for alias, group in balm.groupby('alias', sort=False)}

for p in pairs:
    # Both chains are needed to build the joined sequence; skip incomplete
    # pairs (assumes abutils leaves the attribute as None when a chain is
    # absent - confirm).
    if p.heavy is None or p.light is None:
        continue

    _id = p.heavy.annotations['sequence_id']
    sub_balm = balm_by_alias.get(_id)
    if sub_balm is None:
        continue

    # Heavy and light translations joined by two '<cls>' markers, once for the
    # observed alignment and once for the germline alignment.
    # NOTE(review): `pos` is used below as a character offset into these
    # strings, where each '<cls>' takes five characters. If BALM positions
    # count '<cls>' as a single token, light-chain offsets will not line up -
    # confirm how `pos` was produced.
    non_germ_seq = '<cls><cls>'.join((
        abutils.tl.translate(p.heavy["sequence_alignment"]),
        abutils.tl.translate(p.light["sequence_alignment"]),
    ))
    germ_seq = '<cls><cls>'.join((
        abutils.tl.translate(p.heavy["germline_alignment"]),
        abutils.tl.translate(p.light["germline_alignment"]),
    ))
    usable_len = min(len(germ_seq), len(non_germ_seq))

    for balm_row in sub_balm.to_dict('records'):
        pos = balm_row['pos']
        # Index the predicted position directly; positions outside either
        # string cannot be compared and are skipped instead of raising.
        if not 0 <= pos < usable_len:
            continue
        germ_aa = germ_seq[pos]
        observed_aa = non_germ_seq[pos]
        # Keep the row only when the table's `wt` residue is the germline
        # residue and the observed sequence carries a different residue there.
        if germ_aa == balm_row['wt'] and observed_aa != germ_aa:
            true_muts.append(observed_aa)
            rows.append(balm_row)

In [20]:
# Build the results table from the collected rows and attach the observed
# residue for each one as `true_mutation`.
df = pd.DataFrame(rows).assign(true_mutation=true_muts)

# Keep only rows whose predicted token differs from `wt`, then save them.
prediction_differs = df['wt'].ne(df['max_prob_aa'])
df = df.loc[prediction_differs]
df.to_csv('./TXG_mAbs_BALMMoE_truepredictions_pos.csv')

In [21]:
# Amino acids grouped into the five classes used by this notebook; a
# prediction counts as "chemically correct" when the predicted residue and the
# observed residue fall in the same group.
aa_chem_l = [['A', 'G', 'I', 'L', 'M', 'V'], ['C', 'S', 'T', 'P', 'N', 'Q'],
             ['D', 'E'], ['K', 'R', 'H'], ['F', 'Y', 'W']]
# Residue -> group number. Anything that is not one of the listed residues
# (e.g. '<pad>', '<cls>', '-') has no entry and therefore never matches.
aa_to_chem_class = {
    aa: class_idx
    for class_idx, group in enumerate(aa_chem_l)
    for aa in group
}

rows_chem = []
for _, pred_row in df.iterrows():
    # Columns are addressed by name rather than position so the check keeps
    # working if the column order changes.
    pred_class = aa_to_chem_class.get(pred_row['max_prob_aa'])
    if pred_class is not None and pred_class == aa_to_chem_class.get(pred_row['true_mutation']):
        rows_chem.append(pred_row)

# Passing the columns explicitly keeps the header even when nothing matched.
df2 = pd.DataFrame(rows_chem, columns=df.columns)
df2.to_csv('./TXG_mAbs_BALMMoE_truepredictions_chem.csv')

In [22]:
# Exact hits: rows where the predicted token equals the observed residue.
exact_hit = df['max_prob_aa'].eq(df['true_mutation'])
df3 = df.loc[exact_hit]
df3.to_csv('./TXG_mAbs_BALMMoE_truepredictions.csv')