In [1]:
import os 
import sys
# Add ../src to the import search path (presumably where the local model
# wrapper modules imported in the next cell live -- TODO confirm).
sys.path.append("../src")

In [2]:
from antiberty_model import Antiberty
from ablang_model import Ablang
from ESM_model import ESM
from sapiens_model import Sapiens
from protbert import ProtBert

In [4]:
import pandas as pd
import numpy as np
from Bio.Align import PairwiseAligner

def compute_evo_velocity(sequence_1, sequence_2, model):
    """Return the mean evo-velocity score between two sequences under ``model``.

    The two sequences are aligned with Biopython's ``PairwiseAligner``
    (default mode; match 5, mismatch -4, gap open -4, gap extend -0.1) and only
    the first alignment returned is used. For every aligned, gap-free position
    at which the residues differ, the score

        P_1.loc[pos_1, residue_2] - P_2.loc[pos_2, residue_1]

    is accumulated, where ``P_k = model.calc_probability_matrix(sequence_k)``.
    The sum is divided by the number of such mismatched positions.

    Parameters
    ----------
    sequence_1, sequence_2 : str
        Sequences to compare.
    model : object
        Must provide ``calc_probability_matrix(sequence)`` whose result can be
        indexed as ``.loc[position, residue]`` (presumably a DataFrame with
        0-based positions as rows and residue letters as columns -- TODO
        confirm against the model wrappers).

    Returns
    -------
    float
        Mean score over the mismatched aligned positions, or ``0.0`` when the
        alignment contains no mismatch.
    """
    aligner = PairwiseAligner()
    aligner.extend_gap_score = -0.1
    aligner.match_score = 5
    aligner.mismatch_score = -4
    aligner.open_gap_score = -4

    alignment = aligner.align(sequence_1, sequence_2)[0]

    # Collect the mismatched position pairs first, so that the model is only
    # queried when there is something to score. Iterating with zip() works
    # whether `aligned` is a NumPy array of shape (2, n_blocks, 2) or nested
    # tuples (older Biopython releases returned the latter).
    mismatches = []
    for (start_1, end_1), (start_2, _end_2) in zip(*alignment.aligned):
        # Within an aligned block both sequences advance in lock-step.
        for offset in range(end_1 - start_1):
            pos_1 = start_1 + offset
            pos_2 = start_2 + offset
            if sequence_1[pos_1] != sequence_2[pos_2]:
                mismatches.append((pos_1, pos_2))

    if not mismatches:
        return 0.0

    prob_mat_1 = model.calc_probability_matrix(sequence_1)
    prob_mat_2 = model.calc_probability_matrix(sequence_2)

    evo_velo = 0.0
    for pos_1, pos_2 in mismatches:
        amino_acid_1 = sequence_1[pos_1]
        amino_acid_2 = sequence_2[pos_2]
        # Probability of seeing sequence_2's residue in sequence_1's context,
        # minus the probability of the reverse substitution.
        evo_velo += (prob_mat_1.loc[pos_1, amino_acid_2] - prob_mat_2.loc[pos_2, amino_acid_1])

    return evo_velo / len(mismatches)



In [5]:
# Read the combined VDJ table and build the AbLang wrapper for heavy chains.
VDJ_CSV_PATH = "../../../data/OVA_mouse/vdj_evolike_combine.csv"
data = pd.read_csv(VDJ_CSV_PATH)
model = Ablang(chain="heavy")

In [6]:
# Maps "<sample>_<clonotype>" keys to the evo-velocity matrix (DataFrame)
# computed for that clonotype by the loop further down in this cell.
output_dict = {}
from tqdm import tqdm
def unique_id_gen(series):
    """Return the first element of ``series`` as its representative id.

    Raises ``IndexError`` when ``series`` is empty.
    """
    members = list(series)
    return members[0]

class _CachedProbabilityModel:
    """Proxy that memoises ``calc_probability_matrix`` per sequence.

    ``compute_evo_velocity`` requests the probability matrix of both sequences
    on every call, so an all-vs-all comparison would repeat the same model
    inference for every pair. This proxy exposes only
    ``calc_probability_matrix`` and computes each sequence's matrix once.
    The cached matrices are shared between calls and must be treated as
    read-only by the consumer.
    """

    def __init__(self, wrapped_model):
        self._wrapped_model = wrapped_model
        self._cache = {}

    def calc_probability_matrix(self, sequence):
        if sequence not in self._cache:
            self._cache[sequence] = self._wrapped_model.calc_probability_matrix(sequence)
        return self._cache[sequence]


for sample in pd.unique(data["sample_id"]):

    data_sample = data.loc[data["sample_id"] == sample, :]

    for clonotype in tqdm(pd.unique(data_sample["clonotype_id_10x"])):

        data_sample_clonotype = data_sample.loc[data_sample["clonotype_id_10x"] == clonotype, :]
        # One row per distinct amino-acid sequence, labelled with the first
        # barcode that carries it.
        data_sample_clonotype = (
            data_sample_clonotype
            .groupby("VDJ_sequence_aa")["barcode"]
            .agg(unique_id_gen)
            .reset_index()
        )

        sequences = data_sample_clonotype["VDJ_sequence_aa"].tolist()
        barcodes = data_sample_clonotype["barcode"]
        n_sequences = len(sequences)

        # Fresh cache per clonotype keeps memory bounded by the clonotype size.
        cached_model = _CachedProbabilityModel(model)

        # Float buffer: the velocities are floats, so they are not written
        # into an integer-initialised frame (pandas deprecates that upcast).
        velocities = np.zeros((n_sequences, n_sequences), dtype=float)
        for i in range(n_sequences):
            for j in range(i + 1, n_sequences):
                velocity = compute_evo_velocity(sequences[i], sequences[j], model=cached_model)
                # The matrix is antisymmetric: entry [j, i] is the negation of [i, j].
                velocities[i, j] = velocity
                velocities[j, i] = -velocity

        evo_velo_matrix = pd.DataFrame(velocities, index=barcodes, columns=barcodes)

        output_dict[f"{sample}_{clonotype}"] = evo_velo_matrix


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1378/1378 [04:41<00:00,  4.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1277/1277 [01:50<00:00, 11.52it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 607/607 [00:17<00:00, 35.71it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 832/832 [01:18<00:00, 10.66it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1301/1301 [01:22<00:00, 15.81it/s]


In [6]:
# Four single-letter labels (scratch values; not referenced by the cells visible here).
x = list("ABCD")

In [7]:
# "<sample>_<clonotype>" keys of the stored matrices, in insertion order.
keys = [*output_dict]