Here I define two functions, dissimilarity_index() and dissimilarity_index_blosum(), to calculate the dissimilarity index within a given sample of CDR3 beta chain receptor sequences. 
The former is based on the Levenshtein distance only and was used in Bravi et al., PLoS Comput. Biol. 2021; the latter uses the BLOSUM62 matrix to weight substitutions and was used in Luksza et al., Nature 2022. 

In [None]:
# Root folder of the project; it is concatenated with '/Example' further down to locate the input file.
# FOLDER is a placeholder: replace it with the path where PGM, Align_utils and the other
# rbm_tcell files are saved, for example '/user/rbm_tcell'.
rootf = FOLDER ## <- replace FOLDER with your own path ##

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Default NumPy dtypes shared by the helpers below.
# curr_float is not referenced in the visible code.
curr_float = np.float32
# dtype of the integer-encoded sequences built by convert_number() and tested for in convert_letter().
curr_int = np.int16

def convert_number(seqs):
    """Encode already-aligned sequences as an integer array.

    Each residue is replaced by its position in the 21-symbol alphabet
    (20 amino acids in the order below, followed by the gap symbol '-').

    Parameters
    ----------
    seqs : sequence of str
        Aligned sequences; presumably all of the same length so that the
        result is a rectangular array -- confirm against callers.

    Returns
    -------
    numpy.ndarray
        C-ordered array of dtype ``curr_int``, one row per sequence.

    Raises
    ------
    KeyError
        If a sequence contains a symbol outside the alphabet.
    """
    alphabet = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V',  'W', 'Y','-']
    code_of = {letter: code for code, letter in enumerate(alphabet)}
    encoded = [[code_of[residue] for residue in sequence] for sequence in seqs]
    return np.array(encoded, dtype=curr_int, order="c")

def convert_letter(seqs_n):
    """Decode integer-encoded sequences back to amino-acid strings.

    This is the inverse of the numeric encoding: code ``k`` maps to the
    k-th symbol of the 21-symbol alphabet (20 amino acids, then '-').

    Parameters
    ----------
    seqs_n : sequence of int, or sequence of sequences of int
        Either a single encoded sequence (1-D) or a collection of encoded
        sequences (2-D). Any integer type is accepted, not only ``np.int16``.

    Returns
    -------
    list of str
        One string per encoded sequence; a single 1-D input yields a
        one-element list. Empty input yields an empty list.

    Raises
    ------
    KeyError
        If a code is outside the range of the alphabet.
    """
    aa = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V',  'W', 'Y','-']
    aadictinv = dict(enumerate(aa))

    if len(seqs_n) == 0:
        return []

    # A scalar first element means a single sequence was passed; wrap it so
    # that both cases share the same decoding path. Testing dimensionality
    # instead of the exact element type also covers Python ints and NumPy
    # integers of any width.
    if np.ndim(seqs_n[0]) == 0:
        seqs_n = [seqs_n]

    return [''.join(aadictinv[e] for e in row) for row in seqs_n]

def flatten_list(listoflist):
    """Flatten one level of nesting.

    Parameters
    ----------
    listoflist : sequence of sequences
        The nested collection to flatten.

    Returns
    -------
    list
        All elements of the inner sequences, in their original order.
    """
    return [item for inner in listoflist for item in inner]

import editdistance
from math import log, exp

# Plots stuff
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib import patches
from pandas.plotting import table
# Select the Garuda typeface for every figure produced in this notebook.
# NOTE(review): presumably the Garuda font must be installed on the machine -- confirm.
mpl.rcParams['font.family'] = ['Garuda']
mpl.rcParams['font.serif'] = ['Garuda-Oblique']

In [None]:
from weighted_levenshtein import lev 
## Requires the weighted-levenshtein package, available at https://github.com/infoscout/weighted-levenshtein/tree/master/weighted_levenshtein

from Bio.Blast import NCBIXML
from Bio import pairwise2
from Bio.SubsMat import MatrixInfo as matlist
# BLOSUM62 scores, looked up below with (residue, residue) tuples.
# NOTE(review): Bio.SubsMat appears to be absent from recent Biopython
# releases -- confirm that the installed version still provides it.
matrix = matlist.blosum62

## Substitution costs derived from the BLOSUM62 matrix ##
# 128 x 128 table indexed by ASCII code; every pair that is not an
# amino-acid pair keeps the initial cost of 1.
substitute_costs = np.ones((128, 128), dtype=np.float64)
aa = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V',  'W', 'Y']
costs = []  # not used anywhere in the visible code
for A in aa:
    for B in aa:
        # A pair may be stored in one orientation only, so fall back to (B, A).
        pair = (A, B) if (A, B) in matrix.keys() else (B, A)
        cost = matrix[pair]
        # Turn the similarity score into a cost: the higher the score,
        # the cheaper the substitution (equivalent to 22 - cost - 11).
        substitute_costs[ord(A), ord(B)] = 11 - cost

## Insertion/deletion cost, larger than every substitution cost ##
costL = np.max(substitute_costs) + 1
insert_costs = np.ones(128, dtype=np.float64)
delete_costs = np.ones(128, dtype=np.float64)
for A in aa:
    insert_costs[ord(A)] = delete_costs[ord(A)] = costL

def lev_weighted_AA(seq1, seq2):
    """Return the weighted Levenshtein distance between two sequences.

    Substitutions are priced with the module-level ``substitute_costs``
    table; the computation itself is delegated to ``lev``.

    NOTE(review): ``insert_costs`` and ``delete_costs`` are built next to
    ``substitute_costs`` but are not passed to ``lev`` here, so insertions
    and deletions are priced with whatever ``lev`` uses by default --
    confirm that this is intended.

    Parameters
    ----------
    seq1, seq2 : str
        The two sequences to compare.

    Returns
    -------
    The distance value returned by ``lev``.
    """
    return lev(seq1, seq2, substitute_costs=substitute_costs)

In [None]:
def dissimilarity_index(seq, Ntop, dist_sdev=5.7, distance=None):
    """Dissimilarity index of the first ``Ntop`` sequences (Levenshtein version).

    Version used in Bravi et al., PLoS Comput. Biol. 2021.

    Every unordered pair of sequences contributes a Gaussian overlap
    ``exp(-(d / dist_sdev)**2)``, where ``d`` is the distance between the
    two sequences. The index is the inverse of the mean overlap.

    Parameters
    ----------
    seq : sequence of str
        The sequences, indexable by position.
    Ntop : int
        Number of leading sequences to use. Values larger than ``len(seq)``
        are clamped to ``len(seq)``; values below 2 leave no pair to compare.
    dist_sdev : float, optional
        Width of the Gaussian kernel (default 5.7).
    distance : callable, optional
        Function ``distance(a, b)`` returning the distance between two
        sequences. Defaults to ``editdistance.eval`` (plain Levenshtein).

    Returns
    -------
    float
        ``1 / mean_overlap``; ``0.0`` when there is no pair to compare and
        ``inf`` when every overlap underflows to zero.
    """
    n_top = max(0, min(Ntop, len(seq)))
    top = [seq[i] for i in range(n_top)]

    n_pairs = n_top * (n_top - 1) // 2
    if n_pairs == 0:
        return 0.0

    # Resolved only once a pair actually has to be compared.
    if distance is None:
        distance = editdistance.eval

    overlap_sum = 0.0
    for r, ss1 in enumerate(top):
        for ss2 in top[r + 1:]:
            overlap_sum += exp(-1.0 * (distance(ss1, ss2) / dist_sdev) ** 2)

    p0 = overlap_sum / n_pairs
    # exp() underflows to exactly 0.0 for very distant sequences; report an
    # infinite dissimilarity instead of raising ZeroDivisionError.
    if p0 == 0.0:
        return float('inf')
    return 1.0 / p0

In [None]:
def dissimilarity_index_blosum(seq, Ntop, dist_sdev=9.37, distance=None):
    """Dissimilarity index of the first ``Ntop`` sequences (BLOSUM-weighted version).

    Version used in Luksza et al., Nature 2022.

    Every unordered pair of sequences contributes a Gaussian overlap
    ``exp(-(d / dist_sdev)**2)``, where ``d`` is the distance between the
    two sequences. The index is the inverse of the mean overlap.

    Parameters
    ----------
    seq : sequence of str
        The sequences, indexable by position.
    Ntop : int
        Number of leading sequences to use. Values larger than ``len(seq)``
        are clamped to ``len(seq)``; values below 2 leave no pair to compare.
    dist_sdev : float, optional
        Width of the Gaussian kernel (default 9.37).
    distance : callable, optional
        Function ``distance(a, b)`` returning the distance between two
        sequences. Defaults to ``lev_weighted_AA``.

    Returns
    -------
    float
        ``1 / mean_overlap``; ``0.0`` when there is no pair to compare and
        ``inf`` when every overlap underflows to zero.
    """
    n_top = max(0, min(Ntop, len(seq)))
    top = [seq[i] for i in range(n_top)]

    n_pairs = n_top * (n_top - 1) // 2
    if n_pairs == 0:
        return 0.0

    # Resolved only once a pair actually has to be compared.
    if distance is None:
        distance = lev_weighted_AA

    overlap_sum = 0.0
    for r, ss1 in enumerate(top):
        for ss2 in top[r + 1:]:
            overlap_sum += exp(-1.0 * (distance(ss1, ss2) / dist_sdev) ** 2)

    p0 = overlap_sum / n_pairs
    # exp() underflows to exactly 0.0 for very distant sequences; report an
    # infinite dissimilarity instead of raising ZeroDivisionError.
    if p0 == 0.0:
        return float('inf')
    return 1.0 / p0

In [None]:
## Load the top-scoring sequences of the example dataset and estimate
## the CDR3 dissimilarity index

file_name = 'example_file' 
path_in = rootf  + '/Example'
name_f = path_in + '/' + file_name + '_post_top100.txt'
seqs = []
with open(name_f) as f:
    for line in f:
        # The sequence is the first tab-separated field of each line.
        linesplit = line.strip().split('\t')
        # Skip blank lines: they would otherwise be added as empty
        # sequences and counted in the index.
        if not linesplit[0]:
            continue
        seqs.append(linesplit[0])

# Use every sequence that was read.
Ntop = len(seqs)

print(dissimilarity_index_blosum(seqs, Ntop))
print(dissimilarity_index(seqs, Ntop))