In [346]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from collections import Counter

In [4]:
vdjdb_slim_df = pd.read_csv('../vdjdb-dump/vdjdb.slim.txt', sep='\t')

In [194]:
vdjdb_slim_df.sample(3)

Unnamed: 0,gene,cdr3,species,antigen.epitope,antigen.gene,antigen.species,complex.id,v.segm,j.segm,v.end,j.start,mhc.a,mhc.b,mhc.class,reference.id,vdjdb.score
7497,TRB,CSVENLPEAFF,HomoSapiens,KLGGALQAK,IE1,CMV,13218,TRBV29-1*01,TRBJ1-1*01,4,7,HLA-A*03:01,B2M,MHCI,https://www.10xgenomics.com/resources/applicat...,0
57038,TRB,CSARWRYSRGPDTQYF,HomoSapiens,GILGFVFTL,M,InfluenzaA,0,TRBV20-1*01,TRBJ2-3*01,4,11,HLA-A*02,B2M,MHCI,PMID:28423320,0
60897,TRB,CASSSPRDSAETLYF,MusMusculus,SSYRRPVGI,PB1,InfluenzaA,23121,TRBV19*01,TRBJ2-3*01,4,8,H-2Kb,B2M,MHCI,PMID:28636592,1


In [144]:
def filter_df(df, species, antigen_epitope, tr_chain, cdr3_len):
    return df[(df['species'] == species) &
              (df['antigen.epitope'] == antigen_epitope) &
              (df['gene'] == tr_chain) &
              (df.cdr3.str.len() == cdr3_len) &
              (df['j.start'] > df['v.end']) &
              (df['v.end'] > 0)].copy()

In [238]:
def get_concervative_pos(df, q):
    start_pos = np.percentile(df['v.end'], q=q, interpolation='nearest')
    end_pos = np.percentile(df['j.start'], q=(100-q), interpolation='nearest') - 1
    return start_pos, end_pos

In [483]:
def get_blosum_matrix(df, start_pos, end_pos):
    cdr3_full = df.cdr3.apply(lambda x: pd.Series(list(x))).values
    cdr3 = cdr3_full[:, start_pos:(end_pos+1)]
    acids = np.unique(cdr3)
    acids2idx = dict([(acid, idx) for idx, acid in enumerate(acids)])
    acids_n = acids.shape[0]

    # Frequency table
    F_ij = np.zeros([acids_n, acids_n])
    for j in range(0, cdr3.shape[1]):
        column_acids = cdr3[:, j]
        cnt = Counter(column_acids)
        for i_acid, i in acids2idx.items():
            for j_acid, j in acids2idx.items():
                if (i_acid == j_acid) and cnt[i_acid] > 1:
                    F_ij[i, j] += ((cnt[i_acid] - 1) * (cnt[i_acid] - 2) / 2)
                else:
                    F_ij[i, j] += cnt[i_acid] * cnt[j_acid] * 2
    
    # Observed probability
    rows = cdr3.shape[0]
    columns = cdr3.shape[1]
    Q_ij = F_ij / (columns * rows * (rows-1) / 2)

    # Expected probability
    P_i = np.sum(Q_ij * np.eye(acids_n, dtype=bool), axis=1) # i == j
    UpT = np.triu(np.ones([acids_n, acids_n]))
    np.fill_diagonal(UpT, 0)
    P_i += np.sum(UpT * Q_ij, axis=1) / 2 # i != j, upper triangle
    
    # Expected frequencies
    E_ij = np.zeros([acids_n, acids_n])
    for i in range(0, acids_n):
        for j in range(0, acids_n):
            if i == j:
                E_ij[i, j] = P_i[i] * P_i[i]
            else:
                E_ij[i, j] = 2 * P_i[i] * P_i[j]
    
    # The log-odds ration
    L = Q_ij / E_ij
    L = np.round(np.log2(Q_ij / E_ij) * 2)
    return L, acids

#### Homo Sapiens, GIL antigen

In [236]:
CDR3_LEN = 14
df = filter_df(vdjdb_slim_df, 'HomoSapiens', 'GILGFVFTL', 'TRB', CDR3_LEN)

In [255]:
cdr3_start_pos, cdr3_end_pos = get_concervative_pos(df, 62)
print(f'CDR3 start position: {cdr3_start_pos}, end position: {cdr3_end_pos}')

CDR3 start position: 4, end position: 7


In [241]:
df['v_len_arr'] = df['v.end']
df['cdr3_len_arr'] = df['j.start'] - df['v.end']
df['j_len_arr'] = CDR3_LEN - df['j.start']

In [253]:
plot_sample_n = 100
plot_df = df.sample(plot_sample_n, random_state=42).sort_values(['cdr3_len_arr', 'v_len_arr'])
traces = []
traces.append(go.Bar(x=plot_df.v_len_arr, orientation='h', name='V'))
traces.append(go.Bar(x=plot_df.cdr3_len_arr, orientation='h', name='CDR3'))
traces.append(go.Bar(x=plot_df.j_len_arr, orientation='h', name='J'))
traces.append(go.Scatter(x=[cdr3_start_pos+1, cdr3_start_pos+1], y=[0, plot_sample_n],
                         name='CDR3 start, 80%', line={'color': 'black'}))
traces.append(go.Scatter(x=[cdr3_end_pos+1, cdr3_end_pos+1], y=[0, plot_sample_n],
                         name='CDR3 end, 80%', line={'color': 'black'}))
fig = go.Figure(data=traces)
fig.update_layout(barmode='stack', title='HomoSapiens, GIL, TRB, Len: 14')
fig.show()

In [519]:
L, acids = get_blosum_matrix(df, start_pos, end_pos)
result = pd.DataFrame(L, index=acids, columns=acids)
result


divide by zero encountered in log2



Unnamed: 0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
A,-3.0,-5.0,-2.0,-4.0,-1.0,0.0,0.0,-1.0,-1.0,0.0,0.0,-0.0,1.0,1.0,1.0,1.0,4.0,7.0,8.0,6.0
C,-5.0,-inf,-6.0,-5.0,-inf,-6.0,-inf,-8.0,-1.0,-6.0,-0.0,-2.0,-4.0,-3.0,2.0,4.0,1.0,2.0,4.0,-inf
D,-2.0,-6.0,-2.0,-2.0,-1.0,-0.0,1.0,-1.0,-1.0,0.0,0.0,2.0,0.0,3.0,1.0,-2.0,4.0,6.0,7.0,8.0
E,-4.0,-5.0,-2.0,2.0,-4.0,-1.0,-2.0,-1.0,4.0,-3.0,-3.0,-1.0,3.0,-1.0,-1.0,-3.0,6.0,5.0,6.0,6.0
F,-1.0,-inf,-1.0,-4.0,-1.0,-0.0,1.0,2.0,0.0,3.0,3.0,-1.0,3.0,3.0,2.0,-2.0,4.0,7.0,9.0,6.0
G,0.0,-6.0,-0.0,-1.0,-0.0,-0.0,1.0,-0.0,1.0,1.0,1.0,1.0,2.0,3.0,3.0,-0.0,5.0,7.0,8.0,7.0
H,0.0,-inf,1.0,-2.0,1.0,1.0,-1.0,1.0,0.0,2.0,2.0,2.0,3.0,3.0,3.0,-1.0,5.0,8.0,9.0,8.0
I,-1.0,-8.0,-1.0,-1.0,2.0,-0.0,1.0,2.0,2.0,4.0,4.0,-2.0,4.0,4.0,2.0,0.0,4.0,8.0,10.0,6.0
K,-1.0,-1.0,-1.0,4.0,0.0,1.0,0.0,2.0,3.0,2.0,2.0,-1.0,5.0,2.0,2.0,0.0,7.0,7.0,9.0,6.0
L,0.0,-6.0,0.0,-3.0,3.0,1.0,2.0,4.0,2.0,2.0,4.0,0.0,5.0,4.0,3.0,-0.0,5.0,8.0,10.0,7.0
