In [1]:
%matplotlib inline
import pandas as pd

# Location of the tab-separated structure table (relative path).
TABLE_PATH = '../../result/structure.txt'
# Cutoff compared with `<` against the 'distance' column when building the
# per-structure boolean matrices.
DISTANCE_THRESHOLD = 8.0

### 1. Preprocess

Load the data table for further processing

In [2]:
# pd.read_table already returns a DataFrame, so no extra pd.DataFrame(...)
# wrapper is needed around it.
table = pd.read_table(TABLE_PATH, sep='\t')

Select the CDR3-region rows of the table for each chain (alpha/beta) and group them by PDB id

In [3]:
# Row masks: CDR3 region only, split by chain using the V-allele name prefix.
# na=False makes rows with a missing 'tcr_v_allele' evaluate to False instead
# of propagating NaN into the boolean mask.
cdr3_filter = table['tcr_region'] == 'CDR3'
alpha_CDR3_filter = cdr3_filter & table['tcr_v_allele'].str.startswith('TRA', na=False)
beta_CDR3_filter = cdr3_filter & table['tcr_v_allele'].str.startswith('TRB', na=False)

# One group per structure (keyed by 'pdb_id') for each chain.
grouped_alpha = table[alpha_CDR3_filter].groupby('pdb_id')
grouped_beta = table[beta_CDR3_filter].groupby('pdb_id')

# Structure ids, taken from the alpha-chain groups. Materialised as a list so
# the object is of the same kind on Python 2 and Python 3.
pnames = list(grouped_alpha.groups.keys())

### 2. Get interaction frequency matrix

In [4]:
def get_matrix(fm, col):
    """Pivot the rows of one structure into an antigen-by-TCR matrix of `col`.

    Parameters
    ----------
    fm : pandas.DataFrame
        Rows of a single group, e.g. ``gr.get_group('1ao7')``. The columns
        read here are 'len_antigen', 'pos_antigen', 'pos_tcr', 'aa_antigen',
        'aa_tcr' and `col`.
    col : str
        Name of the value column; either 'distance' or 'energy'.

    Returns
    -------
    pandas.DataFrame
        One row per 'pos_antigen' value and one column per 'pos_tcr' value,
        relabelled with the 'aa_antigen' and 'aa_tcr' entries respectively.

    Notes
    -----
    The relabelling takes every `len_antigen`-th 'aa_tcr' entry and the first
    `len_antigen` 'aa_antigen' entries, so it assumes the rows of `fm` are
    ordered by TCR position first and antigen position second, with every
    position pair present -- TODO confirm against the table producer.
    """
    group = fm.reset_index()
    # Plain Python int so the value is always usable as a slice bound/step.
    len_antigen = int(group.loc[0, 'len_antigen'])

    # Keyword arguments: recent pandas releases no longer accept
    # index/columns/values positionally.
    mat = group.pivot(index='pos_antigen', columns='pos_tcr', values=col)
    mat.columns = group['aa_tcr'][::len_antigen]
    mat.index = group['aa_antigen'][:len_antigen]
    return mat

In [5]:
def get_frequency_mat(m):
    """Relabel a matrix from get_matrix() with zero-centred position offsets.

    Parameters
    ----------
    m : pandas.DataFrame
        Matrix obtained with get_matrix() (typically the boolean result of
        comparing it with a threshold). It is not modified.

    Returns
    -------
    pandas.DataFrame
        Float copy of `m` whose row and column labels are consecutive integer
        offsets centred on 0. When `m` has an even number of rows, every row
        is split half-and-half between two adjacent offsets, so the result
        has one more row than `m` and a boolean input yields cell values
        0.0 / 0.5 / 1.0. Columns are only relabelled, never split.
    """
    n_rows, n_cols = m.shape
    # Cast to float so that boolean matrices are later summed arithmetically;
    # adding two boolean frames would give a logical OR rather than a count.
    res = m.astype(float)
    # Floor division keeps the Python 2 result of the original `/` and
    # stays an integer on Python 3, as range() requires.
    res.columns = range(-n_cols // 2 + n_cols % 2, n_cols - n_cols // 2)
    res.index = range(-n_rows // 2 + n_rows % 2, n_rows - n_rows // 2)
    if n_rows % 2 == 0:
        # No single middle row: average the matrix with a copy of itself
        # shifted down by one offset.
        shifted = res.copy()
        shifted.index = [pos + 1 for pos in res.index]
        res = (res / 2.).add(shifted / 2., fill_value=0)
    return res
        
# One boolean matrix per structure: True where the beta-chain CDR3 / antigen
# 'distance' is below DISTANCE_THRESHOLD.
# NOTE(review): pnames is built from grouped_alpha, so get_group() raises
# KeyError for a pdb_id without beta-chain CDR3 rows -- confirm every
# structure has both chains.
mat_list = [(get_matrix(grouped_beta.get_group(name), 'distance') < DISTANCE_THRESHOLD) for name in pnames]

# Sum the centred matrices with an explicit loop: this needs no `reduce`
# (not a builtin on Python 3) and leaves mat_list itself untouched.
summary_frequency_mat = None
for contact_mat in mat_list:
    frequency_mat = get_frequency_mat(contact_mat)
    if summary_frequency_mat is None:
        summary_frequency_mat = frequency_mat
    else:
        # fill_value=0 treats an offset missing from one operand as zero.
        summary_frequency_mat = summary_frequency_mat.add(frequency_mat, fill_value=0)

if summary_frequency_mat is None:
    raise ValueError('no structures to summarise: pnames is empty')
# Offsets missing from both operands of an addition are still NaN; zero them.
summary_frequency_mat = summary_frequency_mat.fillna(0)

In [6]:
# Display the summed matrix; its row/column labels are the centred offsets
# assigned by get_frequency_mat().
summary_frequency_mat

Unnamed: 0,-9,-8,-7,-6,-5,-4,-3,-2,-1,0,1,2,3,4,5,6,7,8
-10,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
-9,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
-8,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
-7,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
-6,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0,0
-5,0,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0,0,0
-4,0,0,0.0,0.0,0.0,0.0,1.0,1.0,4.5,8.5,3.5,1.0,0.0,0.0,0.0,0,0,0
-3,0,0,0.0,0.0,0.0,0.0,2.5,4.0,8.0,16.5,10.5,2.5,0.5,0.0,0.0,0,0,0
-2,0,0,0.0,0.0,0.5,0.0,5.0,14.0,22.0,20.5,19.0,7.5,7.0,0.0,1.5,0,0,0
-1,0,0,0.0,0.5,10.0,4.0,22.5,35.0,44.0,50.0,37.0,21.5,17.5,2.0,3.0,0,0,0


In [8]:
# Write only the cell values, tab-separated: both the row index and the
# column header are dropped from the file.
summary_frequency_mat.to_csv(
    "frequency_beta.csv",
    sep='\t',
    header=False,
    index=False,
)