In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import itertools
import networkx as nx

%matplotlib inline

In [64]:
# Load the merged table, keep only the human rows that come from the
# 'tcr_ab_pairs' database, then renumber the rows 0..n-1 so that positional
# indices computed later can be used directly with `data_pd.loc`.
data_pd = pd.read_csv('data/MergedData.csv', index_col=0)
data_pd = data_pd.loc[
    (data_pd['species'] == 'HomoSapiens') & (data_pd['database'] == 'tcr_ab_pairs')
]
data_pd.index = np.arange(len(data_pd))

In [65]:
# CDR3 strings of each chain as NumPy arrays, plus their per-row lengths.
# `.values` replaces `Series.as_matrix()`, which was deprecated and later
# removed from pandas; `.values` is available in both old and new releases.
acdr3 = data_pd['alpha.cdr3'].values
bcdr3 = data_pd['beta.cdr3'].values
# NOTE(review): len() assumes every CDR3 entry is a string (no NaN) — confirm.
acdr3_length = np.array([len(x) for x in acdr3])
bcdr3_length = np.array([len(x) for x in bcdr3])

In [6]:
def aapClusters(sequences, indexing, mmm=1, delimeter='*'):
    """Group equal-length sequences that coincide after masking `mmm` positions.

    For every combination of `mmm` positions, each sequence is turned into a
    key by cutting those positions out and joining the remaining pieces with
    `delimeter`. Sequences that share a key form one cluster.

    Parameters
    ----------
    sequences : sequence of str
        Sequences to cluster. The length of the first one defines the
        positions that get masked, so all are expected to have that length.
    indexing : sequence
        `indexing[i]` is the label stored in a cluster for `sequences[i]`.
    mmm : int, optional
        Number of positions masked at once (default 1).
    delimeter : str, optional
        Placeholder inserted at each masked position (default '*').

    Returns
    -------
    list of list
        One list of labels per (mask, key) group with more than one member.
        Groups are ordered by mask and then by first occurrence of the key,
        so the result is the same on every run. The same group of labels can
        appear more than once if it matches under several masks.
        An empty input yields an empty list.
    """
    # Nothing to cluster; also avoids indexing sequences[0] on empty input.
    if len(sequences) == 0:
        return []

    l = len(sequences[0])
    clusters = []
    for positions in itertools.combinations(range(l), mmm):
        # Sentinels at -1 and l let every kept piece be sliced uniformly.
        bounds = [-1] + list(positions) + [l]
        # A dict keeps keys in first-seen order, unlike iterating a set.
        groups = {}
        for i, seq in enumerate(sequences):
            key = delimeter.join(
                seq[bounds[k] + 1:bounds[k + 1]] for k in range(mmm + 1)
            )
            groups.setdefault(key, []).append(indexing[i])
        clusters.extend(
            members for members in groups.values() if len(members) > 1
        )
    return clusters

def EdgeListfromClusters(n, clusters):
    """Turn clusters into a de-duplicated list of pairwise edges.

    Every cluster contributes all 2-combinations of its members, taken in
    the order the members appear in the cluster. Identical tuples produced
    by different clusters are kept once.

    Parameters
    ----------
    n : int
        Not used by the function body.
    clusters : iterable of sequences
        Groups of vertex labels.

    Returns
    -------
    list of tuple
        Unique `(x, y)` pairs.
    """
    unique_pairs = {
        pair
        for members in clusters
        for pair in itertools.combinations(members, 2)
    }
    return list(unique_pairs)

In [66]:
def _cdr3_edges(cdr3, cdr3_length, lengths=range(6, 20)):
    """Collect edges between same-length CDR3 sequences, one length at a time.

    For each value in `lengths`, the sequences of exactly that length are
    clustered with `aapClusters` (default settings) and the clusters are
    expanded into pairwise edges with `EdgeListfromClusters`. Edge endpoints
    are positions in `cdr3`.

    Parameters
    ----------
    cdr3 : numpy.ndarray
        Array of CDR3 strings.
    cdr3_length : numpy.ndarray
        Length of each entry of `cdr3`, in the same order.
    lengths : iterable of int, optional
        Sequence lengths to process (default 6..19, as in the original loops).

    Returns
    -------
    list of tuple
        Concatenated edge lists for all processed lengths.
    """
    edges = []
    for l in lengths:
        indexes = np.arange(cdr3.shape[0])[cdr3_length == l]
        if indexes.size == 0:
            # aapClusters reads sequences[0]; skip lengths with no sequences.
            continue
        clusters = aapClusters(cdr3[indexes], indexes)
        edges += EdgeListfromClusters(cdr3.shape[0], clusters)
    return edges


# The same procedure is applied to the alpha and the beta chain.
aedges = _cdr3_edges(acdr3, acdr3_length)
bedges = _cdr3_edges(bcdr3, bcdr3_length)

In [67]:
# One undirected graph per chain, built from the edge lists.
Agraph, Bgraph = nx.Graph(), nx.Graph()
Agraph.add_edges_from(aedges)
Bgraph.add_edges_from(bedges)

# Connected components of each graph (each component is a set of vertices).
acomponents = list(nx.connected_components(Agraph))
bcomponents = list(nx.connected_components(Bgraph))

# Component sizes.
alength = np.array(list(map(len, acomponents)))
blength = np.array(list(map(len, bcomponents)))

# Keep only components with more than 100 vertices.
acomp = np.array(acomponents)[alength > 100]
bcomp = np.array(bcomponents)[blength > 100]

In [69]:
# Vertex -> position of the component that contains it, for each chain.
i2acomp = {vertex: k for k, members in enumerate(acomp) for vertex in members}
i2bcomp = {vertex: k for k, members in enumerate(bcomp) for vertex in members}

# Vertices that belong to a kept component in both chains.
avertexes = set(i2acomp)
bvertexes = set(i2bcomp)
indexes = avertexes.intersection(bvertexes)

# Each component restricted to those shared vertices.
vacomp = np.array([indexes & members for members in acomp])
vbcomp = np.array([indexes & members for members in bcomp])

# Contingency table: rows are alpha components, columns are beta components,
# each cell counts the shared vertices falling into that pair.
comp_table = np.zeros((len(acomp), len(bcomp)), dtype=int)
for vertex in indexes:
    comp_table[i2acomp[vertex], i2bcomp[vertex]] += 1

# Expected counts from the row and column totals (row * column / total).
xcompsum = comp_table.sum(axis=1)
ycompsum = comp_table.sum(axis=0)
expected = xcompsum[:, np.newaxis] * ycompsum[np.newaxis, :] / comp_table.sum()
# NOTE(review): the "+ 1" under the square root looks like a pseudo-count that
# keeps near-empty cells from producing huge scores — confirm the intent.
zscores = (comp_table - expected) / np.sqrt(expected + 1)
# (row, column) positions of every cell whose score exceeds 5.
zscore5 = np.argwhere(zscores > 5)
for i, j in zscore5:
    print(i, j, zscores[i, j])

163 5 5.68641385603


In [70]:
# Inspect the first (alpha component, beta component) pair with z-score > 5.
a_cluster, b_cluster = zscore5[0]

aind = np.array(list(vacomp[a_cluster]))
bind = np.array(list(vbcomp[b_cluster]))
# Sorted so the displayed rows come in ascending row order rather than
# arbitrary set order.
pind = np.array(sorted(vacomp[a_cluster] & vbcomp[b_cluster]))
# sep='' stops print from inserting a space at the start of lines 2 and 3.
print('alpha chain={}\n'.format(aind.shape[0]),
      'beta chain={}\n'.format(bind.shape[0]),
      'Paired={}\n'.format(len(acomp[a_cluster] & bcomp[b_cluster])),
      sep='')
data_pd.loc[pind]

alpha chain=291
 beta chain=130
 Paired=10



Unnamed: 0,database,species,sample,epitope,antigen,tissue,cell_subset,alpha.v,alpha.j,alpha.cdr3,beta.v,beta.d,beta.j,beta.cdr3
12865,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV21,TRAJ45,CAVRLSGGGADGLTF,TRBV20-1,,TRBJ2-7,CSARPGGYEQYF
99974,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGQSGGGADGLTF,TRBV20-1,TRBD2,TRBJ2-5,CSARVAGETQYF
105550,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGRSGGGADGLTF,TRBV20-1,TRBD2,TRBJ2-3,CSARASGTTQYF
33070,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGRSGGGADGLTF,TRBV20-1,TRBD1,TRBJ2-7,CSARTTGYEQYF
14810,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV21,TRAJ45,CAVRAAGGGADGLTF,TRBV20-1,TRBD1,TRBJ2-1,CSARPGGYEQFF
4852,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGSSGGGADGLTF,TRBV20-1,,TRBJ2-5,CSARVSGETQYF
73397,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV21,TRAJ45,CAVRRPGGGADGLTF,TRBV20-1,TRBD1,TRBJ2-7,CSARPGGYEQYF
8982,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGSSGGGADGLTF,TRBV20-1,,TRBJ2-5,CSARWAGETQYF
58778,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGRSGGGADGLTF,TRBV20-1,TRBD2,TRBJ2-5,CSARLAGETQYF
115963,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGGSGGGADGLTF,TRBV20-1,,TRBJ2-5,CSARQSGETQYF


Results obtained when all databases are included:

The epitope-related clusters dominate.

In [51]:
# Inspect the second (alpha component, beta component) pair with z-score > 5.
# NOTE(review): this raises IndexError when zscore5 has fewer than two rows;
# the z-score cell output shown in this notebook lists a single pair —
# confirm this cell is still meant to run with the current data filter.
a_cluster, b_cluster = zscore5[1]

aind = np.array(list(vacomp[a_cluster]))
bind = np.array(list(vbcomp[b_cluster]))
# Sorted so the displayed rows come in ascending row order rather than
# arbitrary set order.
pind = np.array(sorted(vacomp[a_cluster] & vbcomp[b_cluster]))
# sep='' stops print from inserting a space at the start of lines 2 and 3.
print('alpha chain={}\n'.format(aind.shape[0]),
      'beta chain={}\n'.format(bind.shape[0]),
      'Paired={}\n'.format(len(acomp[a_cluster] & bcomp[b_cluster])),
      sep='')
data_pd.loc[pind]

alpha chain=335
 beta chain=331
 Paired=17



Unnamed: 0,database,species,sample,epitope,antigen,tissue,cell_subset,alpha.v,alpha.j,alpha.cdr3,beta.v,beta.d,beta.j,beta.cdr3
85059,"vdjdb_ab,vdjdb",HomoSapiens,,GILGFVFTL,M1,PBMC,CD8+,TRAV27*01,TRAJ42*01,CAGAGGGSQGNLIF,TRBV19*01,,TRBJ2-2*01,CASSSRATGELFF
85540,"vdjdb_ab,vdjdb",HomoSapiens,,GILGFVFTL,M1,PBMC,CD8+,TRAV27*01,TRAJ42*01,CAGGDGGSQGNLIF,TRBV19*02,,TRBJ2-2*01,CASSIRSTGELFF
157636,"vdjdb_ab,vdjdb",HomoSapiens,,GILGFVFTL,,PBMC,,TRAV27*01,,CAGAGGGSQGNLIF,TRBV19*01,,,CASSSRSTGELFF
145347,"vdjdb_ab,vdjdb",HomoSapiens,,GILGFVFTL,,PBMC,,TRAV27*01,,CAGADGGSQGNLIF,TRBV19*01,,,CASSIRSTGELFF
70343,"vdjdb_ab,vdjdb",HomoSapiens,,GILGFVFTL,M1,PBMC,CD8+,TRAV27*01,TRAJ42*01,CAGAEGGSQGNLIF,TRBV19*01,,TRBJ2-2*01,CASSIRSTGELFF
34475,"vdjdb_ab,vdjdb",HomoSapiens,,GILGFVFTL,M1,PBMC,CD8+,TRAV27*01,TRAJ42*01,CAGAEGGSQGNLIF,TRBV19*01,,TRBJ2-2*01,CASSGRSTGELFF
122637,"vdjdb_ab,vdjdb",HomoSapiens,,GILGFVFTL,M1,PBMC,CD8+,TRAV27*01,TRAJ42*01,CAGAGGGSQGNLIF,TRBV19*01,,TRBJ2-2*01,CASSKRSTGELFF
111726,"vdjdb_ab,vdjdb",HomoSapiens,,GILGFVFTL,M1,PBMC,CD8+,TRAV27*01,TRAJ42*01,CAGADGGSQGNLIF,TRBV19*01,,TRBJ2-2*01,CASSSRSAGELFF
141231,"vdjdb_ab,vdjdb",HomoSapiens,,GILGFVFTL,M1,PBMC,CD8+,TRAV27*01,TRAJ42*01,CAGAYGGSQGNLIF,TRBV19*02,,TRBJ2-2*01,CASSIRSTGELFF
67281,tcr_ab_pairs,HomoSapiens,3.0,,,,,TRAV8-3,TRAJ42,CAVDAAGSQGNLIF,"TRBV12-4,TRBV12-3","TRBD1,TRBD2",TRBJ2-2,CASNRDTTGELFF


In [52]:
# Inspect the third (alpha component, beta component) pair with z-score > 5.
# NOTE(review): this raises IndexError when zscore5 has fewer than three rows;
# the z-score cell output shown in this notebook lists a single pair —
# confirm this cell is still meant to run with the current data filter.
a_cluster, b_cluster = zscore5[2]

aind = np.array(list(vacomp[a_cluster]))
bind = np.array(list(vbcomp[b_cluster]))
# Sorted so the displayed rows come in ascending row order rather than
# arbitrary set order.
pind = np.array(sorted(vacomp[a_cluster] & vbcomp[b_cluster]))
# sep='' stops print from inserting a space at the start of lines 2 and 3.
print('alpha chain={}\n'.format(aind.shape[0]),
      'beta chain={}\n'.format(bind.shape[0]),
      'Paired={}\n'.format(len(acomp[a_cluster] & bcomp[b_cluster])),
      sep='')
data_pd.loc[pind]

alpha chain=84
 beta chain=522
 Paired=10



Unnamed: 0,database,species,sample,epitope,antigen,tissue,cell_subset,alpha.v,alpha.j,alpha.cdr3,beta.v,beta.d,beta.j,beta.cdr3
31843,"vdjdb_ab,vdjdb",HomoSapiens,,NLVPMVATV,,PBMC,CD8+,TRAV25*01,TRAJ50*01,CAGPMKTSYDKVIF,TRBV11-1*01,,TRBJ1-1*01,CASSSAHYGYTF
111815,"vdjdb_ab,vdjdb",HomoSapiens,,NLVPMVATV,,PBMC,CD8+,TRAV25*01,TRAJ50*01,CAGPMKTSYDKVIF,TRBV11-1*01,,TRBJ1-1*01,CASSSAFYGYTF
154631,"vdjdb_ab,vdjdb",HomoSapiens,,"NLVPMVATV,GILGFVFTL",M1,"PBMC,TCL culture",CD8+,TRAV35*01,TRAJ50*01,CAGPMKTSYDKVIF,TRBV12-4*01,,TRBJ1-2*01,CASSSANYGYTF
127083,"vdjdb_ab,vdjdb",HomoSapiens,,NLVPMVATV,pp65,PBMC,CD8+,TRAV35*01,TRAJ50*01,CAGPRETSYDKVIF,TRBV12-4*01,,TRBJ1-2*01,CASSSAYYGYTF
78093,"vdjdb_ab,vdjdb",HomoSapiens,,NLVPMVATV,,PBMC,CD8+,TRAV25*01,TRAJ50*01,CAGPRKTSYDKVIF,TRBV11-1*01,,TRBJ1-1*01,CASSSANYGYTF
10062,"vdjdb_ab,vdjdb",HomoSapiens,,NLVPMVATV,pp65,PBMC,CD8+,TRAV35*01,TRAJ50*01,CAGPMKTSYDKVIF,TRBV12-4*01,,TRBJ1-2*01,CASASANYGYTF
150063,"vdjdb_ab,vdjdb",HomoSapiens,,NLVPMVATV,,PBMC,CD8+,TRAV25*01,TRAJ50*01,CAGPRQTSYDKVIF,TRBV11-1*01,,TRBJ1-1*01,CASSSANYGYTF
38385,"vdjdb_ab,vdjdb",HomoSapiens,,NLVPMVATV,,PBMC,CD8+,TRAV25*01,TRAJ50*01,CAGPEKTSYDKVIF,TRBV11-1*01,,TRBJ1-1*01,CASSSANYGYTF
15098,"vdjdb_ab,vdjdb",HomoSapiens,,NLVPMVATV,pp65,PBMC,CD8+,TRAV35*01,TRAJ50*01,CAGPRETSYDKVIF,TRBV12-4*01,,TRBJ1-2*01,CASASANYGYTF
110108,tcr_ab_pairs,HomoSapiens,3.0,,,,,TRAV29DV5,TRAJ50,CAASQKTSYDKVIF,TRBV27,TRBD2,TRBJ1-2,CASSLGGYGYTF


In [53]:
# Inspect the fourth (alpha component, beta component) pair with z-score > 5.
# NOTE(review): this raises IndexError when zscore5 has fewer than four rows;
# the z-score cell output shown in this notebook lists a single pair —
# confirm this cell is still meant to run with the current data filter.
a_cluster, b_cluster = zscore5[3]

aind = np.array(list(vacomp[a_cluster]))
bind = np.array(list(vbcomp[b_cluster]))
# Sorted so the displayed rows come in ascending row order rather than
# arbitrary set order.
pind = np.array(sorted(vacomp[a_cluster] & vbcomp[b_cluster]))
# sep='' stops print from inserting a space at the start of lines 2 and 3.
print('alpha chain={}\n'.format(aind.shape[0]),
      'beta chain={}\n'.format(bind.shape[0]),
      'Paired={}\n'.format(len(acomp[a_cluster] & bcomp[b_cluster])),
      sep='')
data_pd.loc[pind]

alpha chain=301
 beta chain=131
 Paired=10



Unnamed: 0,database,species,sample,epitope,antigen,tissue,cell_subset,alpha.v,alpha.j,alpha.cdr3,beta.v,beta.d,beta.j,beta.cdr3
117475,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGGSGGGADGLTF,TRBV20-1,,TRBJ2-5,CSARQSGETQYF
106915,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGRSGGGADGLTF,TRBV20-1,TRBD2,TRBJ2-3,CSARASGTTQYF
74344,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV21,TRAJ45,CAVRRPGGGADGLTF,TRBV20-1,TRBD1,TRBJ2-7,CSARPGGYEQYF
13034,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV21,TRAJ45,CAVRLSGGGADGLTF,TRBV20-1,,TRBJ2-7,CSARPGGYEQYF
33486,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGRSGGGADGLTF,TRBV20-1,TRBD1,TRBJ2-7,CSARTTGYEQYF
9102,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGSSGGGADGLTF,TRBV20-1,,TRBJ2-5,CSARWAGETQYF
4917,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGSSGGGADGLTF,TRBV20-1,,TRBJ2-5,CSARVSGETQYF
14998,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV21,TRAJ45,CAVRAAGGGADGLTF,TRBV20-1,TRBD1,TRBJ2-1,CSARPGGYEQFF
101244,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGQSGGGADGLTF,TRBV20-1,TRBD2,TRBJ2-5,CSARVAGETQYF
59551,tcr_ab_pairs,HomoSapiens,3,,,,,TRAV39,TRAJ45,CAVGRSGGGADGLTF,TRBV20-1,TRBD2,TRBJ2-5,CSARLAGETQYF
