In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import itertools
import networkx as nx

%matplotlib inline

In [2]:
# Load the merged table and keep only the rows whose species is 'HomoSapiens'
# and whose database is 'tcr_ab_pairs'.
data_pd = pd.read_csv('data/MergedData.csv', index_col=0)
is_human = data_pd['species'] == 'HomoSapiens'
is_ab_pairs = data_pd['database'] == 'tcr_ab_pairs'
data_pd = data_pd.loc[is_human & is_ab_pairs]
# Renumber the surviving rows 0..n-1 so that row labels match array positions.
data_pd.index = np.arange(len(data_pd))

In [3]:
# Pull the 'alpha.cdr3' and 'beta.cdr3' columns out as plain NumPy arrays.
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0; `.values` is available on old and new versions.
acdr3 = data_pd['alpha.cdr3'].values
bcdr3 = data_pd['beta.cdr3'].values
# Length of every sequence, aligned element-for-element with the arrays above.
acdr3_length = np.array([len(x) for x in acdr3])
bcdr3_length = np.array([len(x) for x in bcdr3])

In [4]:
# Lookup tables: sequence length -> 0-based positions that the masking cell
# overwrites with 'X'.  Lengths that are absent from a table are left unmasked.
# NOTE(review): presumably these are positions judged uninformative for the
# alpha chain; the criterion is not shown in this notebook -- confirm.
auseless_positions = {
    7: [6],
    8: [2],
    9: [1],
    10: [4, 9],
    11: [2, 4, 10],
    12: [2, 11],
    13: [4],
    14: [4, 13],
    15: [4],
    16: [4, 5, 15],
    17: [4, 6, 16],
    18: [5, 6, 17],
    19: [4],
}

# Same structure for the beta chain; an empty list means "mask nothing".
buseless_positions = {
    7: [],
    8: [],
    9: [2],
    10: [4, 9],
    11: [4, 6],
    12: [4, 11],
    13: [4],
    14: [4, 5, 6, 13],
    15: [4, 14],
    16: [4, 15],
    17: [4, 16],
    18: [7, 8, 11, 17],
    19: [],
}

In [5]:
def _mask_positions(seqs, positions_by_length, fill='X'):
    """Return a copy of `seqs` with selected positions replaced by `fill`.

    Parameters
    ----------
    seqs : 1-D array of str
        Sequences to mask.
    positions_by_length : dict[int, list[int]]
        Maps a sequence length to the 0-based positions to overwrite.
        Sequences whose length is not a key are copied unchanged.
    fill : str
        Replacement character written at every masked position.

    Returns
    -------
    numpy.ndarray (dtype=object)
        Masked sequences, in the same order as `seqs`.
    """
    masked = np.zeros(seqs.shape[0], dtype=object)
    for k, seq in enumerate(seqs):
        chars = list(seq)
        for pos in positions_by_length.get(len(chars), ()):
            chars[pos] = fill
        masked[k] = ''.join(chars)
    return masked


# Both chains go through the same masking routine, each with its own table.
acdr3f = _mask_positions(acdr3, auseless_positions)
bcdr3f = _mask_positions(bcdr3, buseless_positions)

# From here on the notebook works with the masked sequences.
acdr3 = acdr3f
bcdr3 = bcdr3f

In [6]:
def aapClusters(sequences, indexing, mmm=1, delimeter='*'):
    """Cluster sequences that become identical once `mmm` positions are masked.

    For every combination of `mmm` positions, each sequence is turned into a
    key by replacing those positions with `delimeter`; sequences that share a
    key form one cluster.

    Parameters
    ----------
    sequences : sequence of str
        Sequences to compare.  The masked positions are derived from the
        length of the first one, so callers are expected to pass sequences of
        equal length.
    indexing : sequence
        Labels parallel to `sequences`; clusters are reported in terms of
        these labels.
    mmm : int
        Number of positions masked simultaneously.
    delimeter : str
        Placeholder inserted at each masked position when building the key.

    Returns
    -------
    list[list]
        One list of labels per (mask, key) group that has at least two
        members.  A pair of sequences can therefore appear in several
        clusters (once per mask under which they coincide).  An empty
        `sequences` yields an empty list.
    """
    # Nothing to cluster; also avoids indexing sequences[0] on empty input.
    if len(sequences) == 0:
        return []

    length = len(sequences[0])
    clusters = []
    for mask in itertools.combinations(range(length), mmm):
        # Sentinels at both ends let every kept fragment be sliced uniformly.
        bounds = [-1] + list(mask) + [length]
        buckets = {}
        for i, seq in enumerate(sequences):
            key = delimeter.join(
                [seq[bounds[k] + 1:bounds[k + 1]] for k in range(mmm + 1)]
            )
            buckets.setdefault(key, []).append(indexing[i])
        # dict preserves insertion order, so cluster order is deterministic.
        clusters.extend(members for members in buckets.values() if len(members) > 1)
    return clusters

def EdgeListfromClusters(n, clusters):
    """Expand clusters into a de-duplicated list of pairwise edges.

    Every unordered pair of members inside a cluster becomes one `(x, y)`
    tuple, ordered as the members appear in the cluster.  `n` is accepted but
    not used.  The order of the returned list is unspecified.
    """
    unique_pairs = {
        pair
        for members in clusters
        for pair in itertools.combinations(members, 2)
    }
    return list(unique_pairs)

In [7]:
def _edges_by_length(seqs, lengths, length_range=range(6, 20)):
    """Collect the edges between sequences of equal length.

    For each length in `length_range`, the sequences of that length are
    clustered with `aapClusters` and the clusters are expanded to edges with
    `EdgeListfromClusters`.  Edge endpoints are positions in `seqs`.
    """
    edges = []
    positions = np.arange(seqs.shape[0])
    for l in length_range:
        indexes = positions[lengths == l]
        if indexes.size == 0:
            # No sequence has this length, so there is nothing to cluster.
            continue
        clusters = aapClusters(seqs[indexes], indexes)
        edges += EdgeListfromClusters(seqs.shape[0], clusters)
    return edges


# Same procedure for both chains, each with its own sequence and length arrays.
aedges = _edges_by_length(acdr3, acdr3_length)
bedges = _edges_by_length(bcdr3, bcdr3_length)

In [13]:
def _adjacency(n_nodes, edges):
    """Build symmetric adjacency lists for `n_nodes` nodes from `(u, v)` edges.

    Each edge is recorded from both endpoints, so `v` appears in `adj[u]` and
    `u` appears in `adj[v]`.
    """
    adj = [[] for _ in range(n_nodes)]
    for u, v in edges:
        adj[u].append(v)
        adj[v].append(u)
    return adj


aadj = _adjacency(acdr3.shape[0], aedges)
# Sized from bcdr3 (the original sized it from acdr3, a copy-paste slip).
badj = _adjacency(bcdr3.shape[0], bedges)

In [14]:
# For every sequence, keep only the neighbours it has in BOTH adjacency lists.
superadj = [
    list(set(alpha_nbrs) & set(beta_nbrs))
    for alpha_nbrs, beta_nbrs in zip(aadj, badj)
]

In [16]:
# Total number of neighbour entries across all sequences.  The adjacency lists
# record each edge from both endpoints, so every shared edge is counted twice.
s = sum(len(neighbours) for neighbours in superadj)
s

11870

In [18]:
# Product of the two edge counts divided by floor(n^2 / 2), n = number of sequences.
# NOTE(review): presumably the overlap expected if the alpha and beta edges were
# independent, with n**2 // 2 approximating the number of unordered pairs -- confirm.
n_seqs = acdr3.shape[0]
len(aedges) * len(bedges) / (n_seqs ** 2 // 2)

3266.498227705472

Let's look at the dependence of mismatch positions in alpha and beta.

In [19]:
# One row per entry counted in `s`, two columns, initialised to zero.
# NOTE(review): presumably (alpha, beta) mismatch positions -- TODO confirm.
abmmpos = np.zeros(shape=(s, 2))

In [20]:
# Show the neighbours that sequence `i` has in both graphs.  The original
# looped over them with a bare expression as the body, which displays nothing.
i = 0
superadj[i]

[84664]

In [22]:
# Character-by-character view of sequence 0 and sequence 84664, side by side.
tuple(np.array(list(acdr3[idx])) for idx in (0, 84664))

(array(['C', 'A', 'M', 'R', 'X', 'T', 'S', 'N', 'D', 'Y', 'K', 'L', 'S', 'X'],
       dtype='<U1'),
 array(['C', 'A', 'M', 'R', 'X', 'V', 'S', 'N', 'D', 'Y', 'K', 'L', 'S', 'X'],
       dtype='<U1'))