In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import itertools
import networkx as nx
from scipy.stats import poisson

%matplotlib inline

In [2]:
# Read the merged table, then keep only the rows whose `species` is
# 'HomoSapiens' and whose `database` is 'tcr_ab_pairs'.
data_pd = pd.read_csv('data/MergedData.csv', index_col=0)
data_pd = data_pd.loc[
    (data_pd['species'] == 'HomoSapiens') & (data_pd['database'] == 'tcr_ab_pairs')
]

In [3]:
# Reverse lookup: antiindex[row label] -> position of that row in data_pd.
# NOTE(review): the array is sized from the *last* label, so this assumes the
# index holds non-negative integers with the largest one last - confirm.
antiindex = np.zeros(data_pd.index[-1] + 1, dtype=int)
for position, label in enumerate(data_pd.index):
    antiindex[label] = position

In [4]:
# Raw CDR3 strings of both chains and the length of every string.
acdr3 = data_pd['alpha.cdr3'].values
bcdr3 = data_pd['beta.cdr3'].values
acdr3_length = np.array(list(map(len, acdr3)))
bcdr3_length = np.array(list(map(len, bcdr3)))

In [5]:
# Two-column object array: column 0 = alpha CDR3, column 1 = beta CDR3.
data = data_pd[['alpha.cdr3', 'beta.cdr3']].values

# Keep a pair only when both sequences are 6..15 characters long.
alpha_ind = [5 < len(seq) < 16 for seq in data[:, 0]]
beta_ind = [5 < len(seq) < 16 for seq in data[:, 1]]
indexes = np.logical_and(alpha_ind, beta_ind)
ldata = data[indexes]

In [6]:
# Alpha-chain masking table, keyed by sequence length. The value is the list of
# 0-based positions that the masking loop below overwrites with 'X' for an
# alpha CDR3 of that length; lengths that are not keys are left unmasked.
auseless_positions = {7: [6],
 8: [2],
 9: [1],
 10: [4, 9],
 11: [2, 4, 10],
 12: [2, 11],
 13: [4],
 14: [4, 13],
 15: [4],
 16: [4, 5, 15],
 17: [4, 6, 16],
 18: [5, 6, 17],
 19: [4]}

# Beta-chain masking table, keyed by sequence length. The value is the list of
# 0-based positions that the masking loop below overwrites with 'X' for a beta
# CDR3 of that length; an empty list means "mask nothing" and lengths that are
# not keys are left unmasked.
buseless_positions = {7: [],
 8: [],
 9: [2],
 10: [4, 9],
 11: [4, 6],
 12: [4, 11],
 13: [4],
 14: [4, 5, 6, 13],
 15: [4, 14],
 16: [4, 15],
 17: [4, 16],
 18: [7, 8, 11, 17],
 19: []}

In [7]:
def _mask_useless_positions(sequences, useless_positions):
    """Return an object array of `sequences` with table positions set to 'X'.

    `useless_positions` maps a sequence length to the 0-based positions to
    overwrite; sequences whose length is not a key are copied unchanged.
    """
    masked = np.zeros(sequences.shape[0], dtype=object)
    for row, sequence in enumerate(sequences):
        residues = np.array(list(sequence))
        positions = useless_positions.get(len(residues))
        if positions is not None:
            residues[positions] = 'X'
        masked[row] = ''.join(residues)
    return masked


acdr3f = _mask_useless_positions(acdr3, auseless_positions)
bcdr3f = _mask_useless_positions(bcdr3, buseless_positions)

# From here on acdr3 / bcdr3 refer to the masked sequences.
acdr3 = acdr3f
bcdr3 = bcdr3f

In [8]:
def aapClusters(sequences, indexing, mmm=1, delimeter='*'):
    """Cluster equal-length sequences that agree once `mmm` positions are masked.

    For every combination of `mmm` positions, each sequence is reduced to a key
    made of the pieces between the masked positions joined by `delimeter`.
    Sequences sharing a key form one cluster.

    Parameters
    ----------
    sequences : sequence of str
        Sequences to compare. The length of the first one is used for all of
        them, so they are expected to have equal length.
    indexing : sequence
        `indexing[i]` is the label reported for `sequences[i]`.
    mmm : int, default 1
        Number of positions masked at once.
    delimeter : str, default '*'
        Placeholder written where a position is masked.

    Returns
    -------
    list of list
        One list of labels per (mask, key) group holding at least two
        sequences. Groups are emitted mask by mask, and inside a mask in order
        of first appearance, so the result is reproducible between runs. The
        same set of labels can appear once per mask under which it matches.
        Empty input yields an empty list.
    """
    # An empty bucket has no first element to measure; nothing can cluster.
    if len(sequences) == 0:
        return []

    l = len(sequences[0])
    clusters = []
    for positions in itertools.combinations(range(l), mmm):
        # Sentinels at both ends let every piece be cut as a plain slice.
        bounds = [-1] + list(positions) + [l]
        groups = {}
        for i, x in enumerate(sequences):
            key = delimeter.join(x[bounds[k] + 1:bounds[k + 1]] for k in range(mmm + 1))
            groups.setdefault(key, []).append(indexing[i])
        clusters.extend(members for members in groups.values() if len(members) > 1)
    return clusters

def EdgeListfromClusters(n, clusters):
    """Turn clusters into a duplicate-free list of pairwise edges.

    Every unordered pair inside a cluster becomes one `(x, y)` tuple, with the
    members kept in the order they have inside the cluster. `n` is accepted
    but not used.
    """
    unique_pairs = {
        pair
        for members in clusters
        for pair in itertools.combinations(members, 2)
    }
    return list(unique_pairs)

In [9]:
# Re-derive the length-filtered pairs and take the per-chain columns from them.
# NOTE(review): this replaces the 'X'-masked acdr3 / bcdr3 assigned in the
# masking cell with the unmasked strings from `data` - confirm that is intended.
ldata = data[indexes]
acdr3, bcdr3 = ldata[:, 0], ldata[:, 1]
acdr3_length = np.array(list(map(len, acdr3)))
bcdr3_length = np.array(list(map(len, bcdr3)))

In [10]:
def _edges_by_length(cdr3, cdr3_length, lengths=range(6, 16)):
    """Collect the cluster edges of `cdr3`, processing one sequence length at a time.

    Sequences are only compared with others of the same length. Edge endpoints
    are row positions in `cdr3`.
    """
    edges = []
    for length in lengths:
        # A local name is used so the global boolean mask `indexes` survives.
        rows = np.arange(cdr3.shape[0])[cdr3_length == length]
        if rows.size == 0:
            # No sequence of this length: nothing to cluster.
            continue
        clusters = aapClusters(cdr3[rows], rows)
        edges += EdgeListfromClusters(cdr3.shape[0], clusters)
    return edges


aedges = _edges_by_length(acdr3, acdr3_length)
bedges = _edges_by_length(bcdr3, bcdr3_length)

In [11]:
def _graph_and_components(edges):
    """Build an undirected graph from `edges` and list its connected components."""
    graph = nx.Graph()
    graph.add_edges_from(edges)
    return graph, list(nx.connected_components(graph))


Agraph, acomponents = _graph_and_components(aedges)
Bgraph, bcomponents = _graph_and_components(bedges)

In [12]:
# 1.0 for every row that belongs to some alpha component, 0.0 otherwise.
a_in_cluster = np.zeros(ldata.shape[0])
a_in_cluster[np.fromiter(itertools.chain.from_iterable(acomponents), dtype=int)] = 1

In [13]:
# 1.0 for every row that belongs to some beta component, 0.0 otherwise.
b_in_cluster = np.zeros(ldata.shape[0])
b_in_cluster[np.fromiter(itertools.chain.from_iterable(bcomponents), dtype=int)] = 1

In [21]:
a_in_cluster.sum(), b_in_cluster.sum(), ldata.shape[0]

(77647.0, 55273.0, 104461)

In [58]:
def _components_needed(components, order, min_size):
    """Walk `components` in `order` until their sizes add up to `min_size`.

    Returns `(count, covered)`: how many components were taken and how many
    members they hold together.
    """
    taken, covered = 0, 0
    while covered < min_size:
        covered += len(components[order[taken]])
        taken += 1
    return taken, covered


# Each chain needs enough components to cover half of the rows.
train_min_size = ldata.shape[0] // 2

aperm = np.random.permutation(len(acomponents))
i, sa = _components_needed(acomponents, aperm, train_min_size)

# NOTE(review): bperm is drawn but never used - the beta components are walked
# in their stored order, unlike the alpha ones. Confirm whether that is meant.
bperm = np.random.permutation(len(bcomponents))
j, sb = _components_needed(bcomponents, range(len(bcomponents)), train_min_size)
i, j

(3057, 5980)

In [59]:
# Freeze the component counts found above under names that loops cannot clobber.
aindex_thr, bindex_thr = i, j

In [60]:
# Per-row flags (1.0 / 0.0): does the row's alpha (resp. beta) sequence sit in
# one of the components selected by the thresholds?
train_aindex = np.zeros(ldata.shape[0])
train_bindex = np.zeros(ldata.shape[0])

# Alpha threshold was counted along the shuffled order `aperm`, so the same
# order must be used here (the previous `perm` is not defined in this notebook).
for k in range(aindex_thr):
    train_aindex[list(acomponents[aperm[k]])] = 1
# Beta threshold was counted along the stored component order.
for k in range(bindex_thr):
    train_bindex[list(bcomponents[k])] = 1

In [27]:
# Inverts the beta flags in place (0 <-> 1). Not idempotent: every execution of
# this cell flips train_bindex again.
# NOTE(review): this cell's execution count (27) is lower than the counts of
# the cells that build train_bindex (58-60), so the recorded outputs may not
# include this flip - confirm whether it belongs in a top-to-bottom run.
train_bindex = 1 - train_bindex

In [61]:
# Row counts of the four flag combinations, printed in the order
# (a, b), (a, not b), (not a, b), (not a, not b).
a_sides = (train_aindex, 1 - train_aindex)
b_sides = (train_bindex, 1 - train_bindex)
print(*[np.sum(a_side * b_side > 0) for a_side in a_sides for b_side in b_sides])

28242 28476 23989 23754


In [45]:
# Boolean row masks built from two opposite quadrants, so no row is in both.
# NOTE(review): the *training* mask is the rows flagged by neither train_aindex
# nor train_bindex, and the *test* mask the rows flagged by both - the naming
# looks inverted; confirm.
train_index = ((1 - train_aindex) * (1 - train_bindex)).astype(bool)
test_index = (train_aindex * train_bindex).astype(bool)

In [30]:
# NOTE(review): import placed mid-notebook; consider moving it to the first cell.
from sklearn.ensemble import RandomForestClassifier
# This 20-tree instance is never fitted in the visible cells: `rf` is rebound
# to a new RandomForestClassifier(n_estimators=25) right before training.
rf = RandomForestClassifier(n_estimators=20)

In [31]:
# Symbol alphabet: 20 residue letters followed by '_' (the filler character
# written by standartize_cdr3_length). A symbol's position is its integer code.
aa = list('ACDEFGHIKLMNPQRSTVWY_')
aa2num_table = {symbol: code for code, symbol in enumerate(aa)}


def aa2num(aa):
    """Return the integer code of a single symbol; unknown symbols raise KeyError."""
    return aa2num_table[aa]


def protein2array(protein):
    """Encode a sequence as a numpy array holding one integer code per symbol."""
    return np.array(list(map(aa2num, protein)))

In [35]:
def standartize_cdr3_length(protein):
    """Pad a 6..15 character sequence to exactly 15 characters with '_'.

    The filler is inserted as one run at a length-dependent split point (just
    past the midpoint for odd lengths), so both ends of the sequence keep
    their characters. Any other length raises KeyError.
    """
    # Split point per supported length: 6->3, 7->4, 8->4, ..., 14->7, 15->8.
    insertion_places = {length: (length + 1) // 2 for length in range(6, 16)}
    size = len(protein)
    split_at = insertion_places[size]
    filler = '_' * (15 - size)
    return ''.join((protein[:split_at], filler, protein[split_at:]))

In [38]:
# Pad both sequences of every pair to the common length, in place.
for row in range(len(ldata)):
    for col in (0, 1):
        ldata[row, col] = standartize_cdr3_length(ldata[row, col])

In [47]:
# Replace every padded string in ldata by its integer-code array, in place.
for row in range(len(ldata)):
    for col in (0, 1):
        ldata[row, col] = protein2array(ldata[row, col])

In [48]:
def GetPairSampling(A, B):
    """Build a balanced set of matched and mismatched (A row, B row) examples.

    Every row `k` of `A` appears twice in the output: once next to `B[k]`
    (label 1) and once next to the B row of its neighbour inside the pair
    (0, 1), (2, 3), ... (label 0). The number of rows must therefore be even.

    Returns `(X, y)`: `X` is a float array with `2 * len(A)` rows holding the
    A columns followed by the B columns, and `y` alternates 1, 0, 1, 0, ...
    """
    assert A.shape[0] % 2 == 0
    n_pairs, a_width = A.shape[0], A.shape[1]

    X = np.zeros((2 * n_pairs, a_width + B.shape[1]))
    # Left part: each A row on two consecutive output rows.
    X[:, :a_width] = np.repeat(A, 2, axis=0)
    # Right part: the true partner on even rows ...
    X[0::2, a_width:] = B
    # ... and the neighbour's partner (index with the lowest bit flipped) on odd rows.
    X[1::2, a_width:] = B[np.arange(n_pairs) ^ 1]

    y = np.tile([1.0, 0.0], n_pairs)

    return X, y

In [53]:
# Stack the per-row code arrays into one 2-D matrix per chain.
A = np.array(list(map(list, ldata[:, 0])))
B = np.array(list(map(list, ldata[:, 1])))

In [62]:
def _even_row_positions(mask):
    """Return the positions of the True entries of `mask`, trimmed to an even count.

    GetPairSampling pairs consecutive rows and asserts an even number of them;
    the split sizes depend on a random permutation, so the last selected row
    is dropped whenever the count comes out odd.
    """
    rows = np.flatnonzero(mask)
    return rows[:rows.size - rows.size % 2]


train_rows = _even_row_positions(train_index)
test_rows = _even_row_positions(test_index)
# Number of pairs actually used on each side (not the length of the masks).
train_size, test_size = train_rows.size, test_rows.size

X_train, y_train = GetPairSampling(A[train_rows], B[train_rows])
X_test, y_test = GetPairSampling(A[test_rows], B[test_rows])

# NOTE(review): neither the split nor the forest is seeded, so the score below
# changes from run to run.
rf = RandomForestClassifier(n_estimators=25)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Element-wise hit / miss flags for the test examples.
score = y_pred == y_test

In [63]:
np.sum(score) / score.shape[0]

0.5009722434039957