In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell executes, so edits to local source files are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd 
import scanpy as sc 
import torch

# Read the AnnData object from an .h5ad file.
# NOTE(review): the path is absolute and hard-coded, so this cell only runs where
# that exact directory exists — consider a configurable data directory.
adata = sc.read_h5ad('/ix/djishnu/Jane/SLIDESWING/jing_data/KIR+TEDDY/data/KIR+TEDDY_filtered85.h5ad')
# Bare expression so the notebook renders the AnnData summary.
adata

AnnData object with n_obs × n_vars = 11500 × 36601
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'nCount_Protein', 'nFeature_Protein', 'nCount_HTO', 'nFeature_HTO', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'sample', 'percent.mt', 'cdr3_b', 'va_gene', 'ja_gene', 'cdr3_a', 'renamed_clusters', 'vb_gene', 'jb_gene', 'integrated_snn_res.0.3', 'cell_type', 'count', 'expansion', 'ID', 'timepoint', 'tp_indicator', 'tp_group', 'tp_single', 'group'
    var: 'features'
    obsm: 'X_umap'

In [3]:
# Preview the two CDR3 columns next to the per-cell protein feature count.
adata.obs.loc[:, ['cdr3_a', 'cdr3_b', 'nFeature_Protein']]

Unnamed: 0,cdr3_a,cdr3_b,nFeature_Protein
AAACCTGAGGTGCTTT-1_1,CAMREPFTDKLIF,CASSVDVPGPPMMGYTF,38
AAAGATGCATACTCTT-1_1,,CASSLIRGSNEQFF,38
AAAGATGGTGGCTCCA-1_1,CAAGCLYGGSQGNLIF,CASSPGTGYYGYTF,37
AAAGATGTCCAAGTAC-1_1,,CASSQVQDRTGGPGQIGVFGELFF,38
AAAGCAACAAATACAG-1_1,,,38
...,...,...,...
TTTGGTTGTCCATCCT-1_9,CAFMGYNNNDMRF,CASSYRGTGELFF,38
TTTGTCAAGGGATACC-1_9,,CASSLRGIGELFF,38
TTTGTCAAGTTACCCA-1_9,CAFMGYNNNDMRF,CASSLRGPGELFF,38
TTTGTCAGTAGCCTAT-1_9,CAPKQTGANNLFF,CASGGGTLQPQHF,38


In [5]:
def tokenize_kmers(tcr, k=2):
    """Split a sequence into its overlapping k-mers (sliding window, stride 1).

    Parameters
    ----------
    tcr : str
        Sequence to tokenize (e.g. a CDR3 amino-acid string).
    k : int, default 2
        Length of each k-mer; must be at least 1.

    Returns
    -------
    list of str
        The ``len(tcr) - k + 1`` windows in left-to-right order, or an empty
        list when ``tcr`` is shorter than ``k``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # k == 0 would emit empty tokens and k < 0 would emit ragged slices.
        raise ValueError(f"k must be a positive integer, got {k!r}")
    return [tcr[start:start + k] for start in range(len(tcr) - k + 1)]

# Sanity check on one 13-residue sequence: expect 12 overlapping 2-mers.
tokenize_kmers('CAFMGYNNNDMRF')

['CA', 'AF', 'FM', 'MG', 'GY', 'YN', 'NN', 'NN', 'ND', 'DM', 'MR', 'RF']

In [6]:
import itertools

# Sorted array of the distinct characters that occur in the `cdr3_a` column.
amino_acids = np.unique([residue for seq in adata.obs['cdr3_a'].values for residue in seq])

def create_lookup_table(amino_acids, k=2, token_size=8, seed=None):
    """Build a table of random vectors, one row per possible k-mer.

    Parameters
    ----------
    amino_acids : iterable of str
        Alphabet from which k-mers are formed.
    k : int, default 2
        Number of symbols per k-mer.
    token_size : int, default 8
        Number of columns (vector length) assigned to each k-mer.
    seed : int or None, default None
        When given, the values are drawn from a dedicated generator seeded with
        this value so the table is reproducible. When ``None`` the values come
        from NumPy's global random state, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Shape ``(len(amino_acids) ** k, token_size)``, indexed by k-mer string in
        ``itertools.product`` order, with values uniform in ``[0, 1)``.
    """
    kmers = np.array([''.join(combo) for combo in itertools.product(amino_acids, repeat=k)])
    # RandomState exposes the same `rand` API as the global `np.random` module, so
    # the unseeded path stays identical to the previous behaviour (and still
    # honours an earlier `np.random.seed(...)` call).
    rng = np.random if seed is None else np.random.RandomState(seed)
    lookup_table = rng.rand(kmers.shape[0], token_size)
    return pd.DataFrame(lookup_table, index=kmers)

# Build the table with the default k and token_size.
# NOTE(review): the values come from np.random.rand and no seed is set in the
# visible cells, so the numbers differ on every run.
lookup_df = create_lookup_table(amino_acids)
lookup_df

Unnamed: 0,0,1,2,3,4,5,6,7
AA,0.452211,0.897766,0.296184,0.956870,0.551719,0.665739,0.137623,0.777882
AC,0.489241,0.274413,0.218635,0.405953,0.674271,0.946989,0.330692,0.413902
AD,0.681595,0.194633,0.511418,0.526199,0.285462,0.643736,0.791053,0.499179
AE,0.336490,0.879769,0.980844,0.841660,0.980381,0.884293,0.226943,0.042929
AF,0.732887,0.210815,0.931954,0.986636,0.204144,0.760267,0.899688,0.867853
...,...,...,...,...,...,...,...,...
YS,0.313625,0.321959,0.862532,0.890529,0.112351,0.866669,0.575101,0.950839
YT,0.690147,0.341925,0.023581,0.604527,0.752931,0.717796,0.895300,0.470386
YV,0.078893,0.049915,0.586552,0.978205,0.792875,0.554890,0.461505,0.151695
YW,0.378826,0.215704,0.797652,0.633763,0.454193,0.081520,0.457264,0.969015


In [21]:
import sys 
# Extend the import path so the local `kmer` module can be found; the path is
# relative to the notebook's working directory.
sys.path.append('../src')
from kmer import Kmerizer

# Kmerizer is defined in the local `kmer` module (not shown here). Presumably `k`
# is the k-mer length and `token_size` the per-token vector width — confirm.
mermaid = Kmerizer(k=2, token_size=8)

In [26]:
# Encode a single sequence. The recorded output shape is (50, 8), which
# presumably corresponds to (max_len, token_size) — confirm against Kmerizer.encode.
token = mermaid.encode('CASSQVQDRTGGPGQIGVFGELFF', max_len=50)
token.shape

(50, 8)

In [27]:
# Encode every value of the `cdr3_a` column in one call. The recorded output
# shape is (11500, 30, 8); the middle dimension is decided inside encode_batch
# (presumably a padded sequence length — TODO confirm).
tcr_embeddings = mermaid.encode_batch(adata.obs['cdr3_a'].values)
tcr_embeddings.shape

(11500, 30, 8)

In [31]:
# Persist the embedding array as a .npy file; the path is relative to the
# notebook's working directory.
np.save('../data/2mer_embeddings.npy', tcr_embeddings)

In [29]:
# Shape of the per-cell metadata table; its row count (11500 in the recorded
# output) matches the first axis of the embeddings computed above.
adata.obs.shape

(11500, 32)