In [1]:
import sys
sys.path.append("../")

In [2]:
from mir.common.repertoire import Repertoire
from mir.common.parser import DoubleChainVDJtoolsParser
from mir.common.segments import SegmentLibrary
from mir.embedding.prototype_embedding import PrototypeEmbedding
from mir.distances.aligner import ClonotypeAligner
import pandas as pd

In [3]:
import matplotlib.pyplot as plt

In [4]:
from collections import defaultdict

# Prototype embeddings generation

In [5]:
# Read the tab-separated input table into a DataFrame.
init_df = pd.read_csv('processed_struct.txt', sep='\t')

In [6]:
# Keep only rows whose alpha V gene is neither of the two excluded alleles
# (presumably mouse alleles, given the `_wo_mice` output file name -- TODO confirm).
init_df = init_df[~init_df['a_v'].isin(['TRAV12D-2*01', 'TRAV7D-5*01'])]
init_df

Unnamed: 0,clone_id,a_v,b_v,a_j,b_j,a_cdr3aa,b_cdr3aa
0,1ao7,TRAV12-2*01,TRBV6-5*01,TRAJ24*01,TRBJ2-7*01,CAVTTDSWGKLQF,CASRPGLAGGRPEQYF
1,1bd2,TRAV29/DV5*01,TRBV6-5*01,TRAJ54*01,TRBJ2-7*01,CAAMEGAQKLVF,CASSYPGGGFYEQYF
2,1fyt,TRAV8-4*01,TRBV28*01,TRAJ48*01,TRBJ1-2*01,CAVSESPFGNEKLTF,CASSSTGLPYGYTF
3,1j8h,TRAV8-4*01,TRBV28*01,TRAJ48*01,TRBJ1-2*01,CAVSESPFGNEKLTF,CASSSTGLPYGYTF
5,1mi5,TRAV26-2*01,TRBV7-8*01,TRAJ52*01,TRBJ2-7*01,CILPLAGGTSYGKLTF,CASSLGQAYEQYF
...,...,...,...,...,...,...,...
211,8VCX,TRAV26-1*01,TRBV5-1*01,TRAJ39*01,TRBJ2-5*01,CIVSHNAGNMLTF,CASSLERETQYF
212,8VCY,TRAV26-1*01,TRBV5-1*01,TRAJ39*01,TRBJ2-5*01,CIVSHNAGNMLTF,CASSLERETQYF
213,8VD0,TRAV26-1*01,TRBV5-1*01,TRAJ42*01,TRBJ1-3*01,CIVRVAIEGSQGNLIF,CASSLRRGDTIYF
214,8VD2,TRAV26-1*01,TRBV5-1*01,TRAJ42*01,TRBJ1-3*01,CIVRVAIEGSQGNLIF,CASSLRRGDTIYF


In [25]:
# Count missing values per column.
init_df.isna().sum()

clone_id    0
a_v         0
b_v         0
a_j         0
b_j         1
a_cdr3aa    0
b_cdr3aa    0
dtype: int64

In [27]:
# Drop every row that has a missing value in any column
# (the per-column count shown earlier reports one missing b_j).
init_df = init_df.dropna()

In [28]:
# Write the filtered table as a tab-separated file without the index column;
# the repertoire loaders in this notebook read this same path.
init_df.to_csv('processed_struct_wo_mice.txt', sep='\t', index=False)

In [29]:
# Display the default segment library. The return value is not stored here;
# the cells that need a library call load_default() again.
SegmentLibrary.load_default()

Library of 347 segments: [HomoSapiens TRAV10*01:-1:..DSASYICVVS, HomoSapiens TRAV10*02:-1:..DSASYICVVS, HomoSapiens TRAV11*01:-1:..GDSATYFCAL, HomoSapiens TRAV12-1*01:-1:..DSATYLCVVN, HomoSapiens TRAV12-1*02:-1:..DSATYLCVVN, HomoSapiens TRAV12-2*01:-1:..DSATYLCAVN, HomoSapiens TRAV12-2*02:-1:..SDSATYLCAV, HomoSapiens TRAV12-3*01:-1:..DSATYLCAMS, HomoSapiens TRAV12-3*02:-1:..DSATYLCAMS, HomoSapiens TRAV13-1*01:-1:..DSAVYFCAAS]

In [30]:
# Column mapping handed to the parser. The values are the column names of the
# structure table (a_v, a_j, ...); the keys are presumably the parser's own
# field names -- TODO confirm against DoubleChainVDJtoolsParser.
# NOTE(review): 'mhc.a' is pointed at the clone_id column; looks like a way to
# carry the structure ID through the parser -- confirm.
analysis_columns = {
    'Va': 'a_v',
    'Ja': 'a_j',
    'cdr3a': 'a_cdr3aa',
    'Vb': 'b_v',
    'Jb': 'b_j',
    'cdr3b': 'b_cdr3aa',
    'mhc.a': 'clone_id',
}
analysis_parser = DoubleChainVDJtoolsParser(
    column_mapping=analysis_columns,
    lib=SegmentLibrary.load_default(),
)
# Repertoire that will be embedded.
analysis_repertoire = Repertoire.load(
    parser=analysis_parser,
    path='processed_struct_wo_mice.txt',
)

In [31]:
# Column mapping handed to the parser. The values are the column names of the
# structure table; the keys are presumably the parser's own field names --
# TODO confirm against DoubleChainVDJtoolsParser.
proto_columns = {
    'Va': 'a_v',
    'Ja': 'a_j',
    'cdr3a': 'a_cdr3aa',
    'Vb': 'b_v',
    'Jb': 'b_j',
    'cdr3b': 'b_cdr3aa',
    'mhc.a': 'clone_id',
}
proto_parser = DoubleChainVDJtoolsParser(
    column_mapping=proto_columns,
    lib=SegmentLibrary.load_default(),
)
# Repertoire used as the set of prototypes for the embedding.
proto_repertoire = Repertoire.load(
    parser=proto_parser,
    path='processed_struct_wo_mice.txt',
)

In [32]:
# Show the repr of the prototype repertoire.
proto_repertoire

Repertoire of 210 clonotypes and 210 cells:
alpha κ-1 CAVTTDSWGKLQF beta κ-1 CASRPGLAGGRPEQYF
alpha κ-1 CAAMEGAQKLVF beta κ-1 CASSYPGGGFYEQYF
alpha κ-1 CAVSESPFGNEKLTF beta κ-1 CASSSTGLPYGYTF
alpha κ-1 CAVSESPFGNEKLTF beta κ-1 CASSSTGLPYGYTF
alpha κ-1 CILPLAGGTSYGKLTF beta κ-1 CASSLGQAYEQYF
{'path': 'processed_struct_wo_mice.txt'}
...

In [33]:
# Show the repr of the repertoire that will be embedded.
analysis_repertoire

Repertoire of 210 clonotypes and 210 cells:
alpha κ-1 CAVTTDSWGKLQF beta κ-1 CASRPGLAGGRPEQYF
alpha κ-1 CAAMEGAQKLVF beta κ-1 CASSYPGGGFYEQYF
alpha κ-1 CAVSESPFGNEKLTF beta κ-1 CASSSTGLPYGYTF
alpha κ-1 CAVSESPFGNEKLTF beta κ-1 CASSSTGLPYGYTF
alpha κ-1 CILPLAGGTSYGKLTF beta κ-1 CASSLGQAYEQYF
{'path': 'processed_struct_wo_mice.txt'}
...

In [34]:
# Display the repertoire's `total` attribute (the same value is printed in the
# timing message of the embedding cell).
analysis_repertoire.total

210

In [35]:
# Build the aligner from the default segment library first, then create the
# embedding object from the prototype repertoire and that aligner.
proto_aligner = ClonotypeAligner.from_library(lib=SegmentLibrary.load_default())
embedding_maker = PrototypeEmbedding(proto_repertoire, aligner=proto_aligner)

In [36]:
import time

In [37]:
# Embed the analysis repertoire and report how long it took.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring intervals, whereas the wall clock
# can jump if the system time is adjusted during the run.
t0 = time.perf_counter()
embeddings = embedding_maker.embed_repertoire(analysis_repertoire, threads=32, flatten_scores=True)
print(f'finished {analysis_repertoire.total} clones in {time.perf_counter() - t0}')

finished 210 clones in 2.8168885707855225


In [39]:
# One group of six column names per prototype index, in the order
# a_v, a_j, a_cdr3, b_v, b_j, b_cdr3, each prefixed with "<index>_".
column_names = []
for i in range(proto_repertoire.total):
    column_names.extend(
        f'{i}_{suffix}'
        for suffix in ('a_v', 'a_j', 'a_cdr3', 'b_v', 'b_j', 'b_cdr3')
    )

In [40]:
# Wrap the embed_repertoire() result in a DataFrame labelled with the generated
# column names. This rebinds `embeddings` to the DataFrame.
embeddings = pd.DataFrame(embeddings, columns=column_names)

In [41]:
# Display the embeddings DataFrame.
embeddings

Unnamed: 0,0_a_v,0_a_j,0_a_cdr3,0_b_v,0_b_j,0_b_cdr3,1_a_v,1_a_j,1_a_cdr3,1_b_v,...,208_a_cdr3,208_b_v,208_b_j,208_b_cdr3,209_a_v,209_a_j,209_a_cdr3,209_b_v,209_b_j,209_b_cdr3
0,479.0,108.0,420.0,505.0,79.0,550.0,211.0,-12.0,-60.0,505.0,...,-40.0,145.0,-2.0,-110.0,174.0,11.0,-80.0,126.0,-25.0,-50.0
1,211.0,-12.0,-20.0,505.0,79.0,10.0,490.0,101.0,300.0,505.0,...,-140.0,145.0,-2.0,20.0,176.0,-23.0,-110.0,126.0,-25.0,-100.0
2,111.0,-24.0,60.0,262.0,-22.0,-130.0,122.0,-16.0,-20.0,262.0,...,-20.0,162.0,-16.0,-80.0,103.0,5.0,-10.0,133.0,-16.0,-50.0
3,111.0,-24.0,60.0,262.0,-22.0,-130.0,122.0,-16.0,-20.0,262.0,...,-20.0,162.0,-16.0,-80.0,103.0,5.0,-10.0,133.0,-16.0,-50.0
4,36.0,4.0,-80.0,122.0,79.0,-30.0,68.0,4.0,-80.0,122.0,...,40.0,217.0,-2.0,30.0,50.0,-2.0,-50.0,182.0,-25.0,-10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,56.0,6.0,-20.0,145.0,-15.0,-140.0,82.0,-6.0,10.0,145.0,...,-130.0,496.0,-4.0,130.0,49.0,-18.0,-140.0,292.0,-12.0,0.0
206,56.0,6.0,-20.0,145.0,-15.0,-140.0,82.0,-6.0,10.0,145.0,...,-130.0,496.0,-4.0,130.0,49.0,-18.0,-140.0,292.0,-12.0,0.0
207,56.0,-4.0,-20.0,145.0,-2.0,-190.0,82.0,-7.0,-40.0,145.0,...,490.0,496.0,107.0,350.0,49.0,-8.0,-50.0,292.0,-8.0,30.0
208,56.0,-4.0,-20.0,145.0,-2.0,-190.0,82.0,-7.0,-40.0,145.0,...,490.0,496.0,107.0,350.0,49.0,-8.0,-50.0,292.0,-8.0,30.0


In [42]:
# Save the embeddings as CSV without the index column.
embeddings.to_csv('prototype_embeddings_struct.csv', index=False)