In [1]:
import numpy as np
import pandas as pd
from scipy.stats import entropy
import scipy.stats
import seaborn as sns
import sklearn.neighbors
import matplotlib.pyplot as plt
%matplotlib inline

import Levenshtein

import sys
sys.path.append('..')

from lib import *

In [2]:
# Load IEDB T-cell epitope assay records through the project helper from `lib`.
# The flags request human-only and positive-only records; the exact filtering
# rules live in the helper — TODO confirm against `lib`.
df_t = load_iedb_tcellepitopes(human_only=True, positive_only=True)

In [3]:
# Peptide length used for the human reference k-mers.
k = 9
# Count the k-mers of the `human` proteome (a name supplied by `from lib import *`).
# Presumably returns a Counter-like mapping keyed by k-mer string — TODO confirm.
counter9 = count_kmers_proteome(human, k, clean=True)

In [4]:
# Distinct human 9-mers as a set, so the membership tests below are O(1) on average.
human9 = set(counter9)

In [5]:
# Row filter over the IEDB assays (columns are addressed as a two-level key).
# no human epitopes
mask = ~df_t['Epitope', 'Parent Species'].str.contains('Homo sapiens', na=False)
# no epitopes of unknown provenance
mask &= df_t['Epitope', 'Parent Species'].notna()
# only epitopes of length 9
# `.str.len()` yields NaN for a missing description (and NaN == 9 is False),
# whereas `.apply(len)` raises TypeError on a float NaN.
mask &= df_t['Epitope', 'Description'].str.len() == 9
d = df_t[mask]

In [6]:
# Inspect which in-vivo process types occur among the retained assays.
d['1st in vivo Process', 'Process Type'].unique()

array(['Occurrence of infectious disease', 'Administration in vivo',
       'Exposure with existing immune reactivity without evidence for disease',
       'Exposure without evidence for disease',
       'Environmental exposure to endemic/ubiquitous agent without evidence for disease',
       'No immunization', 'Unknown',
       'Documented exposure without evidence for disease',
       'Occurrence of cancer', nan, 'Occurrence of autoimmune disease',
       'Occurrence of allergy', 'Occurrence of disease',
       'Transplant/transfusion'], dtype=object)

In [6]:
# Assay rows whose epitope sequence occurs verbatim among the human 9-mers.
d0 = d.loc[d['Epitope', 'Description'].apply(lambda peptide: peptide in human9)]
d0

Unnamed: 0_level_0,Reference,Reference,Reference,Reference,Reference,Reference,Reference,Reference,Reference,Epitope,...,Assay Antigen,Assay Antigen,Assay Antigen,Assay Antigen,Assay Antigen,Assay Antigen,Assay Antigen,Assay Antigen,Assay Antigen,Assay Comments
Unnamed: 0_level_1,Assay IRI,Reference IRI,Type,PubMed ID,Authors,Journal,Date,Title,Submission ID,Epitope IRI,...,Non-peptidic Antigen IRI,Antigen Source Molecule Name,Antigen Source Molecule IRI,Protein Parent Name,Protein Parent IRI,Antigen Organism Name,Antigen Organism IRI,Organism Species Name,Organism Species IRI,Assay Comments
71222,http://www.iedb.org/assay/1420885,http://www.iedb.org/reference/1004613,Literature,12960383.0,Gabriella Pietra; Chiara Romagnani; Paola Mazz...,Proc Natl Acad Sci U S A,2003,HLA-E-restricted recognition of cytomegaloviru...,,http://www.iedb.org/epitope/69922,...,,UL40,http://www.ncbi.nlm.nih.gov/protein/AAS48945.1,Protein UL40,http://www.uniprot.org/uniprot/Q6SW92,Human betaherpesvirus 5,http://purl.obolibrary.org/obo/NCBITaxon_10359,Human betaherpesvirus 5,http://purl.obolibrary.org/obo/NCBITaxon_10359,The epitope specific CD8+ T cell clones GF2.1 ...
71224,http://www.iedb.org/assay/1420887,http://www.iedb.org/reference/1004613,Literature,12960383.0,Gabriella Pietra; Chiara Romagnani; Paola Mazz...,Proc Natl Acad Sci U S A,2003,HLA-E-restricted recognition of cytomegaloviru...,,http://www.iedb.org/epitope/69922,...,,UL40,http://www.ncbi.nlm.nih.gov/protein/AAS48945.1,Protein UL40,http://www.uniprot.org/uniprot/Q6SW92,Human betaherpesvirus 5,http://purl.obolibrary.org/obo/NCBITaxon_10359,Human betaherpesvirus 5,http://purl.obolibrary.org/obo/NCBITaxon_10359,The epitope specific CD8+ T cell clone GF2.1 i...
71261,http://www.iedb.org/assay/1420962,http://www.iedb.org/reference/1004613,Literature,12960383.0,Gabriella Pietra; Chiara Romagnani; Paola Mazz...,Proc Natl Acad Sci U S A,2003,HLA-E-restricted recognition of cytomegaloviru...,,http://www.iedb.org/epitope/69922,...,,UL40,http://www.ncbi.nlm.nih.gov/protein/AAS48945.1,Protein UL40,http://www.uniprot.org/uniprot/Q6SW92,Human betaherpesvirus 5,http://purl.obolibrary.org/obo/NCBITaxon_10359,Human betaherpesvirus 5,http://purl.obolibrary.org/obo/NCBITaxon_10359,
79301,http://www.iedb.org/assay/1472651,http://www.iedb.org/reference/1006541,Literature,7520213.0,S S Witkin; J Jeremias; M Toth; W J Ledger,Am J Obstet Gynecol,1994,Proliferative response to conserved epitopes o...,,http://www.iedb.org/epitope/5394,...,,groE,http://www.ncbi.nlm.nih.gov/protein/AAA23128.1,60 kDa chaperonin,http://www.uniprot.org/uniprot/P0C0Z7,Chlamydia trachomatis,http://purl.obolibrary.org/obo/NCBITaxon_813,Chlamydia trachomatis,http://purl.obolibrary.org/obo/NCBITaxon_813,PBMC from four of ten patients experiencing tw...
108738,http://www.iedb.org/assay/1645537,http://www.iedb.org/reference/1014513,Literature,18198358.0,Hubert Tsui; Yin Chan; Lan Tang; Shawn Winer; ...,Diabetes,2008,Targeting of pancreatic glia in type 1 diabetes.,,http://www.iedb.org/epitope/107000,...,,glial fibrillary acidic protein,http://www.ncbi.nlm.nih.gov/protein/EDL34160.1,Glial fibrillary acidic protein,http://www.uniprot.org/uniprot/P03995,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,Epitope elicited a T-cell response that was si...
116420,http://www.iedb.org/assay/1664514,http://www.iedb.org/reference/1007092,Literature,1281825.0,J M Davies; S Sonoda; S Yashiki; M Osame; P R ...,J Neuroimmunol,1992,Mimicry between HTLV-I and myelin basic protei...,,http://www.iedb.org/epitope/113416,...,,Myelin basic protein,https://ontology.iedb.org/ontology/ONTIE_0002407,Myelin basic protein,http://www.uniprot.org/uniprot/P25188,Cavia porcellus,http://purl.obolibrary.org/obo/NCBITaxon_10141,Cavia porcellus,http://purl.obolibrary.org/obo/NCBITaxon_10141,
199545,http://www.iedb.org/assay/1962919,http://www.iedb.org/reference/1025272,Literature,23155466.0,Wendy W J Unger; Todd Pearson; Joana R F Abreu...,PLoS One,2012,Islet-specific CTL cloned from a type 1 diabet...,,http://www.iedb.org/epitope/103705,...,,"glucose-6-phosphatase, catalytic, related",http://www.ncbi.nlm.nih.gov/protein/NP_067306.1,Glucose-6-phosphatase 2,http://www.uniprot.org/uniprot/Q9Z186,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,"Using A2/epitope tetramers and CD8 antibodies,..."
199546,http://www.iedb.org/assay/1962920,http://www.iedb.org/reference/1025272,Literature,23155466.0,Wendy W J Unger; Todd Pearson; Joana R F Abreu...,PLoS One,2012,Islet-specific CTL cloned from a type 1 diabet...,,http://www.iedb.org/epitope/103705,...,,"glucose-6-phosphatase, catalytic, related",http://www.ncbi.nlm.nih.gov/protein/NP_067306.1,Glucose-6-phosphatase 2,http://www.uniprot.org/uniprot/Q9Z186,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,"Using A2/epitope tetramers and CD8 antibodies,..."
199547,http://www.iedb.org/assay/1962921,http://www.iedb.org/reference/1025272,Literature,23155466.0,Wendy W J Unger; Todd Pearson; Joana R F Abreu...,PLoS One,2012,Islet-specific CTL cloned from a type 1 diabet...,,http://www.iedb.org/epitope/103705,...,,"glucose-6-phosphatase, catalytic, related",http://www.ncbi.nlm.nih.gov/protein/NP_067306.1,Glucose-6-phosphatase 2,http://www.uniprot.org/uniprot/Q9Z186,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,Epitope-specific clone 7 produced IFNg in resp...
199549,http://www.iedb.org/assay/1962923,http://www.iedb.org/reference/1025272,Literature,23155466.0,Wendy W J Unger; Todd Pearson; Joana R F Abreu...,PLoS One,2012,Islet-specific CTL cloned from a type 1 diabet...,,http://www.iedb.org/epitope/103705,...,,"glucose-6-phosphatase, catalytic, related",http://www.ncbi.nlm.nih.gov/protein/NP_067306.1,Glucose-6-phosphatase 2,http://www.uniprot.org/uniprot/Q9Z186,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,Mus musculus,http://purl.obolibrary.org/obo/NCBITaxon_10090,Epitope-specific clone 7 produced granzyme B i...


In [7]:
# Source species of the epitopes that exactly match a human 9-mer.
d0['Epitope', 'Parent Species'].unique()

array(['Human betaherpesvirus 5', 'Chlamydia trachomatis', 'Mus musculus',
       'Cavia porcellus', 'Mycobacterium tuberculosis',
       'Leishmania major', 'Dengue virus'], dtype=object)

In [9]:
# Fraction of all 20**9 possible 9-mers that occur in the human set, i.e. the
# chance that a uniformly random 9-mer is an exact human match.
p = len(human9) / 20 ** 9
# (observed exact matches, expected matches under the uniform null)
# NOTE(review): rows are assays, so a peptide tested in several assays is
# counted more than once on both sides — confirm this is intended.
len(d0), p * len(d)

(25, 0.2294887638125)

In [10]:
def dist1(x):
    """Return True if `x` is within one substitution of a peptide in `human9`.

    Each position of `x` is replaced in turn by every residue in the global
    `aminoacids`. The original residue is tried too (provided it appears in
    `aminoacids`), so an exact match also returns True.
    """
    return any(
        x[:pos] + residue + x[pos + 1:] in human9
        for pos in range(len(x))
        for residue in aminoacids
    )
# Assay rows whose epitope is within one substitution of some human 9-mer.
d1 = d[d['Epitope', 'Description'].apply(dist1)]

In [12]:
# Observed rows at distance exactly one (d1 minus the exact matches in d0) vs.
# the expectation under the uniform null: each 9-mer has 19*9 single-substitution
# neighbours (19 alternative residues at each of 9 positions).
d1.shape[0]-d0.shape[0], p*d.shape[0]*19*9

(260, 39.242578611937496)

In [13]:
# Source species of the epitopes within one substitution of a human 9-mer.
d1['Epitope', 'Parent Species'].unique()

array(['Hepatitis B virus', 'Mycobacterium tuberculosis',
       'Influenza A virus', 'Dengue virus', 'Borreliella burgdorferi',
       'Hepacivirus C', 'Streptococcus pyogenes', 'Plasmodium falciparum',
       'Plasmodium vivax', 'Leishmania donovani', 'Trypanosoma cruzi',
       'Human gammaherpesvirus 4', 'Chlamydia trachomatis',
       'Human betaherpesvirus 5', 'Measles morbillivirus',
       'Severe acute respiratory syndrome-related coronavirus',
       'Vaccinia virus', 'Triticum aestivum', 'Human metapneumovirus',
       'Primate T-lymphotropic virus 1', 'Mus musculus',
       'Mycobacterium kansasii', 'Cavia porcellus', 'Toxoplasma gondii',
       'Human alphaherpesvirus 1', 'Alphapapillomavirus 9',
       'Mycobacterium leprae', 'Leishmania major',
       'Human mastadenovirus C', 'Bacteroides stercoris'], dtype=object)

# Hamming distance calculation

In [28]:
import random  # not among the notebook's explicit imports, so bring it in here

# Draw a reproducible sample of 100,000 human 9-mers.
# - random.sample() no longer accepts a set (deprecated in Python 3.9, TypeError
#   from 3.11), so the population is materialised as a sequence first.
# - Sorting fixes the population order (set order of strings varies with hash
#   randomisation), and a dedicated seeded generator makes the draw repeatable
#   without touching the global random state.
humansample = random.Random(12345).sample(sorted(human9), 100000)
# Encode each peptide with the project helper; presumably one integer vector
# per peptide — TODO confirm against `lib`.
points = np.asarray([map_aatonumber(h) for h in humansample])


In [29]:
# Pure-Python equivalent of Levenshtein.hamming for equal-length strings,
# kept for reference (counts mismatching positions):
#   sum(c1 != c2 for c1, c2 in zip(s1, s2))
def mindist(x, sample):
    """Return the smallest Hamming distance between `x` and any string in `sample`.

    Brute force: one `Levenshtein.hamming` call per element of `sample`.
    An empty `sample` makes `min` raise ValueError.
    """
    return min(map(lambda candidate: Levenshtein.hamming(candidate, x), sample))

In [30]:
# Sanity check: brute-force minimum distance of a test peptide to the sample.
mindist('AAACCCAAA', humansample)

3

In [31]:
# Baseline timing of the brute-force scan over the 100,000-peptide sample.
%timeit -t mindist('AAACCCAAA', humansample)

55.1 ms ± 2.34 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [32]:
# Index the encoded sample in a ball tree. sklearn's 'hamming' metric is the
# *fraction* of differing coordinates, which is why mindist_sklearn rescales
# the result by the peptide length.
bt = sklearn.neighbors.BallTree(points, metric='hamming')

In [33]:
def mindist_sklearn(x, tree):
    """Return the minimum Hamming distance (in positions) from `x` to `tree`.

    Parameters
    ----------
    x : str
        Query peptide. It is encoded with `map_aatonumber`, which is assumed
        to return a 1-D numpy array (it is reshaped into a single query row).
    tree : object with a `query` method
        Neighbour index built with ``metric='hamming'``, e.g. a BallTree.

    Returns
    -------
    int
        Number of mismatching positions to the nearest indexed point.
    """
    distances, _ = tree.query(map_aatonumber(x).reshape(1, -1))
    # query() returns a (1, 1) array of fractional distances. Pull out the
    # scalar explicitly (int() on a non-0-d array is deprecated in NumPy) and
    # round instead of truncating: fraction * length can land just below the
    # true integer count in floating point.
    return int(round(float(distances[0, 0]) * len(x)))

In [34]:
# Should agree with the brute-force mindist result on the same sample.
mindist_sklearn('AAACCCAAA', bt)

3

In [35]:
# Timing of the ball-tree lookup, for comparison with the brute-force scan.
%timeit -t mindist_sklearn('AAACCCAAA', bt)

4.77 ms ± 838 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Minimum distance against all human 9-mers

In [43]:
# Brute-force minimum distance against the complete set of human 9-mers.
mindist('AAACCCAAA', human9)

2

In [46]:
# Timing of the brute-force scan over the complete set.
%timeit -t mindist('AAACCCAAA', human9)

4.09 s ± 6.79 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
# Encode every human 9-mer with the project helper and stack the results.
human9_number = np.asarray(list(map(map_aatonumber, human9)))

In [52]:
# Split the encoded 9-mers into 100 chunks along the first axis
# (np.array_split tolerates sizes that do not divide evenly).
pointss = np.array_split(human9_number, 100)

In [53]:
# One ball tree per chunk of encoded 9-mers.
bts = [sklearn.neighbors.BallTree(chunk, metric='hamming') for chunk in pointss]

In [54]:
def mindist_sklearn_chunked(x, trees):
    """Return the minimum Hamming distance (in positions) from `x` over several trees.

    Parameters
    ----------
    x : str
        Query peptide. It is encoded with `map_aatonumber`, which is assumed
        to return a 1-D numpy array (it is reshaped into a single query row).
    trees : iterable of objects with a `query` method
        Neighbour indexes built with ``metric='hamming'``, each covering one
        chunk of the reference points. An empty iterable makes `min` raise
        ValueError.

    Returns
    -------
    int
        Number of mismatching positions to the nearest point in any tree.
    """
    # Encode the query once; it is the same for every tree.
    query = map_aatonumber(x).reshape(1, -1)
    # Each query() returns a (1, 1) array of fractional distances; compare
    # plain floats rather than one-element arrays.
    nearest = min(float(tree.query(query)[0][0, 0]) for tree in trees)
    # Round instead of truncating: fraction * length can land just below the
    # true integer count in floating point.
    return int(round(nearest * len(x)))

In [55]:
# Should agree with the brute-force result over the complete set.
mindist_sklearn_chunked('AAACCCAAA', bts)

2

In [56]:
# Timing of the chunked ball-tree lookup over the complete set.
%timeit -t mindist_sklearn_chunked('AAACCCAAA', bts)

806 ms ± 75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
