In [336]:
import pandas as pd
import numpy as np
from glob import glob
import ahocorasick
from Bio import pairwise2,SeqIO
from Bio.pairwise2 import format_alignment
from tqdm import tqdm
import ast
import swifter
import datetime
import multiprocessing as mp
from pandarallel import pandarallel
from tqdm import tqdm
import os
import re
from multiprocessing import Manager
import warnings
warnings.filterwarnings('ignore')
tqdm.pandas()

In [3]:
os.chdir("/home1/wangd/jupyter_code")

In [35]:
final_pep_gene_symbol = pd.read_table("count-tables/peps_gene_symbol.tsv")

In [36]:
final_pep_gene_symbol_no_gene_symbol = final_pep_gene_symbol[final_pep_gene_symbol["gene_symbol"].isna()]
final_pep_gene_symbol_no_gene_symbol_set = set(final_pep_gene_symbol_no_gene_symbol["peptide"])
len(final_pep_gene_symbol_no_gene_symbol_set)

46

In [37]:
all_res_peps = set(final_pep_gene_symbol["peptide"])
print("Number of final peptides: {}".format(len(all_res_peps)))

Number of final peptides: 4991


In [38]:
gene_symbol_proteins = {}
proteins_info = {}
for record in SeqIO.parse("protein_db/Homo_sapiens-GRCh38_all-pep.fa", 'fasta'):
    gene_symbol_list = [x for x in record.description.split(" ") if x.startswith('gene_symbol:')]
    if gene_symbol_list:
        gene_symbol = gene_symbol_list[0].split(':')[-1]
    else:
        continue
    try:
        gene_symbol_proteins[gene_symbol].append(str(record.seq) + ';' + str(record.description))
    except KeyError:
        gene_symbol_proteins[gene_symbol] = [str(record.seq) + ';' + str(record.description)]

    try:
        proteins_info[str(record.seq)].append(str(record.description))
    except KeyError:
        proteins_info[str(record.seq)] = [str(record.description)]

In [39]:
seq_gene_symbol_set = set()
seq_no_gene_symbol_set = set()
for index,row in final_pep_gene_symbol.iterrows():
    try:
        gene_symbols = row["gene_symbol"].split(",")
        for gene_symbol in gene_symbols:
            seq_gene_symbol_set.add(row["peptide"] + '+' + gene_symbol)
    except AttributeError:
        seq_no_gene_symbol_set.add(row["peptide"])

print("There are {} peptides with no corresponding gene symbol found.".format(len(seq_no_gene_symbol_set)))
for seq in seq_no_gene_symbol_set:
    print(seq)

There are 46 peptides with no corresponding gene symbol found.
SSPVFQIPKNDNIPEQDSLGLSNLQK
LEEECEGREPGLETGTQAADCK
LSCAASGFTVSSNYTSWVR
ASQSVSSSYLAWYQQTPGQAPR
GLEWVTIISYDGSIQYYADSVK
STFGGGTK
ASRSVSSNLAWYQQKPGQAPR
ASGYTYTDYYMHWVR
TPESFLGPNAALVDLDSLVSRPGPTLPGAK
ALEWLAVIDWDDDK
AEDTAVYFCAK
ASGYTFTNNGITWVR
LSCAVSGFTFSSYAMHWVR
LLIYGNTNRPSGVPDR
NTLYHQMNSLRAEDTAVYYCAR
KTPESFLGPNAALVDLDSLVSRPGPTLPGAK
TVAPPVAGPSVFLFPPKPK
NQVVLTMTNMNPVDTATYYCAR
LLIYGNTNRPSGVPDRFSGSK
NDNIPEQDSLGLSNLQK
DTSISTAYMDLSR
LEWMGWINTGNGNTK
ESGPVLVKPTETLTLTCTISGFSLSNAR
NLVGNAGALHYSDEVEIIQGLTR
APKVLIYGASSLQSGVPSR
VLIYGASSLQSGVPSR
SAGGGTYYADSVK
TGEVVLTQSPGTLSLSPGER
YKSDSDNQQGSGVPSR
DLCPLLSEHGLQCSLEPHLNSNLCVYCCK
GLEWVALISYDGSDK
ANGGTTDYAAPVK
SKANGGTTDYAAPVK
SEDTAAYYCAR
MSKDALNLVQMQEQTLQLEQQSK
VDKTVAPPVAGPSVFLFPPKPK
LSCAASGFTFSTYGMHWVR
SDSDNQQGSGVPSR
ASQSISTWLAWYQQKPGK
WTAVVVPSGEEQR
GLEWVALISYDGSDKYYADSVK
QAPGQGLEWMGWISAYNGDTNYAQK
ASQSISTWLAWYQQK
IIPILGTANYAQKFQGR
LEEECEGREPGLETGTQAADCKDAPLK
LLIFGASTR


In [40]:
gene_symbol_set = set()
for i in seq_gene_symbol_set:
    if i.split("+")[1]:
        gene_symbol_set.add(i.split("+")[1])

In [41]:
len(gene_symbol_set)

2462

In [42]:
def get_details(start, pep_in_caonical, non_canonical_pep_seq):
    res = []
    i = 0
    j = 0
    for AA1, AA2 in zip(pep_in_caonical, non_canonical_pep_seq):
        i += 1
        j += 1
        if AA1 == AA2:
            continue
        if AA1 == '-' and AA2 != '-':
            # insertion
            res.append(str(start + i) + "-" + ">" + AA2)
            i -= 1
        elif AA2 == '-' and AA1 != '-':
            # deletion
            res.append(str(start + i) + AA1 + ">" + "-")
            j -= 1
        else:
            # substitution
            res.append(str(start + i) + AA1 + ">" + AA2)
    return res

In [43]:
def get_blast_canonical_mismatch1or2_info(pep,prot):
    non_canonical_pep_seq = pep
    canonical_seq = prot

    alignments = pairwise2.align.localms(sequenceA = canonical_seq,sequenceB = non_canonical_pep_seq,match = 1,mismatch = -1,open = -1,extend = -1,one_alignment_only=True)
    alignment = alignments[0]
    start = alignment.start
    end = alignment.end

    res = get_details(start,alignment.seqA[start:end], alignment.seqB[start:end])

    if len(res) == 0:
        score = alignment.score
        mismatch_nums = int(len(non_canonical_pep_seq) - score)
        if mismatch_nums in [1,2]:
            res_left = get_details(start-mismatch_nums,alignment.seqA[start-mismatch_nums:end], alignment.seqB[start-mismatch_nums:end])
            res_right = get_details(start,alignment.seqA[start:end+mismatch_nums], alignment.seqB[start:end+mismatch_nums])
            if len(res_left) < len(res_right):
                return res_left
            elif len(res_left) > len(res_right):
                return res_right
            else:
                l_num = 0
                r_num = 0
                for l in res_left:
                    if "-" in l:
                        l_num+=1
                for r in res_right:
                    if "-" in r:
                        r_num+=1
                if l_num > r_num:
                    return res_right
                else:
                    return res_left
        else:
            return ""
    elif len(res) in [1,2]:
        return res
    else:
        return ""

In [45]:
blast_infos = dict()
blast_infos_details = dict()
gene_symbol_no_protein_set = set()
for seq_gene_symbol in tqdm(seq_gene_symbol_set):
    pep_seq = seq_gene_symbol.split("+")[0]
    gene_symbol = seq_gene_symbol.split("+")[1]
    if gene_symbol in gene_symbol_proteins.keys():
        for info in gene_symbol_proteins[gene_symbol]:
            prot_seq = info.split(";")[0]
            prot_info = info.split(";")[1]

            alignments_score = pairwise2.align.localms(sequenceA = prot_seq,sequenceB = pep_seq,match = 1,mismatch = -1,open = -1,extend = -1,score_only = True)
            if alignments_score >= len(pep_seq) - 4:
                res = get_blast_canonical_mismatch1or2_info(pep_seq,prot_seq)
                if res:
                    blast_infos_details_key = pep_seq + ";" + prot_seq
                    blast_infos_details[blast_infos_details_key] = res

                    try:
                        blast_infos[seq_gene_symbol].append(info)
                    except KeyError:
                        blast_infos[seq_gene_symbol] = [info]
    else:
        gene_symbol_no_protein_set.add(gene_symbol)
print("There are {} gene_symbols with no corresponding canonical protein sequence found.".format(len(gene_symbol_no_protein_set)))
for gene_symbol in gene_symbol_no_protein_set:
    print(gene_symbol)

100%|██████████| 5022/5022 [58:55<00:00,  1.42it/s]   

There are 22 gene_symbols with no corresponding canonical protein sequence found.
UHRF1BP1
apol1
FAM160B1
FAM160A2
WDR92
H2BS1
GATD3A
H3-2
FAM160B2
CBWD2
BTBD11
DDX58
SKIV2L
GATD3B
Pwp2
RPS27AP5
LRRC6
srpk1a
EEF1AKNMT
GBA
ELOA3CP
FAM207A





#### Peptide + gene_symbol + protein

In [None]:
peptide_gene_symbol_protein = dict()
for key, value in blast_infos.items():
    prot_set = set()

    pep_seq = key.split("+")[0]
    gene_symbol = key.split("+")[1]

    for info in value:
        prot_set.add(info.split(";")[0])

    if peptide_gene_symbol_protein.get(pep_seq):
        peptide_gene_symbol_protein[pep_seq][gene_symbol] = prot_set
    else:
        peptide_gene_symbol_protein[pep_seq] = {gene_symbol:prot_set}

In [None]:
df = pd.DataFrame([(peptide, gene_symbol, ",".join(protein)) for peptide, inner_dict in peptide_gene_symbol_protein.items() for gene_symbol, protein in inner_dict.items()],
                  columns=['peptide', 'gene_symbol', 'protein'])
df.to_csv("blast_canonical-count-tables/1-2mismatches_peptides/peptide_gene_symbol_protein.tsv", header=1, sep="\t", index = None)

In [None]:
df["canonical_protein_seq"] = df["protein"].apply(lambda x : x.split(","))
df = df.explode("canonical_protein_seq", ignore_index=True)
df = df.drop("protein", axis=1)

In [None]:
def get_blast_infos_details(x):
    return blast_infos_details[x["peptide"] + ";" + x["canonical_protein_seq"]]

df["mismatch_info"] = df.apply(lambda x: get_blast_infos_details(x), axis=1)
df["number_mismatches"] = df.apply(lambda x: len(x["mismatch_info"]), axis=1)
df["mismatch_info"] = df.apply(lambda x: ",".join(x["mismatch_info"]), axis=1)

In [None]:
def get_protein_infos(canonical_protein_seq):
    canonical_protein_IDs = set()
    canonical_transcript_IDs = set()
    gene_IDs = set()
    for info in proteins_info[canonical_protein_seq]:
        canonical_protein_IDs.add(info.split(" ")[0])
        canonical_transcript_IDs.add([x for x in info.split(" ") if x.startswith('transcript:')][0].split(':')[-1])
        gene_IDs.add([x for x in info.split(" ") if x.startswith('gene:')][0].split(':')[-1])
    return [",".join(canonical_protein_IDs), ",".join(canonical_transcript_IDs), ",".join(gene_IDs)]
df["canonical_protein_ID"] = df.apply(lambda x: get_protein_infos(x["canonical_protein_seq"])[0], axis=1)
df["canonical_transcript_ID"] = df.apply(lambda x: get_protein_infos(x["canonical_protein_seq"])[1], axis=1)
df["gene_ID"] = df.apply(lambda x: get_protein_infos(x["canonical_protein_seq"])[2], axis=1)

In [None]:
df.to_csv("blast_canonical-count-tables/1-2mismatches_peptides_match_info/peptide_gene_symbol_protein_matching_info.tsv", header=1, sep="\t", index = None)

#### Add gca protein blast info

In [4]:
peptide_blast_res = pd.read_table("blast_canonical-count-tables/1-2mismatches_peptides_match_info/peptide_gene_symbol_protein_matching_info.tsv")

In [5]:
peptide_blast_res

Unnamed: 0,peptide,gene_symbol,canonical_protein_seq,mismatch_info,number_mismatches,canonical_protein_ID,canonical_transcript_ID,gene_ID
0,LINLGGENIR,COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,2188Q>R,1,ENSP00000422898.2,ENST00000512836.6,ENSG00000172752.16
1,LINLGGENIR,COL6A5,NQYPPPMLEDACRLINLGGENIQNDGFQFVTELQEDFLGGNGFIGQ...,23Q>R,1,ENSP00000424968.1,ENST00000512482.1,ENSG00000172752.16
2,LINLGGENIR,COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,2188Q>R,1,ENSP00000362250.5,ENST00000373157.9,ENSG00000172752.16
3,LINLGGENIR,COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,2188Q>R,1,ENSP00000309762.7,ENST00000312481.11,ENSG00000172752.16
4,DFAHMLLLK,CYP27A1,MAALGCARLRWALRGAGRGLCPHGARAKAAIPAALPSDKATGAPGA...,384P>L,1,ENSP00000258415.4,ENST00000258415.9,ENSG00000135929.9
...,...,...,...,...,...,...,...,...
16149,VLLTCWDPEENKMAPCEFGVLNNLANVLSQHLNQK,HLA-DMB,MITFLPLLLGLSLGCTGAGGFVAHVESTCLLDDAGTPKDFTYCISF...,71S>N,1,"ENSP00000399130.2,ENSP00000402800.2,ENSP000003...","ENST00000428864.6,ENST00000447454.6,ENST000004...","ENSG00000242092.8,ENSG00000239329.8,ENSG000002..."
16150,VLLTCWDPEENKMAPCEFGVLNNLANVLSQHLNQK,HLA-DMB,MITFLPLLLGLSLGCTGAGGFVAHVESTCLLDDAGTPKDFTYCISF...,71S>N,1,"ENSP00000393646.2,ENSP00000378723.3,ENSP000004...","ENST00000428420.6,ENST00000383231.6,ENST000003...","ENSG00000242092.8,ENSG00000239329.8,ENSG000002..."
16151,QFRFDPQFALTNIAVTK,HLA-DQA1,MILNKALMLGALALTTVMSPCGGEDIVADHVASYGVNLYQSYGPSG...,91L>T,1,"ENSP00000382586.1,ENSP00000372738.2,ENSP000003...","ENST00000399675.5,ENST00000399678.5,ENST000003...","ENSG00000257473.7,ENSG00000206305.12"
16152,LEAEGEAMEDAAAPGNDRGGTQEPAPVPAEPFDNTTYK,NDUFV3,MAAPCLLRQGRAGALKTMLQEAQVFRGLASTVSLSAESGKSEKGQP...,415D>N,1,ENSP00000346196.2,ENST00000354250.7,ENSG00000160194.18


In [6]:
peptides_blast = set(peptide_blast_res["peptide"])

In [7]:
auto = ahocorasick.Automaton()

for seq in peptides_blast:
    auto.add_word(seq, seq)

auto.make_automaton()

peps_prots = {}
for prot_file in tqdm(glob('protein_db/GCA_peps/*.fa')):
    prot_seqs = set()
    prots_dict = {}
    parsed_file = SeqIO.parse(prot_file, 'fasta')
    sample_name = prot_file.split('/')[-1].split('-')[1]

    for record in parsed_file:
        try:
            prots_dict[str(record.seq)].append(sample_name + ' ' +  str(record.id) + ' ' +  str(record.seq) + ' ' + str(record.description))
        except KeyError:
            prots_dict[str(record.seq)] = [sample_name + ' ' + str(record.id) + ' ' +  str(record.seq) + ' ' + str(record.description)]

    found_prots = set()

    for prot_seq in prots_dict.keys():
        for end_ind, found in auto.iter(prot_seq):
            found_prots.add(found)
            
            infos_list = []
            for prot_info in prots_dict[prot_seq]:
                infos_old = prot_info.split(" ")
                try:
                    gene_symbol = [x for x in infos_old if x.startswith('gene_symbol:')][0].split(':')[-1]
                    infos = [infos_old[0], infos_old[2], gene_symbol]
                except IndexError:
                    infos = ' '.join([infos_old[0], infos_old[2], "no_gene_symbol"])
                infos_list.append(infos)
            
            try:
                peps_prots[found].append(infos_list)
            except KeyError:
                peps_prots[found] = [infos_list]

100%|██████████| 97/97 [03:27<00:00,  2.14s/it]


In [8]:
def gca_prot_blast_canonical_info(gca_prots,canonical_prot):
    need_blast_again = set()
    score = float('-inf')
    for gca_prot in gca_prots:
        alignment_score = pairwise2.align.globalms(sequenceA = canonical_prot,sequenceB = gca_prot, match = 1,mismatch = -1,open = -1,extend = -1,score_only = True)
        if alignment_score > score:
            need_blast_again = {gca_prot}
            score = alignment_score
        elif alignment_score == score:
            need_blast_again.add(gca_prot)
        else:
            continue
    final_res = []
    for gca_prot in need_blast_again:
        alignments = pairwise2.align.globalms(sequenceA = canonical_prot,sequenceB = gca_prot, match = 1,mismatch = -1,open = -1,extend = -1,one_alignment_only=True)
        alignment = alignments[0]
        res = []
        i = 0
        j = 0
        for AA1, AA2 in zip(alignment.seqA, alignment.seqB):
            i += 1
            j += 1
            if AA1 == AA2:
                continue
            if AA1 == '-' and AA2 != '-':
                # insertion
                res.append(str(i) + "-" + ">" + AA2)
                i -= 1
            elif AA2 == '-' and AA1 != '-':
                # deletion
                res.append(str(i) + AA1 + ">" + "-")
                j -= 1
            else:
                # substitution
                res.append(str(i) + AA1 + ">" + AA2)
        final_res.append([gca_prot,res,alignment.score])
    return final_res

In [9]:
def gca_prot_blast_with_canonical_prot(x):
    canonical_prot = x["canonical_protein_seq"]
    gene_symbol_need = x["gene_symbol"]
    peptide = x["peptide"]
    gca_prot_sample = dict()
    for infos_list in peps_prots[peptide]:
        for infos in infos_list:
            sample = infos[0]
            gca_prot = infos[1]
            gene_symbol = infos[2]

            if gene_symbol == gene_symbol_need:
                try:
                    gca_prot_sample[gca_prot].append(sample)
                except KeyError:
                    gca_prot_sample[gca_prot] = [sample]
    mismatch_info = gca_prot_blast_canonical_info(gca_prot_sample.keys(),canonical_prot)
    for info in mismatch_info:
        info.append(gca_prot_sample[info[0]])
    return mismatch_info

In [10]:
peptide_blast_res

Unnamed: 0,peptide,gene_symbol,canonical_protein_seq,mismatch_info,number_mismatches,canonical_protein_ID,canonical_transcript_ID,gene_ID
0,LINLGGENIR,COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,2188Q>R,1,ENSP00000422898.2,ENST00000512836.6,ENSG00000172752.16
1,LINLGGENIR,COL6A5,NQYPPPMLEDACRLINLGGENIQNDGFQFVTELQEDFLGGNGFIGQ...,23Q>R,1,ENSP00000424968.1,ENST00000512482.1,ENSG00000172752.16
2,LINLGGENIR,COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,2188Q>R,1,ENSP00000362250.5,ENST00000373157.9,ENSG00000172752.16
3,LINLGGENIR,COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,2188Q>R,1,ENSP00000309762.7,ENST00000312481.11,ENSG00000172752.16
4,DFAHMLLLK,CYP27A1,MAALGCARLRWALRGAGRGLCPHGARAKAAIPAALPSDKATGAPGA...,384P>L,1,ENSP00000258415.4,ENST00000258415.9,ENSG00000135929.9
...,...,...,...,...,...,...,...,...
16149,VLLTCWDPEENKMAPCEFGVLNNLANVLSQHLNQK,HLA-DMB,MITFLPLLLGLSLGCTGAGGFVAHVESTCLLDDAGTPKDFTYCISF...,71S>N,1,"ENSP00000399130.2,ENSP00000402800.2,ENSP000003...","ENST00000428864.6,ENST00000447454.6,ENST000004...","ENSG00000242092.8,ENSG00000239329.8,ENSG000002..."
16150,VLLTCWDPEENKMAPCEFGVLNNLANVLSQHLNQK,HLA-DMB,MITFLPLLLGLSLGCTGAGGFVAHVESTCLLDDAGTPKDFTYCISF...,71S>N,1,"ENSP00000393646.2,ENSP00000378723.3,ENSP000004...","ENST00000428420.6,ENST00000383231.6,ENST000003...","ENSG00000242092.8,ENSG00000239329.8,ENSG000002..."
16151,QFRFDPQFALTNIAVTK,HLA-DQA1,MILNKALMLGALALTTVMSPCGGEDIVADHVASYGVNLYQSYGPSG...,91L>T,1,"ENSP00000382586.1,ENSP00000372738.2,ENSP000003...","ENST00000399675.5,ENST00000399678.5,ENST000003...","ENSG00000257473.7,ENSG00000206305.12"
16152,LEAEGEAMEDAAAPGNDRGGTQEPAPVPAEPFDNTTYK,NDUFV3,MAAPCLLRQGRAGALKTMLQEAQVFRGLASTVSLSAESGKSEKGQP...,415D>N,1,ENSP00000346196.2,ENST00000354250.7,ENSG00000160194.18


In [11]:
start_time = datetime.datetime.now()
print(start_time)

# tqdm.pandas()
# peptide_blast_res["GCA_protein_seq"] = peptide_blast_res.progress_apply(gca_prot_blast_with_canonical_prot,axis=1)

pandarallel.initialize(nb_workers=5, use_memory_fs=False, progress_bar=True)
peptide_blast_res["GCA_protein_seq"] = peptide_blast_res.parallel_apply(gca_prot_blast_with_canonical_prot,axis=1)

end_time = datetime.datetime.now()
print(end_time)
print(end_time - start_time)

2024-05-02 16:05:16.541568
INFO: Pandarallel will run on 5 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3231), Label(value='0 / 3231'))), …

2024-05-03 21:17:06.422416
1 day, 5:11:49.880848


In [12]:
peptide_blast_res

Unnamed: 0,peptide,gene_symbol,canonical_protein_seq,mismatch_info,number_mismatches,canonical_protein_ID,canonical_transcript_ID,gene_ID,GCA_protein_seq
0,LINLGGENIR,COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,2188Q>R,1,ENSP00000422898.2,ENST00000512836.6,ENSG00000172752.16,[[MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPK...
1,LINLGGENIR,COL6A5,NQYPPPMLEDACRLINLGGENIQNDGFQFVTELQEDFLGGNGFIGQ...,23Q>R,1,ENSP00000424968.1,ENST00000512482.1,ENSG00000172752.16,[[NQYPPPMLEDACRLINLGGENIRNDGFQFVTELQEDFLGGNGFI...
2,LINLGGENIR,COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,2188Q>R,1,ENSP00000362250.5,ENST00000373157.9,ENSG00000172752.16,[[MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPK...
3,LINLGGENIR,COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,2188Q>R,1,ENSP00000309762.7,ENST00000312481.11,ENSG00000172752.16,[[MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPK...
4,DFAHMLLLK,CYP27A1,MAALGCARLRWALRGAGRGLCPHGARAKAAIPAALPSDKATGAPGA...,384P>L,1,ENSP00000258415.4,ENST00000258415.9,ENSG00000135929.9,[[MAALGCARLRWALRGAGRGLCPHGARAKAAIPAALPSDKATGAP...
...,...,...,...,...,...,...,...,...,...
16149,VLLTCWDPEENKMAPCEFGVLNNLANVLSQHLNQK,HLA-DMB,MITFLPLLLGLSLGCTGAGGFVAHVESTCLLDDAGTPKDFTYCISF...,71S>N,1,"ENSP00000399130.2,ENSP00000402800.2,ENSP000003...","ENST00000428864.6,ENST00000447454.6,ENST000004...","ENSG00000242092.8,ENSG00000239329.8,ENSG000002...",[[MITFLPLLLGLSLGCTGAGGFVAHVESACLLDDAGTPKDFTYCI...
16150,VLLTCWDPEENKMAPCEFGVLNNLANVLSQHLNQK,HLA-DMB,MITFLPLLLGLSLGCTGAGGFVAHVESTCLLDDAGTPKDFTYCISF...,71S>N,1,"ENSP00000393646.2,ENSP00000378723.3,ENSP000004...","ENST00000428420.6,ENST00000383231.6,ENST000003...","ENSG00000242092.8,ENSG00000239329.8,ENSG000002...",[[MITFLPLLLGLSLGCTGAGGFVAHVESACLLDDAGTPKDFTYCI...
16151,QFRFDPQFALTNIAVTK,HLA-DQA1,MILNKALMLGALALTTVMSPCGGEDIVADHVASYGVNLYQSYGPSG...,91L>T,1,"ENSP00000382586.1,ENSP00000372738.2,ENSP000003...","ENST00000399675.5,ENST00000399678.5,ENST000003...","ENSG00000257473.7,ENSG00000206305.12",[[MILNKALLLGALALTTVMSPCGGEDIVADHVASYGVNLYQSYGP...
16152,LEAEGEAMEDAAAPGNDRGGTQEPAPVPAEPFDNTTYK,NDUFV3,MAAPCLLRQGRAGALKTMLQEAQVFRGLASTVSLSAESGKSEKGQP...,415D>N,1,ENSP00000346196.2,ENST00000354250.7,ENSG00000160194.18,[[MAAPCLLRQGRAGALKTMLQEAQVFRGLASTVSLSAESGKSEKG...


In [13]:
peptide_blast_res = peptide_blast_res.explode("GCA_protein_seq", ignore_index=True)

In [14]:
peptide_blast_res.to_csv("blast_canonical-count-tables/1-2mismatches_peptides_match_info/peptide_gene_symbol_protein_matching_info_add.tsv", header=1, sep="\t", index = None)

####  Create gca protein blast info tsv

In [15]:
peptide_blast_res = pd.read_table("blast_canonical-count-tables/1-2mismatches_peptides_match_info/peptide_gene_symbol_protein_matching_info_add.tsv")

In [16]:
peptide_blast_res["GCA_protein_seq"] = peptide_blast_res["GCA_protein_seq"].apply(ast.literal_eval)

In [17]:
peptide_blast_res["samples"] = peptide_blast_res["GCA_protein_seq"].apply(lambda x: x[3])
peptide_blast_res["protein_mismatch_info"] = peptide_blast_res["GCA_protein_seq"].apply(lambda x: x[1])
peptide_blast_res["protein_number_mismatches"] = peptide_blast_res["protein_mismatch_info"].apply(lambda x:len(x))
peptide_blast_res["identity score"] = peptide_blast_res["GCA_protein_seq"].apply(lambda x: x[2])
peptide_blast_res["GCA_protein_seq"] = peptide_blast_res["GCA_protein_seq"].apply(lambda x: x[0])

peptide_blast_res["samples"] = peptide_blast_res["samples"].apply(lambda x: ",".join(x))
peptide_blast_res["protein_mismatch_info"] = peptide_blast_res["protein_mismatch_info"].apply(lambda x: ",".join(x))

In [18]:
gca_prot_peptide = dict()
for index, row in peptide_blast_res.iterrows():
    peptide_mismatch_info = [row["peptide"], row["mismatch_info"]]
    try:
        gca_prot_peptide[row["GCA_protein_seq"]].append(peptide_mismatch_info)
    except KeyError:
        gca_prot_peptide[row["GCA_protein_seq"]] = [peptide_mismatch_info]

In [19]:
peptide_blast_res = peptide_blast_res.drop(columns=['peptide','mismatch_info','number_mismatches'])
peptide_blast_res = peptide_blast_res.drop_duplicates()

In [20]:
peptide_blast_res = peptide_blast_res.rename(columns={'samples': 'sample_ids'})
peptide_blast_res = peptide_blast_res.rename(columns={'canonical_protein_seq': 'matching_canonical_protein_seq'})
peptide_blast_res = peptide_blast_res.rename(columns={'protein_mismatch_info': 'all_mismatch_info'})
peptide_blast_res = peptide_blast_res.rename(columns={'protein_number_mismatches': 'number_mismatches'})

In [21]:
def get_peptides(prot):
    peptides = set()
    for info in gca_prot_peptide[prot]:
        peptides.add(info[0] + "(" + info[1] + ")")
    return ";".join(peptides)
def get_peptide_mismatch_details(prot):
    mismatchs = set()
    for info in gca_prot_peptide[prot]:
        mismatchs = mismatchs.union(set(info[1].split(",")))
    return ",".join(mismatchs)
def get_other_mismatchs(x):
    return set(x["all_mismatch_info"].split(",")) - set(x["peptides_mismatch_info"].split(","))
peptide_blast_res["list of peptides belong to this protein"] = peptide_blast_res["GCA_protein_seq"].apply(get_peptides)
peptide_blast_res["peptides_mismatch_info"] = peptide_blast_res["GCA_protein_seq"].apply(get_peptide_mismatch_details)
peptide_blast_res["other_mismatch_info"] = peptide_blast_res.apply(lambda x: ",".join(get_other_mismatchs(x)), axis = 1)

In [22]:
peptide_blast_res

Unnamed: 0,gene_symbol,matching_canonical_protein_seq,canonical_protein_ID,canonical_transcript_ID,gene_ID,GCA_protein_seq,sample_ids,all_mismatch_info,number_mismatches,identity score,list of peptides belong to this protein,peptides_mismatch_info,other_mismatch_info
0,COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,ENSP00000422898.2,ENST00000512836.6,ENSG00000172752.16,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"GCA_018503285.1,GCA_018472595.1,GCA_009914755.4","2188Q>R,2205G>D,2526W>L,2527->L,2527->P,2527->...",91,2432.0,LINLGGENIR(2188Q>R),2188Q>R,"2527->K,2527->M,2527->G,2527->P,2527->H,2527->..."
1,COL6A5,NQYPPPMLEDACRLINLGGENIQNDGFQFVTELQEDFLGGNGFIGQ...,ENSP00000424968.1,ENST00000512482.1,ENSG00000172752.16,NQYPPPMLEDACRLINLGGENIRNDGFQFVTELQEDFLGGNGFIGQ...,"GCA_018469425.1,GCA_018504625.1,GCA_018503525....","23Q>R,424A>T",2,439.0,LINLGGENIR(23Q>R),23Q>R,424A>T
2,COL6A5,NQYPPPMLEDACRLINLGGENIQNDGFQFVTELQEDFLGGNGFIGQ...,ENSP00000424968.1,ENST00000512482.1,ENSG00000172752.16,NQYPPPMLEDACRLINLGGENIRNDGFQFVTELQEDFLGDNGFIGQ...,"GCA_018506975.1,GCA_018469965.1,GCA_018469935....","23Q>R,40G>D",2,439.0,LINLGGENIR(23Q>R),23Q>R,40G>D
3,COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,ENSP00000362250.5,ENST00000373157.9,ENSG00000172752.16,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"GCA_018469935.1,GCA_018469665.1","2188Q>R,2205G>D,2608G>K,2610D>M,2611->E,2611->...",9,2598.0,LINLGGENIR(2188Q>R),2188Q>R,"2614S>I,2608G>K,2613S>D,2611->E,2205G>D,2610D>..."
4,COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,ENSP00000309762.7,ENST00000312481.11,ENSG00000172752.16,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"GCA_018469935.1,GCA_018469665.1","2188Q>R,2205G>D",2,2611.0,LINLGGENIR(2188Q>R),2188Q>R,2205G>D
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17506,TMPRSS11E,MMYRPDVVRARKRVCWEPWVIGLVIFISLIVLAVCIGLTVHYVRYN...,ENSP00000481425.1,ENST00000621758.3,ENSG00000274058.3,MMYRPDVVRARKRVCWEPWVIGLVIFISLIVLAVCIGLTVHYVRYN...,GCA_018504655.1,"303Y>C,349N>D",2,419.0,QAQVTLIDATTCNEPQAYDDAITPR(349N>D),349N>D,303Y>C
17507,TMPRSS11E,MMYRPDVVRARKRVCWEPWVIGLVIFISLIVLAVCIGLTVHYVRYN...,ENSP00000307519.4,ENST00000305363.9,ENSG00000087128.10,MMYRPDVVRARKRVCWEPWVIGLVIFISLIVLAVCIGLTVHYVRYN...,GCA_018504655.1,"158H>Q,303Y>C,349N>D",3,417.0,QAQVTLIDATTCNEPQAYDDAITPR(349N>D),349N>D,"158H>Q,303Y>C"
17514,GOLGA4,MFKKLKQKISEEQQQLQQALAPAQASSNSSTPTRMRSRTSSFTEQL...,ENSP00000403009.1,ENST00000429018.5,ENSG00000144674.19,MFKKLKQKISEEQQQLQQALAPAQASSNSSTPTRMRSRTSSFTEQL...,GCA_018467165.1,120Q>H,1,583.0,DLHQEAETYR(120Q>H),120Q>H,
17516,MYO1A,MPLLEGSVGVEDLVLLEPLVEESLLKNLQLRYENKEIYTYIGNVVI...,"ENSP00000393392.2,ENSP00000300119.3","ENST00000442789.6,ENST00000300119.8",ENSG00000166866.13,MPLLEGSVGVEDLVLLEPLVEESLLKNLQLRYENKEIYTYIGNVVI...,"GCA_018852605.1,GCA_018852605.1,GCA_021950905....",662G>E,1,1041.0,VLEELSMSSGELAFGK(662G>E),662G>E,


In [23]:
peptide_blast_res = peptide_blast_res[['GCA_protein_seq', 'sample_ids', 'list of peptides belong to this protein', 'gene_symbol', 'matching_canonical_protein_seq', 'all_mismatch_info', 'number_mismatches', 'identity score', 'peptides_mismatch_info', 'other_mismatch_info', 'canonical_protein_ID', 'canonical_transcript_ID','gene_ID']]

In [24]:
peptide_blast_res

Unnamed: 0,GCA_protein_seq,sample_ids,list of peptides belong to this protein,gene_symbol,matching_canonical_protein_seq,all_mismatch_info,number_mismatches,identity score,peptides_mismatch_info,other_mismatch_info,canonical_protein_ID,canonical_transcript_ID,gene_ID
0,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"GCA_018503285.1,GCA_018472595.1,GCA_009914755.4",LINLGGENIR(2188Q>R),COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"2188Q>R,2205G>D,2526W>L,2527->L,2527->P,2527->...",91,2432.0,2188Q>R,"2527->K,2527->M,2527->G,2527->P,2527->H,2527->...",ENSP00000422898.2,ENST00000512836.6,ENSG00000172752.16
1,NQYPPPMLEDACRLINLGGENIRNDGFQFVTELQEDFLGGNGFIGQ...,"GCA_018469425.1,GCA_018504625.1,GCA_018503525....",LINLGGENIR(23Q>R),COL6A5,NQYPPPMLEDACRLINLGGENIQNDGFQFVTELQEDFLGGNGFIGQ...,"23Q>R,424A>T",2,439.0,23Q>R,424A>T,ENSP00000424968.1,ENST00000512482.1,ENSG00000172752.16
2,NQYPPPMLEDACRLINLGGENIRNDGFQFVTELQEDFLGDNGFIGQ...,"GCA_018506975.1,GCA_018469965.1,GCA_018469935....",LINLGGENIR(23Q>R),COL6A5,NQYPPPMLEDACRLINLGGENIQNDGFQFVTELQEDFLGGNGFIGQ...,"23Q>R,40G>D",2,439.0,23Q>R,40G>D,ENSP00000424968.1,ENST00000512482.1,ENSG00000172752.16
3,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"GCA_018469935.1,GCA_018469665.1",LINLGGENIR(2188Q>R),COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"2188Q>R,2205G>D,2608G>K,2610D>M,2611->E,2611->...",9,2598.0,2188Q>R,"2614S>I,2608G>K,2613S>D,2611->E,2205G>D,2610D>...",ENSP00000362250.5,ENST00000373157.9,ENSG00000172752.16
4,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"GCA_018469935.1,GCA_018469665.1",LINLGGENIR(2188Q>R),COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"2188Q>R,2205G>D",2,2611.0,2188Q>R,2205G>D,ENSP00000309762.7,ENST00000312481.11,ENSG00000172752.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17506,MMYRPDVVRARKRVCWEPWVIGLVIFISLIVLAVCIGLTVHYVRYN...,GCA_018504655.1,QAQVTLIDATTCNEPQAYDDAITPR(349N>D),TMPRSS11E,MMYRPDVVRARKRVCWEPWVIGLVIFISLIVLAVCIGLTVHYVRYN...,"303Y>C,349N>D",2,419.0,349N>D,303Y>C,ENSP00000481425.1,ENST00000621758.3,ENSG00000274058.3
17507,MMYRPDVVRARKRVCWEPWVIGLVIFISLIVLAVCIGLTVHYVRYN...,GCA_018504655.1,QAQVTLIDATTCNEPQAYDDAITPR(349N>D),TMPRSS11E,MMYRPDVVRARKRVCWEPWVIGLVIFISLIVLAVCIGLTVHYVRYN...,"158H>Q,303Y>C,349N>D",3,417.0,349N>D,"158H>Q,303Y>C",ENSP00000307519.4,ENST00000305363.9,ENSG00000087128.10
17514,MFKKLKQKISEEQQQLQQALAPAQASSNSSTPTRMRSRTSSFTEQL...,GCA_018467165.1,DLHQEAETYR(120Q>H),GOLGA4,MFKKLKQKISEEQQQLQQALAPAQASSNSSTPTRMRSRTSSFTEQL...,120Q>H,1,583.0,120Q>H,,ENSP00000403009.1,ENST00000429018.5,ENSG00000144674.19
17516,MPLLEGSVGVEDLVLLEPLVEESLLKNLQLRYENKEIYTYIGNVVI...,"GCA_018852605.1,GCA_018852605.1,GCA_021950905....",VLEELSMSSGELAFGK(662G>E),MYO1A,MPLLEGSVGVEDLVLLEPLVEESLLKNLQLRYENKEIYTYIGNVVI...,662G>E,1,1041.0,662G>E,,"ENSP00000393392.2,ENSP00000300119.3","ENST00000442789.6,ENST00000300119.8",ENSG00000166866.13


In [25]:
peptide_blast_res.to_csv("blast_canonical-count-tables/1-2mismatches_peptides_match_info/GCA_canonical_prot_matching_info.tsv", header=1, sep="\t", index = None)

In [26]:
peptide_blast_res = pd.read_table("blast_canonical-count-tables/1-2mismatches_peptides_match_info/GCA_canonical_prot_matching_info.tsv")

In [27]:
peptide_blast_res[peptide_blast_res['number_mismatches']>10]

Unnamed: 0,GCA_protein_seq,sample_ids,list of peptides belong to this protein,gene_symbol,matching_canonical_protein_seq,all_mismatch_info,number_mismatches,identity score,peptides_mismatch_info,other_mismatch_info,canonical_protein_ID,canonical_transcript_ID,gene_ID
0,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"GCA_018503285.1,GCA_018472595.1,GCA_009914755.4",LINLGGENIR(2188Q>R),COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"2188Q>R,2205G>D,2526W>L,2527->L,2527->P,2527->...",91,2432.0,2188Q>R,"2527->K,2527->M,2527->G,2527->P,2527->H,2527->...",ENSP00000422898.2,ENST00000512836.6,ENSG00000172752.16
55,MSVSRTMDSCELDLVYVTERIIAVSFPSTANEENFRSNLREVAQML...,GCA_018467015.1,TMDSCELDLVYVTER(8E>-),TNS1,MSVSRTMEDSCELDLVYVTERIIAVSFPSTANEENFRSNLREVAQM...,"8E>-,528T>I,877A>-,878A>-,879Y>-,880N>-,882R>-...",33,1698.0,8E>-,"897L>-,889S>-,890G>-,887G>-,886I>-,901Q>-,883L...",ENSP00000516092.1,ENST00000705226.1,ENSG00000079308.21
56,MSVSRTMDSCELDLVYVTERIIAVSFPSTANEENFRSNLREVAQML...,"GCA_018469935.1,GCA_018504045.1",TMDSCELDLVYVTER(8E>-),TNS1,MSVSRTMEDSCELDLVYVTERIIAVSFPSTANEENFRSNLREVAQM...,"8E>-,528T>I,877A>-,878A>-,879Y>-,880N>-,882R>-...",33,1698.0,8E>-,"897L>-,889S>-,890G>-,887G>-,886I>-,901Q>-,883L...",ENSP00000516092.1,ENST00000705226.1,ENSG00000079308.21
61,MVAEVDSMPAASSVKKPFVLRSKMGKWCCHCFPCCRGSGKSNVGTS...,GCA_018470465.1,CPQALFQPCFLGMESCGIHETTFNSIMK(959E>Q),POTEE,MVVEVDSMPAASSVKKPFGLRSKMGKWCCRCFPCYRESGKSNVGTS...,"3V>A,18F>-,19G>-,20L>-,21R>-,22S>-,23K>-,24M>-...",61,953.0,959E>Q,"405L>P,28C>-,18F>-,51D>-,47G>-,29C>-,20L>-,49H...","ENSP00000439189.1,ENSP00000507284.1","ENST00000683005.1,ENST00000356920.9",ENSG00000188219.16
62,MVAEVDSMPAASSVKKPFVLRSKMGKWCCHCFPCCRGSGKSNVGTS...,"GCA_018504075.1,GCA_018473305.1,GCA_018504625....",CPQALFQPCFLGMESCGIHETTFNSIMK(959E>Q);CPQALFQPC...,POTEJ,MVAEVDSMPAASSVKKPFVLRSKMGKWCRHCFPCCRGSGKSNVGTS...,"29R>C,49Q>H,53T>A,397S>T,605R>K,613M>T,700C>R,...",11,1016.0,"922E>Q,959E>Q","605R>K,892M>T,802M>V,914S>G,397S>T,29R>C,49Q>H...",ENSP00000387176.1,ENST00000409602.2,ENSG00000222038.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12561,MSQGVRRAGAGQGVAAAVQLLVTLSFLRSVVEAQVTGVLDDCLCDI...,"GCA_018506975.1,GCA_018505855.1,GCA_018472685....",ELEVCEQANK(10D>V);ELEVCEQANK(129D>V),ERO1B,MANNTKELEDCEQANKLGAINSTLSNQSKEAFIDWARYDDSRDHFC...,"2->S,2->Q,2->G,2->V,2->R,2->R,3->G,3->A,3->G,3...",119,229.0,"129D>V,10D>V","8->V,3->C,9->C,8->N,2->R,9->K,3->A,13->T,12->C...",ENSP00000510551.1,ENST00000687487.1,ENSG00000086619.15
12569,MAQWEMLQNLDSPFQDQLHQLYSHSLLPVDIRQYLAVWIEDQNWQE...,GCA_018471065.1,IEEIMPNGDPLLAGHNTVDEVYVSR(782Q>H);IEEIMPNGDPLL...,STAT2,MAQWEMLQNLDSPFQDQLHQLYSHSLLPVDIRQYLAVWIEDQNWQE...,"405->L,405->V,405->E,405->Q,405->R,405->S,405-...",45,761.0,"826Q>H,782Q>H","405->V,405->H,405->F,405->I,405->T,405->P,405-...",ENSP00000513598.1,ENST00000698186.1,ENSG00000170581.15
12586,MSTTVNVDSLAEYEKSQIKRALELGTVMTVFSFRKSTPERRTVQVI...,GCA_018469705.1,DSSVFILGNTDRPDASAVYLR(244H>R),PLCG2,MSTTVNVDSLAEYEKSQIKRALELGTVMTVFSFRKSTPERRTVQVI...,"244H>R,398->S,398->F,398->P,398->V,398->I,398-...",845,-438.0,244H>R,"404->R,400->E,398->H,401->E,406->K,398->C,407-...","ENSP00000455533.2,ENSP00000513338.1","ENST00000697562.1,ENST00000563193.2",ENSG00000197943.12
12587,MSTTVNVDSLAEYEKSQIKRALELGTVMTVFSFRKSTPERRTVQVI...,GCA_018469705.1,DSSVFILGNTDRPDASAVYLR(177H>R);DSSVFILGNTDRPDAS...,PLCG2,MEIKEIRPGKNSKDFERAKAVRQKEDCCFTILYGTQFVLSTLSLAA...,"2->S,2->T,2->T,2->V,2->N,2->V,2->D,2->S,2->L,2...",68,1129.0,"244H>R,177H>R","8->V,8->L,7->E,6->V,6->L,7->M,5->A,8->W,2->T,2...","ENSP00000513350.1,ENSP00000513349.1,ENSP000005...","ENST00000697586.1,ENST00000697587.1,ENST000006...",ENSG00000197943.12


In [83]:
peptide_blast_res[peptide_blast_res['number_mismatches']<=10]['number_mismatches'].value_counts()

1     6289
2     2051
3      847
4      438
5      229
6      161
7      103
8       65
9       53
10      46
Name: number_mismatches, dtype: int64

In [30]:
peptide_blast_res['other_number_mismatches'] = peptide_blast_res['other_mismatch_info'].apply(lambda x: len(x.split(",")) if pd.notna(x) else 0)  

In [31]:
peptide_blast_res[peptide_blast_res['other_number_mismatches']>10]

Unnamed: 0,GCA_protein_seq,sample_ids,list of peptides belong to this protein,gene_symbol,matching_canonical_protein_seq,all_mismatch_info,number_mismatches,identity score,peptides_mismatch_info,other_mismatch_info,canonical_protein_ID,canonical_transcript_ID,gene_ID,other_number_mismatches
0,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"GCA_018503285.1,GCA_018472595.1,GCA_009914755.4",LINLGGENIR(2188Q>R),COL6A5,MKILLIIFVLIIWTETLADQSPGPGPVYADVVFLVDSSDHLGPKSF...,"2188Q>R,2205G>D,2526W>L,2527->L,2527->P,2527->...",91,2432.0,2188Q>R,"2527->K,2527->M,2527->G,2527->P,2527->H,2527->...",ENSP00000422898.2,ENST00000512836.6,ENSG00000172752.16,19
55,MSVSRTMDSCELDLVYVTERIIAVSFPSTANEENFRSNLREVAQML...,GCA_018467015.1,TMDSCELDLVYVTER(8E>-),TNS1,MSVSRTMEDSCELDLVYVTERIIAVSFPSTANEENFRSNLREVAQM...,"8E>-,528T>I,877A>-,878A>-,879Y>-,880N>-,882R>-...",33,1698.0,8E>-,"897L>-,889S>-,890G>-,887G>-,886I>-,901Q>-,883L...",ENSP00000516092.1,ENST00000705226.1,ENSG00000079308.21,32
56,MSVSRTMDSCELDLVYVTERIIAVSFPSTANEENFRSNLREVAQML...,"GCA_018469935.1,GCA_018504045.1",TMDSCELDLVYVTER(8E>-),TNS1,MSVSRTMEDSCELDLVYVTERIIAVSFPSTANEENFRSNLREVAQM...,"8E>-,528T>I,877A>-,878A>-,879Y>-,880N>-,882R>-...",33,1698.0,8E>-,"897L>-,889S>-,890G>-,887G>-,886I>-,901Q>-,883L...",ENSP00000516092.1,ENST00000705226.1,ENSG00000079308.21,32
61,MVAEVDSMPAASSVKKPFVLRSKMGKWCCHCFPCCRGSGKSNVGTS...,GCA_018470465.1,CPQALFQPCFLGMESCGIHETTFNSIMK(959E>Q),POTEE,MVVEVDSMPAASSVKKPFGLRSKMGKWCCRCFPCYRESGKSNVGTS...,"3V>A,18F>-,19G>-,20L>-,21R>-,22S>-,23K>-,24M>-...",61,953.0,959E>Q,"405L>P,28C>-,18F>-,51D>-,47G>-,29C>-,20L>-,49H...","ENSP00000439189.1,ENSP00000507284.1","ENST00000683005.1,ENST00000356920.9",ENSG00000188219.16,60
63,MVAEVDSMPAASSVKKPFVLRSKMGKWCCHCFPCCRGSGKSNVGTS...,GCA_018504375.1,CPQALFQPCFLGMESCGIHETTFNSIMK(959E>Q);CPQALFQPC...,POTEI,MVAEVDSMPAASSVKKPFVLRSKMGKWCRHCFPCCRGSGKSNVGTS...,"18F>-,19V>-,20L>-,21R>-,22S>-,23K>-,24M>-,25G>...",43,989.0,"922E>Q,959E>Q","28C>-,18F>-,51D>-,47G>-,20L>-,839M>V,44G>-,48D...",ENSP00000392718.2,ENST00000451531.7,ENSG00000196834.14,42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12561,MSQGVRRAGAGQGVAAAVQLLVTLSFLRSVVEAQVTGVLDDCLCDI...,"GCA_018506975.1,GCA_018505855.1,GCA_018472685....",ELEVCEQANK(10D>V);ELEVCEQANK(129D>V),ERO1B,MANNTKELEDCEQANKLGAINSTLSNQSKEAFIDWARYDDSRDHFC...,"2->S,2->Q,2->G,2->V,2->R,2->R,3->G,3->A,3->G,3...",119,229.0,"129D>V,10D>V","8->V,3->C,9->C,8->N,2->R,9->K,3->A,13->T,12->C...",ENSP00000510551.1,ENST00000687487.1,ENSG00000086619.15,64
12569,MAQWEMLQNLDSPFQDQLHQLYSHSLLPVDIRQYLAVWIEDQNWQE...,GCA_018471065.1,IEEIMPNGDPLLAGHNTVDEVYVSR(782Q>H);IEEIMPNGDPLL...,STAT2,MAQWEMLQNLDSPFQDQLHQLYSHSLLPVDIRQYLAVWIEDQNWQE...,"405->L,405->V,405->E,405->Q,405->R,405->S,405-...",45,761.0,"826Q>H,782Q>H","405->V,405->H,405->F,405->I,405->T,405->P,405-...",ENSP00000513598.1,ENST00000698186.1,ENSG00000170581.15,15
12586,MSTTVNVDSLAEYEKSQIKRALELGTVMTVFSFRKSTPERRTVQVI...,GCA_018469705.1,DSSVFILGNTDRPDASAVYLR(244H>R),PLCG2,MSTTVNVDSLAEYEKSQIKRALELGTVMTVFSFRKSTPERRTVQVI...,"244H>R,398->S,398->F,398->P,398->V,398->I,398-...",845,-438.0,244H>R,"404->R,400->E,398->H,401->E,406->K,398->C,407-...","ENSP00000455533.2,ENSP00000513338.1","ENST00000697562.1,ENST00000563193.2",ENSG00000197943.12,137
12587,MSTTVNVDSLAEYEKSQIKRALELGTVMTVFSFRKSTPERRTVQVI...,GCA_018469705.1,DSSVFILGNTDRPDASAVYLR(177H>R);DSSVFILGNTDRPDAS...,PLCG2,MEIKEIRPGKNSKDFERAKAVRQKEDCCFTILYGTQFVLSTLSLAA...,"2->S,2->T,2->T,2->V,2->N,2->V,2->D,2->S,2->L,2...",68,1129.0,"244H>R,177H>R","8->V,8->L,7->E,6->V,6->L,7->M,5->A,8->W,2->T,2...","ENSP00000513350.1,ENSP00000513349.1,ENSP000005...","ENST00000697586.1,ENST00000697587.1,ENST000006...",ENSG00000197943.12,45


In [84]:
peptide_blast_res[peptide_blast_res['other_number_mismatches']<=10]['other_number_mismatches'].value_counts()

0     6506
1     1971
2      820
3      404
4      192
5      151
6      120
7       62
8       61
10      48
9       48
Name: other_number_mismatches, dtype: int64


#### Protein + gene_symbol + peptide

In [46]:
protein_gene_symbol_peptide = dict()
for key, value in blast_infos.items():
    pep_seq = key.split("+")[0]
    gene_symbol = key.split("+")[1]
    
    for info in value:
        prot_seq = info.split(";")[0]
        
        if protein_gene_symbol_peptide.get(prot_seq):
            if protein_gene_symbol_peptide.get(prot_seq).get(gene_symbol):
                protein_gene_symbol_peptide[prot_seq][gene_symbol].add(pep_seq)
            else:
                protein_gene_symbol_peptide[prot_seq][gene_symbol] = {pep_seq}
        else:
            protein_gene_symbol_peptide[prot_seq] = {gene_symbol:{pep_seq}} 

In [47]:
df = pd.DataFrame([(protein, gene_symbol, ",".join(peptide)) for protein, inner_dict in protein_gene_symbol_peptide.items() for gene_symbol, peptide in inner_dict.items()], 
                  columns=['protein', 'gene_symbol', 'peptide'])
df.to_csv("blast_canonical-count-tables/1-2mismatches_peptides/protein_gene_symbol_peptide.tsv", header=1, sep="\t", index = None)

In [48]:
df

Unnamed: 0,protein,gene_symbol,peptide
0,MDGASAEQDGLQEDRSHSGPSSLPEAPLKPPGPLVPPDQQDKVQCA...,GOLGA3,"LQAEANDLQIR,EHLVQKLQAEANDLQIR,TEDSNAGNSGGNVLAP..."
1,SSQPATKTRLFSTLDPELMLNPENLPRASTLAMTKEYSFLRTSVPR...,GOLGA3,"LQAEANDLQIR,EHLVQKLQAEANDLQIR,TEDSNAGNSGGNVLAP..."
2,SSQPATKTRLFSTLDPELMLNPENLPRASTLAMTKEYSFLRTSVPR...,GOLGA3,"LQAEANDLQIR,EHLVQKLQAEANDLQIR,TEDSNAGNSGGNVLAP..."
3,SSQPATKTRLFSTLDPELMLNPENLPRASTLAMTKEYSFLRTSVPR...,GOLGA3,"LQAEANDLQIR,EHLVQKLQAEANDLQIR,TEDSNAGNSGGNVLAP..."
4,MDGASAEQDGLQEDRSHSGPSSLPEAPLKPPGPLVPPDQQDKVQCA...,GOLGA3,"LQAEANDLQIR,EHLVQKLQAEANDLQIR,TEDSNAGNSGGNVLAP..."
...,...,...,...
8078,MRLTPRALCSAAQAAWRENFPLCGRDVARWFPGHMAKGLKKMQSSL...,MTG1,VLTGTGNVNVIQPNYPAAAR
8079,MRLTPRALCSAAQAAWRENFPLCGRDVARWFPGHMAKGLKKMQSSL...,MTG1,VLTGTGNVNVIQPNYPAAAR
8080,MSEKVDWLQSQNGVCKVDVYSPGDNQAQDWKMDTSTDPVRVLSWLR...,AKAP3,AQLGNESSVDEVSFYANR
8081,MSEKVDWLQSQNGVCKVDVYSPGDNQAQDWKMDTSTDPVRVLSWLR...,AKAP3,AQLGNESSVDEVSFYANR


#### Gene_symbol + peptide + protein

In [49]:
gene_symbol_peptide_protein = dict()
for key, value in blast_infos.items():
    prot_set = set()
    
    pep_seq = key.split("+")[0]
    gene_symbol = key.split("+")[1]
    
    for info in value:
        prot_set.add(info.split(";")[0])

    if gene_symbol_peptide_protein.get(gene_symbol):
        gene_symbol_peptide_protein[gene_symbol][pep_seq] = prot_set
    else:
        gene_symbol_peptide_protein[gene_symbol] = {pep_seq:prot_set}

In [50]:
gene_symbol_peptide = dict()
gene_symbol_protein = dict()
for gene_symbol,value in gene_symbol_peptide_protein.items():
    for peptide,protein in value.items():
        if gene_symbol_peptide.get(gene_symbol):
            gene_symbol_peptide[gene_symbol].add(peptide)
        else:
            gene_symbol_peptide[gene_symbol] = {peptide}
        
        if gene_symbol_protein.get(gene_symbol):
            gene_symbol_protein[gene_symbol].union(protein)
        else:
            gene_symbol_protein[gene_symbol] = protein
            
gene_symbol_peptide_str = {key: ','.join(value) for key, value in gene_symbol_peptide.items()}
gene_symbol_protein_str = {key: ','.join(value) for key, value in gene_symbol_protein.items()}

In [51]:
df = pd.DataFrame({'gene_symbol': list(gene_symbol_peptide.keys()),
                   'peptide': list(gene_symbol_peptide_str.values()),
                   'protein': list(gene_symbol_protein_str.values())})
df.to_csv("blast_canonical-count-tables/1-2mismatches_peptides/gene_symbol_peptide_protein.tsv", header=1, sep="\t", index = None)

In [52]:
df

Unnamed: 0,gene_symbol,peptide,protein
0,GOLGA3,"LQAEANDLQIR,EHLVQKLQAEANDLQIR,TEDSNAGNSGGNVLAP...",SSQPATKTRLFSTLDPELMLNPENLPRASTLAMTKEYSFLRTSVPR...
1,EPB41L3,"METKTESSGIETEPTVHHLPLSTEKVVQETVLVQER,TESSGIETE...",MTTESGSDSESKPDQEAEPQEAAGAQGRAGAPVPEPPKEEQQQALE...
2,FNDC1,"APEQQPPPPVATSQHHPGPQSR,SQQSVSAEDEEEEDAGFFK",XPRHVKLLSTKMGLKVTWDPPKDATSRPVEHYNIAYGKSLKSLKYI...
3,IGKV1-27,"ASQGISNFLAWYQQKPGK,LLISAASTLQSGVPSR,ASQGISNFLA...",MDMRVPAQLLGLLLLWLPDTRCDIQMTQSPSSLSASVGDRVTITCR...
4,UGT2B11,LYKWISQNDLLGHPK,MTLKWTSVLLLIHLSCYFSSGSCGKVLVWAAEYSHWMNMKTILKEL...
...,...,...,...
2380,SLC18A1,NNCLQGTGFLEEETTR,MLRTILDAPQRLLKEGRASRQLVLVVVFVALLLDNMLFTVVVPIVP...
2381,SCARF1,EAEESTGPDEAEAPESFPAAASPGDSATGHR,MGLGLLLPLLLLWTRGTQGSELDPKGQHVCVASSPSAELQCCAGWR...
2382,MTG1,VLTGTGNVNVIQPNYPAAAR,MRLTPRALCSAAQAAWRENFPLCGRDVARWFPGHMAKGLKKMQSSL...
2383,AKAP3,AQLGNESSVDEVSFYANR,MSEKVDWLQSQNGVCKVDVYSPGDNQAQDWKMDTSTDPVRVLSWLR...


#### All 1-2 mismatch_nums Peptides Info Count

In [53]:
all_peptide = set()
all_gene_symbol = set()
all_protein = set()
all_transcript = set()
for key, value in blast_infos.items(): 
    pep_seq = key.split("+")[0]
    all_peptide.add(pep_seq)
    
    gene_symbol = key.split("+")[1]
    all_gene_symbol.add(gene_symbol)
    
    for info in value:
        all_protein.add(info.split(";")[0])
        all_transcript.add([x for x in info.split(" ") if x.startswith('transcript:')][0].split(':')[-1])

In [54]:
print("Number of peptides: {}".format(len(all_peptide)))
print("Number of gene_symbols: {}".format(len(all_gene_symbol)))
print("Number of transcript ids: {}".format(len(all_transcript)))
print("Number of protein seqs: {}".format(len(all_protein)))

Number of peptides: 4766
Number of gene_symbols: 2385
Number of transcript ids: 10284
Number of protein seqs: 8078


### Novel Peptides

In [55]:
fail_peps = all_res_peps - all_peptide

In [56]:
len(fail_peps)

225

In [57]:
df_fail_peps = final_pep_gene_symbol[final_pep_gene_symbol["peptide"].isin(fail_peps)]
df_fail_peps

Unnamed: 0,peptide,gene_symbol
10,QQPAPAVPHQEQAPLLFPR,CDC42BPG
44,NQVVLTMTNMDPVDTVSYCCARI,IGHV2-70D
45,QAMQPLLGSGAELVGAR,HOXA13
49,FFTGVMEPQLMQTSEMGILTSTALLHR,FAM160B1
105,TSDYEQSETSKPALAQPASAKPVER,NEB
...,...,...
4801,SAVQGPPDRDLCGCYSVSSVLSGCAEPWNHGK,IGHA2
4849,LLVIDLMNDSSGNQTEK,SAGE1
4881,RENMDFSISISNITPADAGTYYCVK,SIRPA
4932,RPPGPASQSLWGNPTR,SKIV2L


In [58]:
df_fail_peps_no_gene_symbol = df_fail_peps[df_fail_peps["gene_symbol"].isna()]
df_fail_peps_no_gene_symbol

Unnamed: 0,peptide,gene_symbol
115,NDNIPEQDSLGLSNLQK,
186,VLIYGASSLQSGVPSR,
312,NQVVLTMTNMNPVDTATYYCAR,
342,LSCAVSGFTFSSYAMHWVR,
363,SSPVFQIPKNDNIPEQDSLGLSNLQK,
417,MSKDALNLVQMQEQTLQLEQQSK,
575,LSCAASGFTFSTYGMHWVR,
595,SKANGGTTDYAAPVK,
633,LEEECEGREPGLETGTQAADCK,
737,ASQSISTWLAWYQQKPGK,


In [59]:
len(set(df_fail_peps_no_gene_symbol["peptide"]))

46

In [60]:
auto = ahocorasick.Automaton()

for seq in fail_peps:
    auto.add_word(seq, seq)

auto.make_automaton()

In [61]:
peps_prots = {}
for prot_file in tqdm(glob('protein_db/GCA_peps/*.fa')):
    prot_seqs = set()
    prots_dict = {}
    parsed_file = SeqIO.parse(prot_file, 'fasta')
    sample_name = prot_file.split('/')[-1].split('-')[1]

    for record in parsed_file:
        try:
            prots_dict[str(record.seq)].append(sample_name + ' ' +  str(record.id) + ' ' +  str(record.seq) + ' ' + str(record.description))
        except KeyError:
            prots_dict[str(record.seq)] = [sample_name + ' ' + str(record.id) + ' ' +  str(record.seq) + ' ' + str(record.description)]

    #get non-canonical peptides that are found in canonical proteins from ensembl
    found_prots = set()

    for prot_seq in prots_dict.keys():
        for end_ind, found in auto.iter(prot_seq):
            found_prots.add(found)
            try:
                peps_prots[found].append(prots_dict[prot_seq])
            except KeyError:
                peps_prots[found] = [prots_dict[prot_seq]]

100%|██████████| 97/97 [02:55<00:00,  1.81s/it]


In [62]:
sample_peps= {}
gene_ID_peps = {}
protein_ID_peps = {}
protein_Seq_peps = {}
transcript_ID_peps = {}
gene_symbol_peps = {}
peps_infos = {}
res_infos = {}
for pep in tqdm(fail_peps):
    prots = [x.split(' ') for xs in peps_prots[pep] for x in xs]
    samples, proteins, proteins_seq, genes, gene_symbols, transcripts, gene_biotypes = set(), set(), set(), set(), set(), set(), set()

    for prot in prots:
        samples.add(prot[0])
        proteins.add(prot[1])
        proteins_seq.add(prot[2])
        genes.add([x for x in prot if x.startswith('gene:')][0].split(':')[-1])
        transcripts.add([x for x in prot if x.startswith('transcript:')][0].split(':')[-1])
        gene_biotypes.add([x for x in prot if x.startswith('gene_biotype:')][0].split(':')[-1])

        if res_infos.get(prot[2]):
            res_infos[prot[2]][0].add(pep)
            res_infos[prot[2]][1].add(prot[0])
        else:
            res_infos[prot[2]] = [{pep},{prot[0]},set()]

        try:
            gene_symbols.add([x for x in prot if x.startswith('gene_symbol:')][0].split(':')[-1])
            res_infos[prot[2]][2].add([x for x in prot if x.startswith('gene_symbol:')][0].split(':')[-1])
        except IndexError:
            pass

    peps_infos[pep] = [','.join(gene_symbols), ','.join(gene_biotypes), ','.join(samples), ','.join(proteins), ','.join(proteins_seq), ','.join(genes), ','.join(transcripts)]

    for sample in samples:
        try:
            sample_peps[sample].add(pep)
        except KeyError:
            sample_peps[sample] = set()
            sample_peps[sample].add(pep)

    for gene in genes:
        try:
            gene_ID_peps[gene].add(pep)
        except KeyError:
            gene_ID_peps[gene] = set()
            gene_ID_peps[gene].add(pep)

    for prot_id in proteins:
        try:
            protein_ID_peps[prot_id].add(pep)
        except KeyError:
            protein_ID_peps[prot_id] = set()
            protein_ID_peps[prot_id].add(pep)

    for prot_seq in proteins_seq:
        try:
            protein_Seq_peps[prot_seq].add(pep)
        except KeyError:
            protein_Seq_peps[prot_seq] = set()
            protein_Seq_peps[prot_seq].add(pep)

    for transcript in transcripts:
        try:
            transcript_ID_peps[transcript].add(pep)
        except KeyError:
            transcript_ID_peps[transcript] = set()
            transcript_ID_peps[transcript].add(pep)

    for gene_symbol in gene_symbols:
        try:
            gene_symbol_peps[gene_symbol].add(pep)
        except KeyError:
            gene_symbol_peps[gene_symbol] = set()
            gene_symbol_peps[gene_symbol].add(pep)

100%|██████████| 225/225 [00:00<00:00, 1423.36it/s]


#### peptide_gca_proteins.tsv

In [63]:
peptide_gca_proteins = pd.DataFrame.from_dict(res_infos, orient='index')
peptide_gca_proteins.columns = ["matching_peptides","sample names","gene_symbol of matching protein"]
peptide_gca_proteins.index.name = "matching_protein_sequence"

# peptide_gca_proteins["peptides_nums"] =  peptide_gca_proteins["matching_peptides"].apply(lambda x: len(x))
# peptide_gca_proteins["sample_nums"] =  peptide_gca_proteins["sample names"].apply(lambda x: len(x))
# peptide_gca_proteins["gene_symbol of matching protein_nums"] =  peptide_gca_proteins["gene_symbol of matching protein"].apply(lambda x: len(x))
peptide_gca_proteins["matching_peptides"] =  peptide_gca_proteins["matching_peptides"].apply(lambda x: ",".join(x))
peptide_gca_proteins["sample names"] =  peptide_gca_proteins["sample names"].apply(lambda x: ",".join(x))
peptide_gca_proteins["gene_symbol of matching protein"] =  peptide_gca_proteins["gene_symbol of matching protein"].apply(lambda x: ",".join(x))

In [64]:
peptide_gca_proteins

Unnamed: 0_level_0,matching_peptides,sample names,gene_symbol of matching protein
matching_protein_sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MKVEACTWAGYSGLYEAIESWDFKKIEKLEEYRLLLKRLQPEFKTRIIPTDIISDLSECLINQECEEILQICSTKGMMAGAEKLVECLLRSDKENWPKTLKLALEKERNKFSELWIVEKGIKDVETEDLEDKMETSDIQIFYQEDPECQNLSENSCPPSEVSDTNLYSPFKPRNYQLELALPAMKGKNTIICAPTGCGKTFVSLLICEHHLKKFPQGQKGKVVFFANQIPVYEQQKSVFSKYFERHGYRVTGISGATAENVPVEQIVENNDIIILTPQILVNNLKKGTIPSLSIFTLMIFDECHNTSKQHPYNMIMFNYLDQKLGGSSGPLPQVIGLTASVGVGDAKNTDEALDYICKLCASLDASVIATVKHNLEELEQVVYKPQKFFRKVESRISDKFKYIIAQLMRDTESLAKRICKDLENLSQIQNREFGTQKYEQWIVTVQKACMVFQMPDKDEESRICKALFLYTSHLRKYNDALIISEHARMKDALDYLKDFFSNVRAAGFEEIEQDLTQRFEEKLQELESVSRDPSNENPKLEDLCFILQEEYHLNPETITILFVKTRALVDALKNWIEGNPKLSFLKPGILTGRGKTNQNTGMTLPAQKCILDAFKASGDHNILIATSVADEGIDIAQCNLVILYEYVGNVIKMIQTRGRGRARGSKCFLLTSNAGVIEKEQINMYKEKMMNDSILRLQTWDEAVFREKILHIQTHEKFIRDSQEKPKPVPDKENKKLLCRKCKALACYTADVRVIEECHYTVLGDAFKECFVSRPHPKPKQFSSFEKRAKIFCARQNCSHDWGIHVKYKTFEIPVIKIESFVVEDIATGVQTLYSKWKDFHFEKIPFDPAEMSK,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58
MTTEQRRSLQAFQDYIRKTLDPTYILSYMAPWFREEEVQYIQAEKNNKGPMEAATLFLKFLLELQEEGWFRGFLDALDHAGYSGLYEAIESWDFKKIEKLEEYRLLLKRLQPEFKTRIIPTDIISDLSECLINQECEEILQICSTKGMMAGAEKLVECLLRSDKENWPKTLKLALEKERNKFSELWIVEKGIKDVETEDLEDKMETSDIQIFYQEDPECQNLSENSCPPSEVSDTNLYSPFKPRNYQLELALPAMKGKNTIICAPTGCGKTFVSLLICEHHLKKFPQGQKGKVVFFANQIPVYEQQKSVFSKYFERHGYRVTGISGATAENVPVEQIVENNDIIILTPQILVNNLKKGTIPSLSIFTLMIFDECHNTSKQHPYNMIMFNYLDQKLGGSSGPLPQVIGLTASVGVGDAKNTDEALDYICKLCASLDASVIATVKHNLEELEQVVYKPQKFFRKVESRISDKFKYIIAQLMRDTESLAKRICKDLENLSQIQNREFGTQKYEQWIVTVQKACMVFQMPDKDEESRICKALFLYTSHLRKYNDALIISEHARMKDALDYLKDFFSNVRAAGFEEIEQDLTQRFEEKLQELESVSRDPSNENPKLEDLCFILQEEYHLNPETITILFVKTRALVDALKNWIEGNPKLSFLKPGILTGRGKTNQNTGMTLPAQKCILDAFKASGDHNILIATSVADEGIDIAQCNLVILYEYVGNVIKMIQTRGRGRARGSKCFLLTSNAGVIEKEQINMYKEKMMNDSILRLQTWDEAVFREKILHIQTHEKFIRDSQEKPKPVPDKENKKLLCRKCKALACYTADVRVIEECHYTVLGDAFKECFVSRPHPKPKQFSSFEKRAKIFCARQNCSHDWGIHVKYKTFEIPVIKIESFVVEDIATGVQTLYSKWKDFHFEKIPFDPAEMSK,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58
MTTEQRRSLQAFQDYIRKTLDPTYILSYMAPWFREGYSGLYEAIESWDFKKIEKLEEYRLLLKRLQPEFKTRIIPTDIISDLSECLINQECEEILQICSTKGMMAGAEKLVECLLRSDKENWPKTLKLALEKERNKFSELWIVEKGIKDVETEDLEDKMETSDIQIFYQEDPECQNLSENSCPPSEVSDTNLYSPFKPRNYQLELALPAMKGKNTIICAPTGCGKTFVSLLICEHHLKKFPQGQKGKVVFFANQIPVYEQQKSVFSKYFERHGYRVTGISGATAENVPVEQIVENNDIIILTPQILVNNLKKGTIPSLSIFTLMIFDECHNTSKQHPYNMIMFNYLDQKLGGSSGPLPQVIGLTASVGVGDAKNTDEALDYICKLCASLDASVIATVKHNLEELEQVVYKPQKFFRKVESRISDKFKYIIAQLMRDTESLAKRICKDLENLSQIQNREFGTQKYEQWIVTVQKACMVFQMPDKDEESRICKALFLYTSHLRKYNDALIISEHARMKDALDYLKDFFSNVRAAGFEEIEQDLTQRFEEKLQELESVSRDPSNENPKLEDLCFILQEEYHLNPETITILFVKTRALVDALKNWIEGNPKLSFLKPGILTGRGKTNQNTGMTLPAQKCILDAFKASGDHNILIATSVADEGIDIAQCNLVILYEYVGNVIKMIQTRGRGRARGSKCFLLTSNAGVIEKEQINMYKEKMMNDSILRLQTWDEAVFREKILHIQTHEKFIRDSQEKPKPVPDKENKKLLCRKCKALACYTADVRVIEECHYTVLGDAFKECFVSRPHPKPKQFSSFEKRAKIFCARQNCSHDWGIHVKYKTFEIPVIKIESFVVEDIATGVQTLYSKWKDFHFEKIPFDPAEMSK,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58
MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSPIARGPSGVPLRGATVFPSLRTIPVVRASNPAHNGRVCSTWGSFHYKTFDGDVFRFPGLCNYVFSEHCGAAYEDFNIQLRRSQESAAPTLSRVLMKVDGVVIQLTKGSVLVNGHPVLLPFSQSGVLIQQSSSYTKVEARLGLVLMWNHDDSLLLELDTKYANKTCGLCGDFNGMPVVSELLSHNTKLTPMEFGNLQKMDDPTEQCQDPVPEPPRNCSTGFGICEELLHGQLFSGCVALVDVGSYLEACRQDLCFCEDTDLLSCVCHTLAEYSRQCTHAGGLPQDWRGPDFCPQKCPNNMQYHECRSPCADTCSNQEHSRACEDHCVAGCFCPEGTVLDDIGQTGCVPVSKCACVYNGAAYAPGATYSTDCTNCTCSGGRWSCQEVPCPGTCSVLGGAHFSTFDGKQYTVHGDCSYVLTKPCDSSAFTVLAELRRCGLTDSETCLKSVTLSLDGAQTVVVIKASGEVFLNQIYTQLPISAANVTIFRPSTFFIIAQTSLGLQLNLQLVPTMQLFMQLAPKLRGQTCGLCGNFNSIQADDFRTLSGVVEATAAAFFNTFKTQAACPNIRNSFEDPCSLSVENEKYAQHWCSQLTDADGPFGRCHAAVKPGTYYSNCMFDTCNCERSEDCLCAALSSYVHACAAKGVQLGGWRDGVCTKPMTTCPKSMTYHYHVSTCQPTCRSLSEGDITCSVGFIPVDGCICPKGTFLDDTGKCVQASNCPCYHRGSMIPNGESVHDSGAICTCTHGKLSCIGGQAPAPVCAAPMVFFDCRNATPGDTGAGCQKSCHTLDMTCYSPQCVPGCVCPDGLVADGEGGCITAEDCPCVHNEASYRAGQTIRVGCNTCTCDSRMWRCTDDPCLATCAVYGDGHYLTFDGQSYSFNGDCEYTLVQNHCGGKDSTQDSFRVVTENVPCGTTGTTCSKAIKIFLGGFELKLSHGKVEVIGTDESQEVPYTIRQMGIYLVVDTDIGLVLLWDKKTSIFINLSPEFKGRVCGLCGNFDDIAVNDFATRSRSVVGDVLEFGNSWKLSPSCPDALAPKDPCTANPFRKSWAQKQCSILHGPTFAACHAHVEPARYYEACVNDACACDSGGDCECFCTAVAAYAQACHEVGLCVSWRTPSICPLFCDYYNPEGQCEWHYQPCGVPCLRTCRNPRGDCLRDVRGLEGCYPKCPPEAPIFDEDKMQCVATCPTPPLPPRCHVHGKSYRPGAVVPSDKNCQSCLCTERGVECTYKAEACVCTYNGQRFHPGDVIYHTTDGTGGCISARCGANGTIERRVYPCSPTTPVPPTTFSFSTPPLVVSSTHTPSNGPSSAHTGPPSSAWPTTAGTSPRTRLPTASASLPPVCGEKCLWSPWMDVSRPGRGTDSGDFDTLENLRAHGYRVCESPRSVECRAEDAPGVPLRALGQRVQCSPDVGLTCRNREQASGLCYNYQIRVQCCTPLACSTSSSPAQTTPPTTSKTTETRASGSSAPSSTPGTVSLSTARTTPAPGTATSVKKTFSTPSPPPVPATSTSSMSTTAPGTSVVSSKPTPTEPSTSSCLQELCTWTEWIDGSYPAPGINGGDFDTFQNLRDEGYTFCESPRSVQCRAESFPNTPLADLGQDVICSHTEGLICLNKNQLPPICYNYEIRIQCCETVNVCRDITRPPKTVATTRPTPHPTGAQTQTTFTTHMPSASTEQPTATSRGGPTATSVTQGTHTTPVTRNCHPRCTWTKWFDVDFPSPGPHGGDKETYNNIIRSGEKICRRPEEITRLQCRAKSHPEVSIEHLGQVVQCSREEGLVCRNQDQQGPFKMCLNYEVRVLCCETPRGCHMTSTPGSTSSSPAQTTPSTTSKTTETRASGSSAPSSTPGTVSLSTARTTPAPGTATSVKKTFSTPSPPPVPATSTSSMSTTAPGTSVVSSKPTPTEPSTSSCLQELCTWTEWIDGSYPAPGINGGDFDTFQNLRDEGYTFCESPRSVQCRAESFPNTPLADLGQDVICSHTEGLICLNKNQLPPICYNYEIRIQCCETVNVCRDITRPPKTVATTRPTPHPTGAQTQTTFTTHMPSASTEQPTATSRGGPTATSVTQGTHTTPVTRNCHPRCTWTTWFDVDFPSPGPHGGDKETYNNIIRSGEKICRRPEEITRLQCRAKSHPEVSIEHLGQVVQCSREEGLVCRNQDQQGPFKMCLNYEVRVLCCETPKGCPVTSTPVTAPSTPSGRAISPTQSTSSWQKSRTTTLVTTSTTSTPQTSTTYAHTTSTTSAPTARTTSAPTTSTTSVPTTSTISGPKTTPSPVPTTSTTSAATTSTISAPTTSTTSVPGTTPSPVLTTSTTSAPTTRTTSASPASTTSGPGNTPSPVPTTSTISAPTTSITSAPTTSTTSAPTSSTTSGPGTTPSPVPTTSITSAPTTSTTSAPTTSTTSARTSSTTSATTTSRISGPETTPSPVPTTSTTSATTTSTTSAPTTSTTSAPTSSTTSSPQTSTTSAPTTSTTSGPGTTPSPVPTTSTTSAPTTRTTSAPKSSTTSAATTSTTSGPETTPRPVPTTSTTSSPTTSTTSAPTTSTTSASTTSTTSGAGTTPSPVPTTSTTSAPTTSTTSAPISSTTSATTTSTTSGPGTTPSPVPTTSTTSAPTTSTTSGPGTTPSAVPTTSITSAPTTSTNSAPISSTTSATTTSRISGPETTPSPVPTASTTSASTTSTTSGPGTTPSPVPTTSTISVPTTSTTSASTTSTTSASTTSTTSGPGTTPSPVPTTSTTSAPTTSTTSASTTSTISAPTTSTTSATTTSTTSAPTPRRTSAPTTSTISASTTSTTSATTTSTTSATTTSTISAPTTSTTLSPTTSTTSTTITSTTSAPISSTTSTPQTSTTSAPTTSTTSGPGTTSSPVPTTSTTSAPTTSTTSAPTTRTTSVPTSSTTSTATTSTTSGPGTTPSPVPTTSTTSAPTTRTTSAPTTSTTSAPTTSTTSAPTSSTTSATTTSTISVPTTSTTSVPGTTPSPVPTTSTISVPTTSTTSASTTSTTSGPGTTPSPVPTTSTTSAPTTSTTSAPTTSTISAPTTSTPSAPTTSTTLAPTTSTTSAPTTSTTSTPTSSTTSTPQTSTTSASTTSITSGPGTTPSPVPTTSTTSAPTTSTTSAATTSTISAPTTSTTSAPTTSTTSASTASKTSGLGTTPSPIPTTSTTSPPTTSTTSASTASKTSGPGTTPSPVPTTSTIFAPRTSTTSASTTSTTPGPGTTPSPVPTTSTASVSKTSTSHVSISKTTHSQPVTRDCHLRCTWTKWFDVDFPSPGPHGGDKETYNNIIRSGEKICRRPEEITRLQCRAESHPEVSIEHLGQVVQCSREEGLVCRNQDQQGPFKMCLNYEVRVLCCETPKGCPVTSTPVTAPSTPSGRATSPTQSTSSWQKSRTTTLVTTSTTSTPQTSTTSAPTTSTTSAPTTSTTSAPTTSTTSTPQTSISSAPTSSTTSAPTSSTISARTTSIISAPTTSTTSSPTTSTTSATTTSTTSAPTSSTTSTPQTSKTSAATSSTTSSSGTTPSPVTTTSTASVSKTSTSHVSVSKTTHSQPVTRDCHPRCTWTKWFDVDFPSPGPHGGDKETYNNIIRSGEKICRRPEEITRLQCRAKSHPEVSIEHLGQVVQCSREEGLVCRNQDQQGPFKMCLNYEVRVLCCETPKGCPVTSTSVTAPSTPSGRATSPTQSTSSWQKSRTTTLVTSSITSTTQTSTTSAPTTSTTPASIPSTTSAPTTSTTSAPTTSTTSAPTTSTTSTPQTTTSSAPTSSTTSAPTTSTISAPTTSTISAPTTSTTSAPTASTTSAPTSTSSAPTTNTTSAPTTSTTSAPITSTISAPTTSTTSTPQTSTISSPTTSTTSTPQTSTTSSPTTSTTSAPTTSTTSAPTTSTTSTPQTSISSAPTSSTTSAPTASTISAPTTSTTSFHTTSTTSPPTSSTSSTPQTSKTSAATSSTTSGSGTTPSPVPTTSTASVSKTSTSHVSVSKTTHSQPVTRDCHPRCTWTKWFDVDFPSPGPHGGDKETYNNIIRSGEKICRRPEEITRLQCRAESHPEVSIEHLGQVVQCSREEGLVCRNQDQQGPFKMCLNYEVRVLCCETPKGCPVTSTPVTAPSTPSGRATSPTQSTSSWQKSRTTTLVTTSTTSTPQTSTTSAPTTSTIPASTPSTTSAPTTSTTSAPTTSMTSAPTHRTTSGPTTSTTLAPTTSTTSAPTTSTNSAPTTSTISASTTSTISAPTTSTISSPTSSTTSTPQTSKTSAATSSTTSGSGTTPSPVPTTSTTSASTTSTTSAPTTSTTSGPGTTPSPVPSTSTTSAATTSTTSAPTTRTTSAPTSSMTSGPGTTPSPVPTTSTTSAPTTSTTSGPGTTPSPVPTTSTTSAPITSTTSGPGSTPSPVPTTSTTSAPTTSTTSASTASTTSGPGSTPSPVPTTSTTSAPTTRTTSASTASTTSGPGSTPSPVPTTSTTSAPTTRTTPASTASTTSGPGTTPSPVPTTSTTSASTTSTISPLTTSTTSAPITSMTSGPGTTPSPVPTTSTTSAPTTSTTSGPGTTPSPVPTTSTTSAPTTSTTSASTASTTSGPGTTPSPVPTTSTTSAPTTSTTSASTASTTSGPGTSLSPVPTTSTTSAPTTSTTSGPGTTPSPVPTTSTTSAPTTSTTSGPGTTPSPVPTTSTTPVSKTSTSHLSVSKTTHSQPVTSDCHPLCAWTKWFDVDFPSPGPHGGDKETYNNIIRSGEKICRRPEEITRLQCRAESHPEVNIEHLGQVVQCSREEGLVCRNQDQQGPFKMCLNYEVRVLCCETPRGCPVTSVTPYGTSPTNALYPSLSTSMVSASVASTSVASSSVASSSVAYSTQTCFCNVADRLYPAGSTIYRHRDLAGHCYYALCSQDCQVVRGVDSDCPSTTLPPAPATSPSISTSEPVTELGCPNAVPPRKKGETWATPNCSEATCEGNNVISLRPRTCPRVEKPTCANGYPAVKVADQDGCCHHYQCQCVCSGWGDPHYITFDGTYYTFLDNCTYVLVQQIVPVYGHFRVLVDNYFCGAEDGLSCPRSIILEYHQDRVVLTRKPVHGVMTNEIIFNNKVVSPGFRKNGIVVSRIGVKMYATIPELGVQVMFSGLIFSVEVPFSKFANNTEGQCGTCTNDRKDECRTPRGTVVASCSEMSGLWNVSIPDQPACHRPHPTPTTVGPTTVGSTTVGPTTVGSTTVGPTTPPAPCLPSPICQLILSKVFEPCHTVIPPLLFYEGCVFDRCHMTDLDVVCSSLELYAALCASHDICIDWRGRTGHMCPFTCPADKVYQPCGPSNPSYCYGNDSASLGALPEAGPITEGCFCPEGMTLFSTSAQVCVPTGCPRCLGPHGEPVKVGHTVGMDCQECTCEAATWTLTCRPKLCPLPPACPLPGFVPVPAAPQAGQCCPQYSCACNTSRCPAPVGCPEGARAIPTYQEGACCPVQNCSWTVCSINGTLYQPGAVVSSSLCETCRCELPGGPPSDAFVVSCETQICNTHCPVGFEYQEQSGQCCGTCVQVACVTNTSKSPAHLFYPGETWSDAGNHCVTHQCEKHQDGLVVVTTKKACPPLSCSLDEARMSKDGCCRFCPPPPPPYQNQSTCAVYHRSLIIQQQGCSSSEPVRLAYCRGNCGDSSSMYSLEGNTVEHRCQCCQELRTSLRNVTLHCTDGSSRAFSYTEVEECGCMGRRCPAPGDTQHSEEAEPEPSQEAESGSWERGVPVSPMH,TTSAPTTSTTSVPTTSTISGPK,"GCA_018504665.1,GCA_018469925.1,GCA_018466835....",MUC5AC
MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSPIARGPSGVPLRGATVFPSLRTIPVVRASNPAHNGRVCSTWGSFHYKTFDGDVFRFPSLCNYVFSEHCGAAYEDFNIQLRRSQESAAPTLSRVLMKVDGVVIQLTKGSVLVNGHPVLLPFSQSGVLIQQSSSYTKVEARLGLVLMWNHDDSLLLELDTKYANKTCGLCGDFNGMPVVSELLSHNTKLTPMEFGNLQKMDDPTEQCQDPVPEPPRNCSTGFGICEELLHGQLFSGCVALVDVGSYLEACRQDLCFCEDTDLLSCVCHTLAEYSRQCTHAGGLPQDWRGPDFCPQKCPNNMQYHECRSPCADTCSNQEHSRACEDHCVAGCFCPEGTVLDDIGQTGCVPVSKCACVYNGAAYAPGATYSTDCTNCTCSGGRWSCQEVPCPGTCSVLGGAHFSTFDGKQYTVHGDCSYVLTKPCDSSAFTVLAELRRCGLTDSETCLKSVTLSLDGAQTVVVIKASGEVFLNQIYTQLPISAANVTIFRPSTFFIIAQTSLGLQLNLQLVPTMQLFMQLAPKLRGQTCGLCGNFNSIQADDFRTLSGVVEATAAAFFNTFKTQAACPNIRNSFEDPCSLSVENEKYAQHWCSQLTDADGPFGRCHAAVKPGTYYSNCMFDTCNCERSEDCLCAALSSYVHACAAKGVQLGGWRDGVCTKPMTTCPKSMTYHYHVSTCQPTCRSLSEGDITCSVGFIPVDGCICPKGTFLDDTGKCVQASNCPCYHRGSMIPNGESVHDSGAICTCTHGKLSCIGGQAPAPVCAAPMVFFDCRNATPGDTGAGCQKSCHTLDMTCYSPQCVPGCVCPDGLVADGEGGCITAEDCPCVHNEASYRAGQTIRVGCNTCTCDSRMWRCTDDPCLATCAVYGDGHYLTFDGQSYSFNGDCEYTLVQNHCGGKDSTQDSFRVVTENVPCGTTGTTCSKAIKIFLGGFELKLSHGKVEVIGTDESQEVPYTIRQMGIYLVVDTDIGLVLLWDKKTSIFINLSPEFKGRVCGLCGNFDDIAVNDFATRSRSVVGDVLEFGNSWKLSPSCPDALAPKDPCTANPFRKSWAQKQCSILHGPTFAACHAHVEPARYYEACVNDACACDSGGDCECFCTAVAAYAQACHEVGLCVSWRTPSICPLFCDYYNPEGQCEWHYQPCGVPCLRTCRNPRGDCLRDVRGLEGCYPKCPPEAPIFDEDKMQCVATCPTPPLPPRCHVHGKSYRPGAVVPSDKNCQSCLCTERGVECTYKAEACVCTYNGQRFHPGDVIYHTTDGTGGCISARCGANGTIERRVYPCSPTTPVPPTTFSFSTPPLVVSSTHTPSNGPSSAHTGPPSSAWPTTAGTSPRTRLPTASASLPPVCGEKCLWSPWMDVSRPGRGTDSGDFDTLENLRAHGYRVCESPRSVECRAEDAPGVPLRALGQRVQCSPDVGLTCRNREQASGLCYNYQIRVQCCTPLACSTSSSPAQTTPPTTSKTTETRASGSSAPSSTPGTVSLSTARTTPAPGTATSVKKTFSTPSPPPVPATSTSSMSTTAPGTSVVSSKPTPTEPSTSSCLQELCTWTEWIDGSYPAPGINGGDFDTFQNLRDEGYTFCESPRSVQCRAESFPNTPLADLGQDVICSHTEGLICLNKNQLPPICYNYEIRIQCCETVNVCRDITRPPKTVATTRPTPHPTGAQTQTTFTTHMPSASTEQPTATSRGGPTATSVTQGTHTTPVTRNCHPRCTWTKWFDVDFPSPGPHGGDKETYNNIIRSGEKICRRPEEITRLQCRAKSHPEVSIEHLGQVVQCSREEGLVCRNQDQQGPFKMCLNYEVRVLCCETPRGCHMTSTPGSTSSSPAQTTPSTNSKTTETRASGSSAPSSTPGTVSLSTARTTPAPGTATSVKKTFSTPSPPPVPATSTSSMSTTAPGTSVVSSKPTPTEPSTSSCLQELCTWTEWIDGSYPAPGINGGDFDTFQNLRDEGYTFCESPRSVQCRAESFPNTPLADLGQDVICSHTEGLICLNKNQLPPICYNYEIRIQCCETVNVCRDITRPPKTVATTRPTPHPTGAQTQTTFTTHMPSASTEQPTATSRGGPTATSVTQGTHTTPVTRNCHPRCTWTTWFDVDFPSPGPHGGDKETYNNIIRSGEKICRRPEEITRLQCRAKSHPEVSIEHLGQVVQCSREEGLVCRNQDQQGPFKMCLNYEVRVLCCETPKGCPVTSTPVTAPSTPSGRAISPTQSTSSWQKSRTTTLVTTSTTSTPQTSTTYAHTTSTTSAPTARTTSAPTTSTTSVPTTSTISGPKTTPSPVPTTSTTSAATTSTISAPTTSTTSVPGTTPSPVLTTSTTSAPTTRTTSASPASTTSGPGNTPSPVPTTSTISAPTTSITSAPTTSTTSAPTSSTTSGPGTTPSPVPTTSITSAPTTSTTSAPTTSTTSARTSSTTSATTTSRISGPETTPSPVPTTSTTSATTTSTTSAPTTSTTSAPTSSTTSSPQTSTTSAPTTSTTSGPGTTPSPVPTTSTTSAPTTRTTSAPKSSTTSAATTSTTSGPETTPRPVPTTSTTSSPTTSTTSAPTTSTTSASTTSTTSGAGTTPSPVPTTSTTSAPTTSTTSAPISSTTSATTTSTTSGPGTTPSPVPTTSTTSAPTTSTTSGPGTTPSAVPTTSITSAPTTSTNSAPISSTTSATTTSRISGPETTPSPVPTASTTSASTTSTTSGPGTTPSPVPTTSTISVPTTSTTSASTTSTTSASTTSTTSGPGTTPSPVPTTSTTSAPTTSTTSASTTSTISAPTTSTTSATTTSTTSAPTPRRTSAPTTSTISASTTSTTSATTTSTTSATTTSTISAPTTSTTLSPTTSTTSTTITSTTSAPISSTTSTPQTSTTSAPTTSTTSGPGTTSSPVPTTSTTSAPTTSTTSAPTTRTTSVPTSSTTSTATTSTTSGPGTTPSPVPTTSTTSAPTTRTTSAPTTSTTSAPTTSTTSAPTSSTTSATTTSTISVPTTSTTSVPGTTPSPVPTTSTISVPTTSTTSASTTSTTSGPGTTPSPVPTTSTTSAPTTSTTSAPTTSTISAPTTSTPSAPTTSTTLAPTTSTTSAPTTSTTSTPTSSTTSTPQTSTTSASTTSITSGPGTTPSPVPTTSTTSAPTTSTTSAATTSTISAPTTSTTSAPTTSTTSASTASKTSGLGTTPSPIPTTSTTSPPTTSTTSASTASKTSGPGTTPSPVPTTSTIFAPRTSTTSASTTSTTPGPGTTPSPVPTTSTASVSKTSTSHVSISKTTHSQPVTRDCHLRCTWTKWFDVDFPSPGPHGGDKETYNNIIRSGEKICRRPEEITRLQCRAESHPEVSIEHLGQVVQCSREEGLVCRNQDQQGPFKMCLNYEVRVLCCETPKGCPVTSTPVTAPSTPSGRATSPTQSTSSWQKSRTTTLVTTSTTSTPQTSTTSAPTTSTTSAPTTSTTSAPTTSTTSTPQTSISSAPTSSTTSAPTSSTISARTTSIISAPTTSTTSSPTTSTTSATTTSTTSAPTSSTTSTPQTSKTSAATSSTTSSSGTTPSPVTTTSTASVSKTSTSHVSVSKTTHSQPVTRDCHPRCTWTKWFDVDFPSPGPHGGDKETYNNIIRSGEKICRRPEEITRLQCRAKSHPEVSIEHLGQVVQCSREEGLVCRNQDQQGPFKMCLNYEVRVLCCETPKGCPVTSTSVTAPSTPSGRATSPTQSTSSWQKSRTTTLVTSSITSTTQTSTTSAPTTSTTPASIPSTTSAPTTSTTSAPTTSTTSAPTTSTTSTPQTTTSSAPTSSTTSAPTTSTISAPTTSTISAPTTSTTSAPTASTTSAPTSTSSAPTTNTTSAPTTSTTSAPITSTISAPTTSTTSTPQTSTISSPTTSTTSTPQTSTTSSPTTSTTSAPTTSTTSAPTTSTTSTPQTSISSAPTSSTTSAPTASTISAPTTSTTSFHTTSTTSPPTSSTSSTPQTSKTSAATSSTTSGSGTTPSPVPTTSTASVSKTSTSHVSVSKTTHSQPVTRDCHPRCTWTKWFDVDFPSPGPHGGDKETYNNIIRSGEKICRRPEEITRLQCRAESHPEVSIEHLGQVVQCSREEGLVCRNQDQQGPFKMCLNYEVRVLCCETPKGCPVTSTPVTAPSTPSGRATSPTQSTSSWQKSRTTTLVTTSTTSTPQTSTTSAPTTSTIPASTPSTTSAPTTSTTSAPTTSTTSAPTHRTTSGPTTSTTLAPTTSTTSAPTTSTNSAPTTSTISASTTSTISAPTTSTISSPTSSTTSTPQTSKTSAATSSTTSGSGTTPSPVPTTSTTSASTTSTTSAPTTSTTSGPGTTPSPVPSTSTTSAATTSTTSAPTTRTTSAPTSSMTSGPGTTPSPVPTTSTTSAPTTSTTSGPGTTPSPVPTTSTTSAPITSTTSGPGSTPSPVPTTSTTSAPTTSTTSASTASTTSGPGTTPSPVPTTSTTSAPTTRTTSASTASTTSGPGSTPSPVPTTSTTSAPTTRTTPASTASTTSGPGTTPSPVPTTSTTSASTTSTISPLTTSTTSAPITSMTSGPGTTPSPVPTTSTTSAPTTSTTSGPGTTPSPVPTTSTTSAPTTSTTSASTASTTSGPGTTPSPVPTTSTTSAPTTSTTSASTASTTSGPGTSLSPVPTTSTTSAPTTSTTSGPGTTPSPVPTTSTTSAPTTSTTSGPGTTPSPVPTTSTTPVSKTSTSHLSVSKTTHSQPVTSDCHPLCAWTKWFDVDFPSPGPHGGDKETYNNIIRSGEKICRRPEEITRLQCRAESHPEVNIEHLGQVVQCSREEGLVCRNQDQQGPFKMCLNYEVRVLCCETPRGCPVTSVTPYGTSPTNALYPSLSTSMVSASVASTSVASSSVASSSVAYSTQTCFCNVADRLYPAGSTIYRHRDLAGHCYYALCSQDCQVVRGVDSDCPSTTLPPAPATSPSISTSEPVTELGCPNAVPPRKKGETWATPNCSEATCEGNNVISLRPRTCPRVEKPTCANGYPAVKVADQDGCCHHYQCQCVCSGWGDPHYITFDGTYYTFLDNCTYVLVQQIVPVYGHFRVLVDNYFCGAEDGLSCPRSIILEYHQDRVVLTRKPVHGVMTNEIIFNNKVVSPGFRKNGIVVSRIGVKMYATIPELGVQVMFSGLIFSVEVPFSKFANNTEGQCGTCTNDRKDECRTPRGTVVASCSEMSGLWNVSIPDQPACHRPHPTPTTVGPTTVGSTTVGPTTVGSTTVGPTTPPAPCLPSPICQLILSKVFEPCHTVIPPLLFYEGCVFDRCHMTDLDVVCSSLELYAALCASHDICIDWRGRTGHMCPFTCPADKVYQPCGPSNPSYCYGNDSASLGALPEAGPITEGCFCPEGMTLFSTSAQVCVPTGCPRCLGPHGEPVKVGHTVGMDCQECTCEAATWTLTCRPKLCPLPPACPLPGFVPVPAAPQAGQCCPQYSCACNTSRCPAPVGCPEGARAIPTYQEGACCPVQNCSWTVCSINGTLYQPGAVVSSSLCETCRCELPGGPPSDAFVVSCETQICNTHCPVGFEYQEQSGQCCGTCVQVACVTNTSKSPAHLFYPGETWSDAGNHCVTHQCEKHQDGLVVVTTKKACPPLSCSLDEARMSKDGCCRFCPPPPPPYQNQSTCAVYHRSLIIQQQGCSSSEPVRLAYCRGNCGDSSSMYSLEGNTVEHRCQCCQELRTSLRNVTLHCTDGSSRAFSYTEVEECGCMGRRCPAPGDTQHSEEAEPEPSQEAESGSWERGVPVSPMH,TTSAPTTSTTSVPTTSTISGPK,GCA_018505855.1,MUC5AC
...,...,...,...
MESVEKDWGWGLWRNLRCLGHAHQGKCCTEGGISSTTKTSSENGSCCFEKAARKDHVCRFIGCGRNDRFNYVVMQLQGRNLADLRRSQSRGTFTISTTLRLGRQILESIESIHSVGFLHRDIKPSNFAMGRFPSTCRKCYMLDFGLARQFTNSCGDVRPPRAVAGFRGTVRYASINAHRNREMGRHDDLWSLFYMLVEFVVGQLPWRKIKDKEQVGSIKERYDHRLMLKHLPPEFSIFLDHISSLDYFTKPDYQMSIRVTRKSYKVSTSGPQAFNSLSYTSGPGACISSSSFSRMGSSSFRGGLGAGYGGASGGITTITVNQSLLSPLNLEVDPNIQAVRTQEEKQIKTLNNKFFSFIDKVRFLEQQNKMLETKWSLVQQQKMARSNMDNMFESYINNLKWQLETLGQEKLKLEAELGNMHGLVEDFRNKYEVEISKCTEMENEFVLIKEYVDEAYMNKMELESSLKEVTAKISFLRQLYEEEIGSCSPRSRIHLWCCSWTTAASWTWTASSLRSRPEGFPGGCHADTKQRGELAIKDANAKLSELEAALQLASQDMARQLREYQELMNVKLALDIKIATYRKLLEGEESWLESGMQNMSIHMKTTSSYAGGQSLAYGGLTSPGLSYGLGSSFGSGMGSSSFSHTSSSRAVVMKKIETRDGKLVSESSNVLPNLRITATLLTSVFDNSIKTFGVIESDPFDWEKTGNDGSLTTTTTSTTPQLHTRLTPAAIGIANATPIPGDLLRENTDEVFPDEQLSDGENGIPVGVSPDKLPGSLGHPRPQEKDVWEEMDANKNKIKLGICKAATEEENSHGQANGLLNAPSLGSPIRVRSEITQPDRDIPLVRKLRSIHSFELEKRLTLEPKPDTDKFLETCLEKMQKDTSAGKESILPALLHKPCVPAVSRTDHIWHYDEEYLPDASKPASANTPEQADGGGSNGFIAVNLSSCKQEIDSKEWVIVDKEQDLQDFRTNEAVGHKTTGSPSDEEPEVLQVLEASPQDEKLQLGPWAENDHLKKETSGVVLALSAEGPPTAASEQYTDRLELQPGAASQFIAATPTSLMEAQAEGPLTAITIPRPSVASTQSTSGSFHCGQQPEKKDLQPMEPTVELYSPRENFSGLVVTEGEPPSGGSRTDLGLQIDHIGHDMLPNIRESNKSQDLGPKELPDHNRLVVREFENLPGETEEKSILLESDNEDEKLSRGQHCIEISSLPGDLVIVEKDHSATTEPLDVTKTQTFSVVPNQDKNNEIMKLLTVGTSEISSRDIDPHVEGQIGQVAEMQKNKISKDDDIMSEDLPGHQGDLSTFLHQEGKREKITPRNGELFHCVSENEHGAPTRKDMVRSSFVTRHSRIPVLAQEIDSTLESSSPVSAKEKLLQKKAYQPDLVKLLVEKRQFKSFLGDLSSASDKLLEEKLATVPAPFCEEEVLTPFSRLTVDSHLSRSAEDSFLSPIISQSRKSKIPRPVSWVNTDQVNSSTSSQFFPRPPPGKPPTRPGVEARLRRYKVLGSSNSDSDLFSRLAQILQNGSQKPRSTTQCKSPGSPHNPKTPPKSPVVPRRSPSASPRSSSLPRTSSSSPSRAGRPHHDQRSSSPHLGRSKSPPSHSGSSSSRRSCQQEHCKPSKNGLKGSGSLHHHSASTKTPQGKSKPASKLSR,TLNNKFFSFIDKVR,GCA_018504045.1,TTBK2
MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQLRSRAASAAAAPHLLLLPPPPPAAPPPAGACSPLGPRSPPAATATAAASGGLGPAFPGTFCLPSPAPSLLCSLAQPPEAPFVYFKPAAGFFGAGGGGPEPGGAGTPPGAAAAPPSPPPTLLDEVELLDLESVAAWRDEDDYTWLYIGSSKTFTSSEKSLTPLQWCRHVLDNPTPEMEAARRSLCFRLEQGYTSRGSPLSPQSSIDSELSTSELEDDSISMGYKLQDLTDVQIMARLQEESLRQDYASTSASVSRHSSSVSLSSGKKGTCSDQEYDQYSLEDEEEFDHLPPPQPRLPRCSPFQRGIPHSQTFSSIRECRRSPSSQYFPSNNYQQQQYYSPQAQTPDQQPNRTNGDKLRRSMPNLARMPSTTAISSNISSPVTVRNSQSFDSSLHGAGNGISRIQSCIPSPGQLQHRVHSVGHFPVSIRQPLKATAYVSPTVQGSSNMPLSNGLQLYSNTGIPTPNKAAASGIMGRSALPRPSLAINGSNLPRSKIAQPVRSFLQPPKPLSSLSTLRDGNWRDGCY,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,"GCA_018503255.1,GCA_018472835.1,GCA_018852605....",SLAIN1
MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQLRSRAASAAAAPHLLLLPPPPPAAPPPAGACSPLGPRSPPAATATAAASGGLGLGLALGAGGGGGSGSGSGGGSSPAFPGTFCLPSPAPSLLCSLAQPPEAPFVYFKPAAGFFGAGGGGPEPGGAGTPPGAAAAPPSPPPTLLDEVELLDLESVAAWRDEDDYTWLYIGSSKTFTSSEKSLTPLQWCRHVLDNPTPEMEAARRSLCFRLEQGYTSRGSPLSPQSSIDSELSTSELEDDSISMGYKLQDLTDVQIMARLQEESLRQDYASTSASVSRHSSSVSLSSGKKGTCSDQEYDQYSLEDEEEFDHLPPPQPRLPRCSPFQRGIPHSQTFSSIRECRRSPSSQYFPSNNYQQQQYYSPQAQTPDQQPNRTNGDKLRRSMPNLARMPSTTAISSNISSPVTVRNSQSFDSSLHGAGNGISRIQSCIPSPGQLQHRVHSVGHFPVSIRQPLKATAYVSPTVQGSSNMPLSNGLQLYSNTGIPTPNKAAASGIMGRSALPRPSLAINGSNLPRSKIAQPVRSFLQPPKPLSSLSTLRDGNWRDGCY,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,"GCA_018503255.1,GCA_018472835.1,GCA_018852605....",SLAIN1
MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQLRSRAASAAAAPHLLLLPPPPPAAPPPAGACSPLGPRSPPAATATAAASGGLGPAFPGTFCLPSPAPSLLCSLAQPPEAPFVYFKPAAGFFGAGGGGPEPGGAGTPPGAAAAPPSPPPTLLDEVELLDLESVAAWRDEDDYTWLYIGSSKTFTSSEKSLTPLQWCRHVLDNPTPEMEAARRSLCFRLEQGYTSRGSPLSPQSSIDSELSTSELEDDSISMGYKLQDLTDVQIMARLQEESLRQDYASTSASVSRHSSSVSLSSGKKGTCSDQEYDQYSLEDEEEFDHLPPAQPRLPRCSPFQRGIPHSQTFSSIRECRRSPSSQYFPSNNYQQQQYYSPQAQTPDQQPNRTNGDKLRRSMPNLARMPSTTAISSNISSPVTVRNSQSFDSSLHGAGNGISRIQSCIPSPGQLQHRVHSVGHFPVSIRQPLKATAYVSPTVQGSSNMPLSNGLQLYSNTGIPTPNKAAASGIMGRSALPRPSLAINGSNLPRSKIAQPVRSFLQPPKPLSSLSTLRDGNWRDGCY,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,GCA_018471075.1,SLAIN1


In [65]:
peptide_gca_proteins.to_csv("blast_canonical-count-tables/novel_peptides_match_info/peptide_gca_proteins.tsv", header=1, sep="\t")

#### matching_gca_canoical_proteins.tsv

In [66]:
matching_gca_canoical_proteins = pd.read_table("blast_canonical-count-tables/novel_peptides_match_info/peptide_gca_proteins.tsv")

In [67]:
matching_gca_canoical_proteins

Unnamed: 0,matching_protein_sequence,matching_peptides,sample names,gene_symbol of matching protein
0,MKVEACTWAGYSGLYEAIESWDFKKIEKLEEYRLLLKRLQPEFKTR...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58
1,MTTEQRRSLQAFQDYIRKTLDPTYILSYMAPWFREEEVQYIQAEKN...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58
2,MTTEQRRSLQAFQDYIRKTLDPTYILSYMAPWFREGYSGLYEAIES...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58
3,MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSPIA...,TTSAPTTSTTSVPTTSTISGPK,"GCA_018504665.1,GCA_018469925.1,GCA_018466835....",MUC5AC
4,MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSPIA...,TTSAPTTSTTSVPTTSTISGPK,GCA_018505855.1,MUC5AC
...,...,...,...,...
1201,MESVEKDWGWGLWRNLRCLGHAHQGKCCTEGGISSTTKTSSENGSC...,TLNNKFFSFIDKVR,GCA_018504045.1,TTBK2
1202,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,"GCA_018503255.1,GCA_018472835.1,GCA_018852605....",SLAIN1
1203,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,"GCA_018503255.1,GCA_018472835.1,GCA_018852605....",SLAIN1
1204,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,GCA_018471075.1,SLAIN1


In [68]:
def get_blast_canonical_info(x):
    blast_res = {}
    non_canonical_seq = x["matching_protein_sequence"]

    if gene_symbol_proteins.get(x["gene_symbol of matching protein"]):
        canonical_seqs = gene_symbol_proteins[x["gene_symbol of matching protein"]]

        for canonical_seq in canonical_seqs:
            seq = canonical_seq.split(";")[0]
            alignments = pairwise2.align.globalms(sequenceA = seq,sequenceB = non_canonical_seq, match = 1,mismatch = -1,open = -2,extend = -1,one_alignment_only=True)
            alignment = alignments[0]
            score = alignment.score
            if blast_res:
                for value in blast_res.values():
                    if value.score > score:
                        continue
                    elif value == score:
                        blast_res[canonical_seq] = alignment
                    else:
                        blast_res = {canonical_seq:alignment}
            else:
                blast_res = {canonical_seq:alignment}

        final_res = []
        for key,value in blast_res.items():
            res = []
            i = 0
            j = 0
            for AA1, AA2 in zip(value.seqA, value.seqB):
                i += 1
                j += 1
                if AA1 == AA2:
                    continue
                if AA1 == '-' and AA2 != '-':
                    # insertion
                    res.append(str(i) + "-" + ">" + AA2)
                    i -= 1
                elif AA2 == '-' and AA1 != '-':
                    # deletion
                    res.append(str(i) + AA1 + ">" + "-")
                    j -= 1
                else:
                    # substitution
                    res.append(str(i) + AA1 + ">" + AA2)

            final_res.append({key:res})

        return final_res
    else:
        return ""

In [69]:
matching_gca_canoical_proteins["match_info"] = matching_gca_canoical_proteins.parallel_apply( get_blast_canonical_info,axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=242), Label(value='0 / 242'))), HB…

In [70]:
matching_gca_canoical_proteins

Unnamed: 0,matching_protein_sequence,matching_peptides,sample names,gene_symbol of matching protein,match_info
0,MKVEACTWAGYSGLYEAIESWDFKKIEKLEEYRLLLKRLQPEFKTR...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58,
1,MTTEQRRSLQAFQDYIRKTLDPTYILSYMAPWFREEEVQYIQAEKN...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58,
2,MTTEQRRSLQAFQDYIRKTLDPTYILSYMAPWFREGYSGLYEAIES...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58,
3,MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSPIA...,TTSAPTTSTTSVPTTSTISGPK,"GCA_018504665.1,GCA_018469925.1,GCA_018466835....",MUC5AC,[{'MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALS...
4,MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSPIA...,TTSAPTTSTTSVPTTSTISGPK,GCA_018505855.1,MUC5AC,[{'MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALS...
...,...,...,...,...,...
1201,MESVEKDWGWGLWRNLRCLGHAHQGKCCTEGGISSTTKTSSENGSC...,TLNNKFFSFIDKVR,GCA_018504045.1,TTBK2,[{'MSGGGEQLDILSVGILVKERWKVLRKIGGGGFGEIYDALDMLT...
1202,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,"GCA_018503255.1,GCA_018472835.1,GCA_018852605....",SLAIN1,[{'MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQN...
1203,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,"GCA_018503255.1,GCA_018472835.1,GCA_018852605....",SLAIN1,[{'MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQN...
1204,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,GCA_018471075.1,SLAIN1,[{'MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQN...


In [71]:
matching_gca_canoical_proteins = matching_gca_canoical_proteins.explode("match_info", ignore_index=True)

In [72]:
matching_gca_canoical_proteins

Unnamed: 0,matching_protein_sequence,matching_peptides,sample names,gene_symbol of matching protein,match_info
0,MKVEACTWAGYSGLYEAIESWDFKKIEKLEEYRLLLKRLQPEFKTR...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58,
1,MTTEQRRSLQAFQDYIRKTLDPTYILSYMAPWFREEEVQYIQAEKN...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58,
2,MTTEQRRSLQAFQDYIRKTLDPTYILSYMAPWFREGYSGLYEAIES...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58,
3,MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSPIA...,TTSAPTTSTTSVPTTSTISGPK,"GCA_018504665.1,GCA_018469925.1,GCA_018466835....",MUC5AC,{'MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSP...
4,MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSPIA...,TTSAPTTSTTSVPTTSTISGPK,GCA_018505855.1,MUC5AC,{'MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSP...
...,...,...,...,...,...
1201,MESVEKDWGWGLWRNLRCLGHAHQGKCCTEGGISSTTKTSSENGSC...,TLNNKFFSFIDKVR,GCA_018504045.1,TTBK2,{'MSGGGEQLDILSVGILVKERWKVLRKIGGGGFGEIYDALDMLTR...
1202,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,"GCA_018503255.1,GCA_018472835.1,GCA_018852605....",SLAIN1,{'MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNE...
1203,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,"GCA_018503255.1,GCA_018472835.1,GCA_018852605....",SLAIN1,{'MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNE...
1204,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,GCA_018471075.1,SLAIN1,{'MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNE...


In [73]:
matching_gca_canoical_proteins["canonical_protein_seq"] = matching_gca_canoical_proteins["match_info"].apply(lambda x: list(x.keys())[0].split(";")[0] if x and list(x.keys())[0] != "" else None)
matching_gca_canoical_proteins["number_mismatches"] = matching_gca_canoical_proteins["match_info"].apply(lambda x: len(list(x.values())[0]) if x and list(x.values())[0] != "" else None)
matching_gca_canoical_proteins["mismatch_info"] = matching_gca_canoical_proteins["match_info"].apply(lambda x: ",".join(list(x.values())[0]) if x and list(x.values())[0] != "" else None)
matching_gca_canoical_proteins["canonical_protein_ID"] = matching_gca_canoical_proteins["match_info"].apply(lambda x: list(x.keys())[0].split(";")[1].split(" ")[0] if x and list(x.keys())[0] != "" else None)
matching_gca_canoical_proteins["canonical_transcript_ID"] = matching_gca_canoical_proteins["match_info"].apply(lambda x: [x for x in list(x.keys())[0].split(" ") if x.startswith('transcript:')][0].split(':')[-1] if x and list(x.keys())[0] != "" else None)

In [74]:
matching_gca_canoical_proteins = matching_gca_canoical_proteins.drop("match_info", axis=1)

In [75]:
matching_gca_canoical_proteins

Unnamed: 0,matching_protein_sequence,matching_peptides,sample names,gene_symbol of matching protein,canonical_protein_seq,number_mismatches,mismatch_info,canonical_protein_ID,canonical_transcript_ID
0,MKVEACTWAGYSGLYEAIESWDFKKIEKLEEYRLLLKRLQPEFKTR...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58,,,,,
1,MTTEQRRSLQAFQDYIRKTLDPTYILSYMAPWFREEEVQYIQAEKN...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58,,,,,
2,MTTEQRRSLQAFQDYIRKTLDPTYILSYMAPWFREGYSGLYEAIES...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR","GCA_018469665.1,GCA_018506955.1,GCA_009914755....",DDX58,,,,,
3,MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSPIA...,TTSAPTTSTTSVPTTSTISGPK,"GCA_018504665.1,GCA_018469925.1,GCA_018466835....",MUC5AC,MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSPIA...,102.0,"246D>E,1480P>A,1683L>P,1738L>P,1875Q>R,2238T>I...",ENSP00000516835.1,ENST00000707307.1
4,MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSPIA...,TTSAPTTSTTSVPTTSTISGPK,GCA_018505855.1,MUC5AC,MSVGRRKLALLWALALALACTRHTGHAQDGSSESSYKHHPALSPIA...,102.0,"101G>S,246D>E,1480P>A,1683L>P,1738L>P,1868T>N,...",ENSP00000516835.1,ENST00000707307.1
...,...,...,...,...,...,...,...,...,...
1201,MESVEKDWGWGLWRNLRCLGHAHQGKCCTEGGISSTTKTSSENGSC...,TLNNKFFSFIDKVR,GCA_018504045.1,TTBK2,MSGGGEQLDILSVGILVKERWKVLRKIGGGGFGEIYDALDMLTREN...,483.0,"2S>-,3G>E,4G>S,5G>V,7Q>-,8L>K,10I>-,11L>W,12S>...",ENSP00000267890.6,ENST00000267890.11
1202,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,"GCA_018503255.1,GCA_018472835.1,GCA_018852605....",SLAIN1,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,3.0,"75L>A,76Q>C,77->S",ENSP00000419730.1,ENST00000466548.5
1203,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,"GCA_018503255.1,GCA_018472835.1,GCA_018852605....",SLAIN1,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,3.0,"75L>A,76Q>C,77->S",ENSP00000400921.2,ENST00000418532.6
1204,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,AASAAAAPHLLLLPPPPPAAPPPAGACSPLGPR,GCA_018471075.1,SLAIN1,MMAEQVKCASAGVSSGAGSGPVVNAELEVKKLQELVRKLEKQNEQL...,4.0,"75L>A,76Q>C,77->S,335P>A",ENSP00000419730.1,ENST00000466548.5


In [76]:
matching_gca_canoical_proteins.to_csv("blast_canonical-count-tables/novel_peptides_match_info/matching_gca_canoical_proteins.tsv", header=1, sep="\t",index = None)

#### Novel Peptides Info Count

In [77]:
print("Number of peptides: {}".format(len(fail_peps)))
print("Number of GCA samples matched: {}".format(len(sample_peps)))
print("Number of gene ids matched: {}".format(len(gene_ID_peps)))
print("Number of transcript ids matched: {}".format(len(transcript_ID_peps)))
print("Number of protein ids matched: {}".format(len(protein_ID_peps)))
print("Number of protein seqs matched: {}".format(len(protein_Seq_peps)))
print("Number of gene_symbols matched: {}".format(len(gene_symbol_peps)))

Number of peptides: 225
Number of GCA samples matched: 97
Number of gene ids matched: 4224
Number of transcript ids matched: 9304
Number of protein ids matched: 9304
Number of protein seqs matched: 1206
Number of gene_symbols matched: 133


#### Protein_with_two_peptides

In [78]:
protein_with_two_peptides = {}
for protein_seq, peptide_seqs in protein_Seq_peps.items():
    if len(peptide_seqs) > 1:
        protein_with_two_peptides[protein_seq] = ",".join(peptide_seqs)

In [79]:
len(protein_with_two_peptides)

175

In [80]:
df_protein_with_two_peptides = pd.DataFrame(list(protein_with_two_peptides.items()), columns=['protein_seq', 'peptides'])

In [81]:
df_protein_with_two_peptides.to_csv("blast_canonical-count-tables/novel_peptides_match_info/protein_with_two_peptides.tsv", header=1, sep="\t",index = None)

In [82]:
df_protein_with_two_peptides

Unnamed: 0,protein_seq,peptides
0,MKVEACTWAGYSGLYEAIESWDFKKIEKLEEYRLLLKRLQPEFKTR...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR"
1,MTTEQRRSLQAFQDYIRKTLDPTYILSYMAPWFREGYSGLYEAIES...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR"
2,MTTEQRRSLQAFQDYIRKTLDPTYILSYMAPWFREEEVQYIQAEKN...,"AAGFEEIEQDLTQRFEEK,AAGFEEIEQDLTQR"
3,MEPAGPAPGRLGPLLCLLLAASCAWSGVAGEEELQVIQPDKSVSVA...,"RENMDFSISISNITPADAGTYYCVK,ENMDFSISISNITPADAGTY..."
4,MEPAGPAPGRLGPLLCLLLAASCAWSGVAGEEELQVIQPDKSVSVA...,"RENMDFSISISNITPADAGTYYCVK,ENMDFSISISNITPADAGTY..."
...,...,...
170,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,"LPQQTADR,FLDKLPQQTADR"
171,XSTKGPSVFPLAPCSRSTSGGTAALGCLVKDYFPEPVTVSWNSGAL...,"CPGKPAQASPSSSSSSSAPELLGGPSVFLFPPKPK,PAQASPSSSS..."
172,XSTKGPSVFPLAPCSRSTSGGTAALGCLVKDYFPEPVTVSWNSGAL...,"CPGKPAQASPSSSSSSSAPELLGGPSVFLFPPKPK,PAQASPSSSS..."
173,ASTKGPSVFPLAPCSRSTSESTAALGCLVKDYFPEPVTVSWNSGAL...,"VDKTVAPPVAGPSVFLFPPKPK,TVAPPVAGPSVFLFPPKPK"


### Add info(format)

In [509]:
peps_merged_vars = pd.read_table("blast_canonical-count-tables/1-2mismatches_peptides_match_info/peps_merged_vars.tsv")
peps_merged_vars = peps_merged_vars.drop_duplicates()

In [510]:
pep_gene_variants_dict = dict()
for index,row in peps_merged_vars.iterrows():
    for gene in row["genes"].split(","):
        for pep in row["overlapping_peptides"].split(","):
            key = gene + "+" + pep
            if pep_gene_variants_dict.get(key):
                pep_gene_variants_dict[key] = pep_gene_variants_dict[key].update(set(row["variants"].split(",")))
            else:
                pep_gene_variants_dict[key] =  set(row["variants"].split(","))

In [511]:
df1 = pd.read_table("PXD010154/deeplc-tables/PXD010154_deeplc_filtered_95perc.csv.gz", sep = "," ,compression="gzip", usecols = ["sequence","usi"])
df2 = pd.read_table("PXD016999/deeplc-tables/PXD016999_deeplc_filtered_95perc.csv.gz", sep = "," ,compression="gzip", usecols = ["sequence","usi"])
df = pd.concat([df1,df2],axis = 0)
df = df.drop_duplicates()

peps_psms_dict = df["sequence"].value_counts()

In [512]:
genes_merged_pep_vars = pd.read_table("blast_canonical-count-tables/1-2mismatches_peptides_match_info/genes_merged_pep_vars.tsv")
genes_merged_pep_vars["all_vars_PSMs_Num"] = ""

In [513]:
for index,row in genes_merged_pep_vars.iterrows():
    vars_psms_dict = dict()
    vars_psms_set = set()
    vars = row["variants"].split(",")
    for peptide in row["peptides"].split(","):
        key = row["gene"] + "+" + peptide     
        for var in vars:
            if pep_gene_variants_dict.get(key):
                if var in pep_gene_variants_dict[key]:
                    if vars_psms_dict.get(var):
                        vars_psms_dict[var] = vars_psms_dict[var] + peps_psms_dict[peptide]
                    else:
                        vars_psms_dict[var] = peps_psms_dict[peptide]
            else:
                print("Not Fount key: " + key)
    for var,num in vars_psms_dict.items():
        vars_psms_set.add(var + " | " + str(num))
    genes_merged_pep_vars.at[index, "all_vars_PSMs_Num"] = vars_psms_set

In [514]:
peptide_gene_symbol_protein_matching_info = pd.read_table("blast_canonical-count-tables/1-2mismatches_peptides_match_info/peptide_gene_symbol_protein_matching_info.tsv")

peptide_gene_symbol_protein_matching_info = peptide_gene_symbol_protein_matching_info[peptide_gene_symbol_protein_matching_info["number_mismatches"]==1]
peptide_gene_symbol_protein_matching_info["mismatch_info_no_position"] = peptide_gene_symbol_protein_matching_info["mismatch_info"].apply(lambda x : re.sub(r'\d+', '', x))
peptide_gene_symbol_protein_matching_info["original_peptide"] = ""

In [515]:
def get_original_peptide(x):
    can_prot = x["canonical_protein_seq"]
    non_peptide = x["peptide"]
    mismatch_info = x["mismatch_info"]
    
    var_position = int(re.findall(r'\d+', mismatch_info)[0])

    AA_can = x["mismatch_info_no_position"][0]
    AA_non = x["mismatch_info_no_position"][2]
    can_peptide = ""
    if AA_can == "-":
        non_prot = can_prot[:var_position-1] + AA_non + can_prot[var_position-1:]
        start_position = non_prot.find(non_peptide)
        if start_position !=-1:
            can_peptide = can_prot[start_position:start_position+len(non_peptide)-1]
    elif AA_non == "-":
        non_prot = can_prot[:var_position-1] + can_prot[var_position:]
        start_position = non_prot.find(non_peptide)
        if start_position !=-1:
            can_peptide = can_prot[start_position:start_position+len(non_peptide)+1]
    else:
        non_prot = can_prot[:var_position-1] + AA_non + can_prot[var_position:]
        start_position = non_prot.find(non_peptide)
        if start_position !=-1:
            can_peptide = can_prot[start_position:start_position+len(non_peptide)]
    return can_peptide

In [516]:
peptide_gene_symbol_protein_matching_info["original_peptide"] = peptide_gene_symbol_protein_matching_info.apply(lambda x : get_original_peptide(x), axis = 1)

peptide_gene_symbol_protein_matching_info = peptide_gene_symbol_protein_matching_info[peptide_gene_symbol_protein_matching_info["original_peptide"]!=""]
peptide_gene_symbol_protein_matching_info = peptide_gene_symbol_protein_matching_info[["peptide","gene_symbol","mismatch_info_no_position","original_peptide"]]
peptide_gene_symbol_protein_matching_info = peptide_gene_symbol_protein_matching_info.drop_duplicates()

peptide_gene_symbol_protein_matching_info["key"] = peptide_gene_symbol_protein_matching_info["peptide"] + "+" + peptide_gene_symbol_protein_matching_info["gene_symbol"] + "+"+ peptide_gene_symbol_protein_matching_info["mismatch_info_no_position"]
pep_gene_var_original_peptide = peptide_gene_symbol_protein_matching_info.set_index('key')['original_peptide'].to_dict()

In [517]:
import pyarrow.parquet as pq
def get_all_seqs_from_parquet(parquet_file1, parquet_file2, parquet_file3):
    table1 = pq.read_table(parquet_file1, columns=["sequence"])
    parquet1 = table1.to_pandas()
    table2 = pq.read_table(parquet_file2, columns=["sequence"])
    parquet2 = table2.to_pandas()
    table3 = pq.read_table(parquet_file3, columns=["sequence"])
    parquet3 = table3.to_pandas()

    from_parquet = pd.concat([parquet1, parquet2, parquet3], axis=0, ignore_index=True)

    return set(from_parquet["sequence"])

In [518]:
seqs = get_all_seqs_from_parquet("PXD010154/sdrf_parquet/PXD010154-1de73bcb-ee3e-4d19-9d8d-da72f11d5fac.psm.parquet",
                                "PXD016999/sdrf_parquet/PXD016999-first-instrument-8b005cd8-d641-4d1e-a085-c92ed045b4da.psm.parquet",
                                "PXD016999/sdrf_parquet/PXD016999-second-instrument-2739df94-8ceb-4033-a8d3-91adba121f3f.psm.parquet")

In [519]:
for index,row in genes_merged_pep_vars.iterrows():
    var_set = set()
    for var in row["all_vars_PSMs_Num"]:
        var_no_PSM = var.split(" | ")[0]
        isInParquet = False
        
        for peptide in row["peptides"].split(","):
            key = peptide + "+" + row["gene"]+ "+" + var_no_PSM
            if pep_gene_var_original_peptide.get(key):
                original_peptide = pep_gene_var_original_peptide[key]
                if original_peptide in seqs:
                    isInParquet = True
        if isInParquet:
            var = var + " | " + "True"
            var_set.add(var)
        else:
            var = var + " | " + "False"
            var_set.add(var)
        
    genes_merged_pep_vars.at[index, "all_vars_PSMs_Num"] = var_set 

In [520]:
genes_merged_pep_vars

Unnamed: 0,gene,num_vars,num_peptides,num_tissues,num_samples,variants,peptides,tissues,samples,all_vars_PSMs_Num
0,COL6A1,1,3,1,20,R>H,"CPDYTCPITFSSPADITILLDGSASVGSHNFDTTKHFAKR,CPDYT...",pituitary hypophysis,"GCA_018504365.1,GCA_018469865.1,GCA_018504665....",{R>H | 6 | True}
1,IGHG2,3,5,51,63,"P>T,S>A,V>M","TPEVTCVVVDVSHEDPEVQFNWYVDGMEVHNAK,TPEVTCVVVDVS...","transverse colon,heart,brain,heart left ventri...","GCA_018505865.1,GCA_018504365.1,GCA_018469705....","{V>M | 2246 | True, S>A | 4 | True, P>T | 2 | ..."
2,SLC38A10,2,5,6,18,"I>-,A>G","DGVIGLNPLPDVQVNDLRGALDAQLRQAAGGALQVVHSR,SLEHSE...","salivary gland,adrenal gland,stomach,testis,pi...","GCA_018504365.1,GCA_009914755.4,GCA_018504055....","{A>G | 1 | False, I>- | 16 | True}"
3,HDGF,1,4,40,16,P>L,"EAATLEVERPLPMEVEKNSTLSEPGSGR,NSTLSEPGSGRGPPQEE...","transverse colon,colon,heart left ventricle,sk...","GCA_018505845.1,GCA_018504375.1,GCA_018469865....",{P>L | 75 | True}
4,SERPINA1,3,6,49,20,"E>V,D>N,E>D","LQHLVNELTHDIITK,GTEAAGAMFLEAIPMSIPPEVKFNKPFVFL...","transverse colon,heart,heart left ventricle,sk...","GCA_018472725.1,GCA_018471535.1,GCA_018506955....","{E>V | 145 | True, E>D | 1332 | True, D>N | 77..."
...,...,...,...,...,...,...,...,...,...,...
2362,DCTN2,1,1,8,1,P>A,LTAVLLAK,"sigmoid colon,aorta,heart atrial appendage,ske...",GCA_018467155.1,{P>A | 1 | True}
2363,ATP10B,1,1,13,1,N>K,EAEASLDK,"vagina,adrenal gland,sigmoid colon,cerebellum,...",GCA_018504655.1,{N>K | 2 | False}
2364,SERPINA5,1,1,22,31,K>E,SSEEELHR,"heart left ventricle,heart atrial appendage,sk...","GCA_018503265.1,GCA_018505855.1,GCA_018505825....",{K>E | 5 | False}
2365,COL10A1,1,1,2,49,M>T,YQTPTGIK,"lung,colon","GCA_018505865.1,GCA_018473315.1,GCA_018469675....",{M>T | 3 | True}


In [521]:
gene_vars_psm = []
for index,row in genes_merged_pep_vars.iterrows():
    for var in row["all_vars_PSMs_Num"]:
        var_no_PSM = var.split(" | ")[0]
        count = var.split(" | ")[1]
        gene_var_psm.append([row["gene"],var_no_PSM, count])

In [522]:
gene_vars_psm_df = pd.DataFrame(gene_var_psm, columns=["gene", "variants","psm_count"])
gene_vars_psm_df.to_csv("blast_canonical-count-tables/1-2mismatches_peptides_match_info/gene_vars_psm_count.tsv",sep = "\t", header = 1, index = None)

In [523]:
max_length = genes_merged_pep_vars['all_vars_PSMs_Num'].apply(len).max()
for i in range(max_length):
    genes_merged_pep_vars[f'var{i+1}_PSMs_Num'] = genes_merged_pep_vars['all_vars_PSMs_Num'].apply(lambda x: list(x)[i] if i < len(x) else None)
genes_merged_pep_vars["all_vars_PSMs_Num"] = genes_merged_pep_vars["all_vars_PSMs_Num"].apply(lambda x: ";".join(x))

In [524]:
genes_merged_pep_vars.to_csv("blast_canonical-count-tables/1-2mismatches_peptides_match_info/genes_merged_pep_vars_add_psms_num.tsv",sep = "\t", header = 1, index = None)