In [1]:
import pandas as pd
from glob import glob
import ahocorasick
from Bio import pairwise2,SeqIO
from tqdm import tqdm

In [2]:
final_pep_gene_symbol = pd.read_table("/mnt/nfs/wangd/project/paper_db/count/peps_gene_symbol.tsv")

In [3]:
final_pep_gene_symbol

Unnamed: 0,sequence_x,protein_accessions,peptidoform,gene_symbol
0,AAAAAAAAAPAAAATAATTAATTAATAAQ,GCA_109298,AAAAAAAAAPAAAATAATTAATTAATAAQ,SRP14
1,AAAAPAPPAGPR,GCA_161748,AAAAPAPPAGPR,PLK3
2,AAAGDLGGDHLAFSCDVAKEHDVQNTFEEMEK,GCA_60405,AAAGDLGGDHLAFSC[Carbamidomethyl]DVAKEHDVQNTFEEMEK,CBR4
3,AAAGDLGGDHLAFSCDVAKEHDVQNTFEEMEK,GCA_60405,AAAGDLGGDHLAFSC[Carbamidomethyl]DVAKEHDVQ[Deam...,CBR4
4,AAAGDLGGDHLAFSCDVAKEHDVQNTFEEMEK,GCA_60405,AAAGDLGGDHLAFSC[Carbamidomethyl]DVAKEHDVQ[Deam...,CBR4
...,...,...,...,...
6203,YWCNDGKNPGAVNACHLSCSALLQDNIADAVACAK,GCA_170013,YWC[Carbamidomethyl]NDGKN[Deamidated]PGAVNAC[C...,LYZ
6204,YWCNDGKNPGAVNACHLSCSALLQDNIADAVACAK,GCA_170013,YWC[Carbamidomethyl]NDGKN[Deamidated]PGAVN[Dea...,LYZ
6205,YWCNDGKNPGAVNACHLSCSALLQDNIADAVACAK,GCA_170013,YWC[Carbamidomethyl]NDGKN[Deamidated]PGAVNAC[C...,LYZ
6206,YWMHFCGGSLIHPQWVLTAAHCLGPDVK,"GCA_284206,GCA_284208",YWM[Oxidation]HFC[Carbamidomethyl]GGSLIHPQ[Dea...,TPSAB1


In [4]:
gene_symbol_proteins = {}
for record in SeqIO.parse("/mnt/nfs/wangd/project/paper_db/pep_files/save_tmp/Homo_sapiens-GRCh38_all-pep.fa", 'fasta'):
    gene_symbol_list = [x for x in record.description.split(" ") if x.startswith('gene_symbol:')]
    if gene_symbol_list:
        gene_symbol = gene_symbol_list[0].split(':')[-1]
    else:
        continue
    try:
        gene_symbol_proteins[gene_symbol].append(str(record.seq) + ';' + str(record.description))
    except KeyError:
        gene_symbol_proteins[gene_symbol] = [str(record.seq) + ';' + str(record.description)]

In [5]:
seq_gene_symbol_set = set()
seq_no_gene_symbol_set = set()
for index,row in final_pep_gene_symbol.iterrows():
    try:
        gene_symbols = row["gene_symbol"].split(",")
        for gene_symbol in gene_symbols:
            seq_gene_symbol_set.add(row["sequence_x"] + '+' + gene_symbol)
    except AttributeError:
        seq_no_gene_symbol_set.add(row["sequence_x"])
        
print("There are {} peptides with no corresponding gene symbol found.".format(len(seq_no_gene_symbol_set)))        
for seq in seq_no_gene_symbol_set:
    print(seq)

There are 25 peptides with no corresponding gene symbol found.
ASQSISTWLAWYQQKPGK
LEWMGWINTGNGNTK
SSPVFQIPKNDNIPEQDSLGLSNLQK
LSCAASGFTFSTYGMHWVR
KTPESFLGPNAALVDLDSLVSRPGPTLPGAK
ASGYTYTDYYMHWVR
SKANGGTTDYAAPVK
ASGYTFTNNGITWVR
TPESFLGPNAALVDLDSLVSRPGPTLPGAK
LEEECEGREPGLETGTQAADCKDAPLK
DLCPLLSEHGLQCSLEPHLNSNLCVYCCK
YKSDSDNQQGSGVPSR
LLIFGASTR
ESGPVLVKPTETLTLTCTISGFSLSNAR
TGEVVLTQSPGTLSLSPGER
NQVVLTMTNMNPVDTATYYCAR
SAGGGTYYADSVK
DTSISTAYMDLSR
LEEECEGREPGLETGTQAADCK
LSCAVSGFTFSSYAMHWVR
NDNIPEQDSLGLSNLQK
VLIYGASSLQSGVPSR
ANGGTTDYAAPVK
NTLYHQMNSLRAEDTAVYYCAR
AEDTAVYFCAK


In [6]:
blast_infos = dict()
gene_symbol_no_protein_set = set()
for seq_gene_symbol in tqdm(seq_gene_symbol_set):
    pep_seq = seq_gene_symbol.split("+")[0]
    gene_symbol = seq_gene_symbol.split("+")[1]
    if gene_symbol in gene_symbol_proteins.keys():
        for info in gene_symbol_proteins[gene_symbol]:
            prot_seq = info.split(";")[0]
            prot_info = info.split(";")[1]

            alignments_score = pairwise2.align.localms(sequenceA = prot_seq,sequenceB = pep_seq,match = 1,mismatch = -1,open = -1,extend = -1,score_only = True)
            if alignments_score >= len(pep_seq) - 4:
                try:
                    blast_infos[seq_gene_symbol].append(info)
                except KeyError:
                    blast_infos[seq_gene_symbol] = [info]
    else:
        gene_symbol_no_protein_set.add(gene_symbol)
print("There are {} gene_symbols with no corresponding canonical protein sequence found.".format(len(gene_symbol_no_protein_set)))        
for gene_symbol in gene_symbol_no_protein_set:
    print(gene_symbol)

100%|██████████| 3537/3537 [00:03<00:00, 888.23it/s] 

There are 14 gene_symbols with no corresponding canonical protein sequence found.
H2BS1
FAM207A
FAM160B2
DDX58
WDR92
CBWD2
apol1
srpk1a
Pwp2
SKIV2L
UHRF1BP1
EEF1AKNMT
FAM160B1
GBA





In [7]:
peptide_gene_symbol_protein = dict()
for key, value in blast_infos.items():
    prot_set = set()
    
    pep_seq = key.split("+")[0]
    gene_symbol = key.split("+")[1]
    
    for info in value:
        prot_set.add(info.split(";")[0])

    if peptide_gene_symbol_protein.get(pep_seq):
        peptide_gene_symbol_protein[pep_seq][gene_symbol] = prot_set
    else:
        peptide_gene_symbol_protein[pep_seq] = {gene_symbol:prot_set}

In [8]:
df = pd.DataFrame([(peptide, gene_symbol, ",".join(protein)) for peptide, inner_dict in peptide_gene_symbol_protein.items() for gene_symbol, protein in inner_dict.items()], 
                  columns=['peptide', 'gene_symbol', 'protein'])
df.to_csv("/mnt/nfs/wangd/project/paper_db/count/blast_canonical_count/peptide_gene_symbol_protein.tsv", header=1, sep="\t", index = None)

In [9]:
df

Unnamed: 0,peptide,gene_symbol,protein
0,ALASYVAACQAAGVVIK,FCGBP,MGALWSWWILWAGATLLWGLTQEASVDLKNTGREEFLTAFLQNYQL...
1,YILQGVTSWGLGCACPNKPGVYAR,LPA,MEHKEVVLLLLLFLKSAAPEQSHVVQDCYHGDGQSYRGTYSTTVTG...
2,TADGEFMAVGAIEPHFYELLIK,AMACR,MALQGISVVELSGLAPGPFCAMVLADFGARVVRVDRPGSRYDVSRL...
3,GIYNQEENVR,HLA-DRB5,MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...
4,IGTTEVEKPAGLLFQQPDLDSALQIAR,TACC2,MGNENSTSDNQRTLSAQTPRSAQPPGNSQNIKRKQQDTPGSPDHRD...
...,...,...,...
3450,YIYNRQEYAR,HLA-DPB1,MMVLQVSAAPRTVALTALLMVLLTSVVQGRATPENYLFQGRQECYA...
3451,LSEGPVPEGAGLK,AHNAK2,MCDCFHMVLPTWPGTPGSVSGRQLQPGEPGAETEDDHSVTEGPADE...
3452,NVVFIDMGHSAYQVSVCAFNKGK,HSPA4L,MKPILLFMERACISLGSRTRAIGNAAKSQIVTNVRNTIHGFKKLHG...
3453,SMIYKDILLLGNNYVIHR,HNF4G,MNTTDNGVNCLCAICGDRATGKHYGASSCDGCKGFFRRSIRKSHVY...


In [10]:
protein_gene_symbol_peptide = dict()
for key, value in blast_infos.items():
    pep_seq = key.split("+")[0]
    gene_symbol = key.split("+")[1]
    
    for info in value:
        prot_seq = info.split(";")[0]
        
        if protein_gene_symbol_peptide.get(prot_seq):
            if protein_gene_symbol_peptide.get(prot_seq).get(gene_symbol):
                protein_gene_symbol_peptide[prot_seq][gene_symbol].add(pep_seq)
            else:
                protein_gene_symbol_peptide[prot_seq][gene_symbol] = {pep_seq}
        else:
            protein_gene_symbol_peptide[prot_seq] = {gene_symbol:{pep_seq}} 

In [11]:
df = pd.DataFrame([(protein, gene_symbol, ",".join(peptide)) for protein, inner_dict in protein_gene_symbol_peptide.items() for gene_symbol, peptide in inner_dict.items()], 
                  columns=['protein', 'gene_symbol', 'peptide'])
df.to_csv("/mnt/nfs/wangd/project/paper_db/count/blast_canonical_count/protein_gene_symbol_peptide.tsv", header=1, sep="\t", index = None)

In [12]:
df

Unnamed: 0,protein,gene_symbol,peptide
0,MGALWSWWILWAGATLLWGLTQEASVDLKNTGREEFLTAFLQNYQL...,FCGBP,"ALASYVAACQAAGVVIKDWR,GCVLDVCMGGGDHDILCK,KFDFQG..."
1,MGALWSWWILWAGATLLWGLTQEASVDLKNTGREEFLTAFLQNYQL...,FCGBP,"ALASYVAACQAAGVVIKDWR,GCVLDVCMGGGDHDILCK,KFDFQG..."
2,MEHKEVVLLLLLFLKSAAPEQSHVVQDCYHGDGQSYRGTYSTTVTG...,LPA,"YILQGVTSWGLGCACPNKPGVYAR,DKYILQGVTSWGLGCACPNKP..."
3,MALQGISVVELSGLAPGPFCAMVLADFGARVVRVDRPGSRYDVSRL...,AMACR,"TADGEFMAVGAIEPHFYELLIK,SSLWEAPR"
4,MALQGISVVELSGLAPGPFCAMVLADFGARVVRVDRPGSRYDVSRL...,AMACR,"TADGEFMAVGAIEPHFYELLIK,SSLWEAPR"
...,...,...,...
6544,MGLLTILKKMKQKERELRLLMLGLDNAGKTTILKKFNGEDIDTISP...,ARL2,EALELDSIR
6545,MGKKLDLSKLTDEEAQHVLEVVQRDFDLRRKEEERLEALKGKIKKE...,MLPH,LQGGAGPEPISEER
6546,MNTTDNGVNCLCAICGDRATGKHYGASSCDGCKGFFRRSIRKSHVY...,HNF4G,SMIYKDILLLGNNYVIHR
6547,MMRVSEPILDMDMANYSEVLDPTYTTLEFETMQILYNSSDSSAPET...,HNF4G,SMIYKDILLLGNNYVIHR


In [13]:
gene_symbol_peptide_protein = dict()
for key, value in blast_infos.items():
    prot_set = set()
    
    pep_seq = key.split("+")[0]
    gene_symbol = key.split("+")[1]
    
    for info in value:
        prot_set.add(info.split(";")[0])

    if gene_symbol_peptide_protein.get(gene_symbol):
        gene_symbol_peptide_protein[gene_symbol][pep_seq] = prot_set
    else:
        gene_symbol_peptide_protein[gene_symbol] = {pep_seq:prot_set}

In [14]:
gene_symbol_peptide = dict()
gene_symbol_protein = dict()
for gene_symbol,value in gene_symbol_peptide_protein.items():
    for peptide,protein in value.items():
        if gene_symbol_peptide.get(gene_symbol):
            gene_symbol_peptide[gene_symbol].add(peptide)
        else:
            gene_symbol_peptide[gene_symbol] = {peptide}
        
        if gene_symbol_protein.get(gene_symbol):
            gene_symbol_protein[gene_symbol].union(protein)
        else:
            gene_symbol_protein[gene_symbol] = protein
            
gene_symbol_peptide_str = {key: ','.join(value) for key, value in gene_symbol_peptide.items()}
gene_symbol_protein_str = {key: ','.join(value) for key, value in gene_symbol_protein.items()}

In [16]:
df = pd.DataFrame({'gene_symbol': list(gene_symbol_peptide.keys()),
                   'peptide': list(gene_symbol_peptide_str.values()),
                   'protein': list(gene_symbol_protein_str.values())})
df.to_csv("/mnt/nfs/wangd/project/paper_db/count/blast_canonical_count/gene_symbol_peptide_protein.tsv", header=1, sep="\t", index = None)

In [17]:
df

Unnamed: 0,gene_symbol,peptide,protein
0,FCGBP,"ALASYVAACQAAGVVIKDWR,GCVLDVCMGGGDHDILCK,KFDFQG...",MGALWSWWILWAGATLLWGLTQEASVDLKNTGREEFLTAFLQNYQL...
1,LPA,"YILQGVTSWGLGCACPNKPGVYAR,DKYILQGVTSWGLGCACPNKP...",MEHKEVVLLLLLFLKSAAPEQSHVVQDCYHGDGQSYRGTYSTTVTG...
2,AMACR,"TADGEFMAVGAIEPHFYELLIK,SSLWEAPR",MALQGISVVELSGLAPGPFCAMVLADFGARVVRVDRPGSRYDVSRL...
3,HLA-DRB5,"AVTELGRPDAEYWNSQKDILEDER,PSAEYWNSQK,HNYGAVESFT...",MVCLKLPGGSYMAKLTVTLMVLSSPLALAGDTRPRFLQQDKYECHF...
4,TACC2,"IFEKPVLGALTTPGEK,AQQEQAAHQATLR,IGTTEVEKPAGLLFQ...",MGNENSTSDNQRTLSAQTPRSAQPPGNSQNIKRKQQDTPGSPDHRD...
...,...,...,...
1909,TRIM25,LQELTPSSGDPGEHDPASTHK,MAELCPLAEELSCSICLEPFKEPVTTPCGHNFCGSCLNETWAVQGS...
1910,WDR17,FGGGIGVPTKEER,MAWMTYISNWFEQDDWYEGLQRANMSQVRQVGLLAAGCQPWNKDVC...
1911,ZSCAN18,RPHPEDGDEQSLEGVSSSGDSAGLEAGQGPGADEPGLSR,DKVRPWVVAQYPESCKKAASLVEGLADVLEEPGMLLGSPAGSSSIL...
1912,ARL2,EALELDSIR,MGLLTILKKMKQKERELRLLMLGLDNAGKTTILKKFNGEDIDTISP...


In [18]:
all_peptide = set()
all_gene_symbol = set()
all_protein = set()
all_transcript = set()
for key, value in blast_infos.items(): 
    pep_seq = key.split("+")[0]
    all_peptide.add(pep_seq)
    
    gene_symbol = key.split("+")[1]
    all_gene_symbol.add(gene_symbol)
    
    for info in value:
        all_protein.add(info.split(";")[0])
        all_transcript.add([x for x in info.split(" ") if x.startswith('transcript:')][0].split(':')[-1])

In [19]:
print("Number of peptides: {}".format(len(all_peptide)))
print("Number of gene_symbols: {}".format(len(all_gene_symbol)))
print("Number of transcript ids: {}".format(len(all_transcript)))
print("Number of protein seqs: {}".format(len(all_protein)))

Number of peptides: 3413
Number of gene_symbols: 1914
Number of transcript ids: 8400
Number of protein seqs: 6546
