In [1]:
import os
import ftplib
import pandas as pd
from glob import glob
import ahocorasick
from Bio import SeqIO

In [2]:
# Working directory of the analysis: the relative paths used further down
# ('pep_files/*.fa', 'count/...', the peps_all_info_*.tsv outputs) resolve
# against it.
PROJECT_DIR = '/mnt/nfs/wangd/project/paper_db/'
os.chdir(PROJECT_DIR)

In [3]:
# Tab-separated peptide table; the first row is the header.
filtered_peps_input_file = '/mnt/nfs/wangd/project/10154parquet/res/gca_peptides_for_deeplc_95thperc_observations_ms2pip_by_filtered.tsv'

# Read inside a `with` block so the file handle is closed deterministically.
# Only the line terminator is removed: a bare strip() would also eat leading /
# trailing tabs, so a row with an empty first or last column would end up
# shorter than the header and any columns appended later would be shifted.
# Whitespace-only lines are skipped so they do not become bogus peptide rows.
with open(filtered_peps_input_file) as peps_in:
    filtered_peps = [line.rstrip('\r\n').split('\t') for line in peps_in if line.strip()]

header = filtered_peps[0]
peps = filtered_peps[1:]

print(header)
print(len(peps))

# Unique peptide sequences; the column index is resolved once, not per row.
seq_col = header.index('sequence_x')
pep_seqs = {row[seq_col] for row in peps}


['sequence_x', 'protein_accessions', 'charge', 'scan_number', 'peptidoform', 'exp_mass_to_charge', 'calc_mass_to_charge', 'seq', 'tr', 'modifications', 'reference_file_name', "'OpenMS:Target-decoy PSM q-value'", 'Posterior error probability', 'sample_id', 'preds_tr', 'error', 'abserror', 'error_percentile', 'PeptideAtlas_observations', 'GPMDB_observations', 'Uniprot_Extended_PE', 'delta_mass', 'usi', 'number_misscleavages', 'pearsonr_B', 'pearsonr_Y', 'dot_product', 'count_B', 'count_Y', 'corrected_pearsonr_B', 'corrected_pearsonr_Y', 'corrected_dot_product', 'total_ions', 'number_peaks', 'signal_to_noise', 'diff_highest_lowest']
8553


In [4]:
# Build an Aho-Corasick automaton over all peptide sequences. Every peptide is
# stored as both key and value, so a match hands back the peptide string itself.
auto = ahocorasick.Automaton()

for peptide in pep_seqs:
    auto.add_word(peptide, peptide)

# Convert the collected words into a searchable automaton before auto.iter() is used.
auto.make_automaton()

In [5]:
# Search each protein FASTA file and report the proteins that contain the peptides.
#
# peps_prots maps peptide -> list of "label lists". Each label list holds the
# 'sample id description' strings of all FASTA records in one file that share
# the same protein sequence.
peps_prots = {}

# sorted(): glob() returns files in arbitrary order; sorting makes the order of
# the entries appended to peps_prots (and of the progress output) reproducible.
for prot_file in sorted(glob('pep_files/*.fa')):
    # File names look like 'Homo_sapiens-GCA_009914755.4-2022_07-pep.fa';
    # the second '-'-separated field is used as the sample name.
    sample_name = os.path.basename(prot_file).split('-')[1]

    # Group record labels by identical protein sequence so every distinct
    # sequence is scanned only once per file.
    prots_dict = {}
    for record in SeqIO.parse(prot_file, 'fasta'):
        label = sample_name + ' ' + str(record.id) + ' ' + str(record.description)
        prots_dict.setdefault(str(record.seq), []).append(label)
    print('Total number of unique protein sequences = {} in the fasta file: {}'.format(len(prots_dict), prot_file))

    # get non-canonical peptides that are found in canonical proteins from ensembl
    for prot_seq, labels in prots_dict.items():
        # auto.iter yields (end_index, value) pairs; the value is the peptide.
        # NOTE(review): a peptide matching several times inside one sequence
        # appends the same label list once per match - confirm this is wanted
        # for the list-based (non-deduplicated) outputs.
        for _end_ind, found in auto.iter(prot_seq):
            peps_prots.setdefault(found, []).append(labels)

print('Number of peptides found in the protein files {} and number of peptides {}'.format(len(peps_prots), len(peps)))


Total number of unique protein sequences = 89491 in the fasta file: pep_files/Homo_sapiens-GCA_009914755.4-2022_07-pep.fa
Total number of unique protein sequences = 89193 in the fasta file: pep_files/Homo_sapiens-GCA_018466835.1-2022_07-pep.fa
Total number of unique protein sequences = 89067 in the fasta file: pep_files/Homo_sapiens-GCA_018466845.1-2022_07-pep.fa
Total number of unique protein sequences = 89141 in the fasta file: pep_files/Homo_sapiens-GCA_018466855.1-2022_07-pep.fa
Total number of unique protein sequences = 89177 in the fasta file: pep_files/Homo_sapiens-GCA_018466985.1-2022_07-pep.fa
Total number of unique protein sequences = 86464 in the fasta file: pep_files/Homo_sapiens-GCA_018467005.1-2022_07-pep.fa
Total number of unique protein sequences = 89069 in the fasta file: pep_files/Homo_sapiens-GCA_018467015.1-2022_07-pep.fa
Total number of unique protein sequences = 89126 in the fasta file: pep_files/Homo_sapiens-GCA_018467155.1-2022_07-pep.fa
Total number of unique p

Total number of unique protein sequences = 89147 in the fasta file: pep_files/Homo_sapiens-GCA_018504055.1-2022_07-pep.fa
Total number of unique protein sequences = 89142 in the fasta file: pep_files/Homo_sapiens-GCA_018504065.1-2022_07-pep.fa
Total number of unique protein sequences = 89059 in the fasta file: pep_files/Homo_sapiens-GCA_018504075.1-2022_07-pep.fa
Total number of unique protein sequences = 89108 in the fasta file: pep_files/Homo_sapiens-GCA_018504085.1-2022_07-pep.fa
Total number of unique protein sequences = 89128 in the fasta file: pep_files/Homo_sapiens-GCA_018504365.1-2022_07-pep.fa
Total number of unique protein sequences = 89088 in the fasta file: pep_files/Homo_sapiens-GCA_018504375.1-2022_07-pep.fa
Total number of unique protein sequences = 89167 in the fasta file: pep_files/Homo_sapiens-GCA_018504625.1-2022_07-pep.fa
Total number of unique protein sequences = 89145 in the fasta file: pep_files/Homo_sapiens-GCA_018504635.1-2022_07-pep.fa
Total number of unique p

#### 1. info_to_set

In [6]:
# Collapse the protein hits of every peptide into sets of unique annotations
# and build the reverse lookups (sample / gene / protein / transcript -> peptides).
pep_gene_info = {}
sample_pep_info = {}
gene_pep_info = {}
prot_pep_info = {}
transcripts_pep_info = {}

for pep in {row[0] for row in filtered_peps[1:]}:
    # A peptide without any protein hit has no entry in peps_prots. Skip it
    # instead of dying with a KeyError; the writer cell reports such peptides
    # as 'not found in any prot file'.
    if pep not in peps_prots:
        continue

    # Each label is 'sample id description...' and is split on single spaces.
    prots = [label.split(' ') for labels in peps_prots[pep] for label in labels]
    samples, proteins, genes, gene_symbols, transcripts, gene_biotypes = set(), set(), set(), set(), set(), set()

    for prot in prots:
        samples.add(prot[0])
        proteins.add(prot[1])

        # Mandatory 'key:value' tokens: take the first token with the prefix
        # and keep the text after the last ':'. A missing token raises
        # IndexError, as before.
        for prefix, bucket in (('gene:', genes),
                               ('transcript:', transcripts),
                               ('gene_biotype:', gene_biotypes)):
            bucket.add([tok for tok in prot if tok.startswith(prefix)][0].split(':')[-1])

        # gene_symbol is optional and silently skipped when absent.
        symbol_tokens = [tok for tok in prot if tok.startswith('gene_symbol:')]
        if symbol_tokens:
            gene_symbols.add(symbol_tokens[0].split(':')[-1])

    # Reverse lookups: register this peptide under each of its annotations.
    for lookup, keys in ((sample_pep_info, samples),
                         (gene_pep_info, genes),
                         (prot_pep_info, proteins),
                         (transcripts_pep_info, transcripts)):
        for key in keys:
            lookup.setdefault(key, set()).add(pep)

    # sorted(): set iteration order differs between runs; sorting keeps the
    # written columns reproducible.
    pep_gene_info[pep] = [','.join(sorted(gene_symbols)), ','.join(sorted(gene_biotypes)),
                          ','.join(sorted(samples)), ','.join(sorted(proteins)),
                          ','.join(sorted(genes)), ','.join(sorted(transcripts))]

In [7]:
# Re-emit the peptide table with the six annotation columns from pep_gene_info
# appended. Rows whose peptide (first column) has no entry are reported on
# stdout and left out of the file.
annotation_columns = ['gene_symbols', 'gene_biotypes', 'samples', 'proteins', 'genes', 'transcripts']

with open('peps_all_info_to_set.tsv', 'w') as peps_out:
    peps_out.write('\t'.join(header + annotation_columns) + '\n')
    for row in filtered_peps[1:]:
        if row[0] not in pep_gene_info:
            print('{} not found in any prot file'.format(row[0]))
            continue
        peps_out.write('\t'.join(row + pep_gene_info[row[0]]) + '\n')

In [8]:
def create_info_table(info_dict, index, out_file):
    """Write a ``key -> peptides`` mapping as a tab-separated table.

    The file has one row per key of ``info_dict`` and three columns: the key
    (column named ``index``), ``peps`` (the key's values joined with ',') and
    ``Count`` (the number of values).

    Parameters
    ----------
    info_dict : dict
        Maps a key to a collection (e.g. a set) of strings.
    index : str
        Name given to the index column in the output.
    out_file : str
        Path of the TSV file to write. A missing parent directory is created.
    """
    keys = list(info_dict)
    df = pd.DataFrame(
        {
            # sorted(): the values are typically sets, whose iteration order is
            # not stable between runs; sorting makes the output reproducible.
            'peps': [','.join(sorted(info_dict[key])) for key in keys],
            'Count': [len(info_dict[key]) for key in keys],
        },
        index=keys,
    )
    df.index.name = index

    # Create the target directory on demand so writing does not fail on a
    # fresh checkout where e.g. 'count/' does not exist yet.
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(out_file, header=True, sep="\t")

In [9]:
# Dump every reverse lookup (key -> peptides) into its own TSV under count/.
for lookup, index_name, out_path in (
    (sample_pep_info, "sample", "count/sample_peps.tsv"),
    (gene_pep_info, "gene", "count/gene_peps.tsv"),
    (prot_pep_info, "protein", "count/protein_peps.tsv"),
    (transcripts_pep_info, "transcripts", "count/transcripts_peps.tsv"),
):
    create_info_table(lookup, index_name, out_path)

In [10]:
# Per-peptide summary: how many samples / proteins / genes / transcripts each
# peptide maps to, derived from the comma-joined columns of the set-based table.
res_set = pd.read_table("peps_all_info_to_set.tsv")

# drop_duplicates() returns a new frame; without the assignment the call has no
# effect and repeated peptides stay in the table.
res_set = res_set.drop_duplicates(["sequence_x"])

def get_count(info_str):
    """Return the number of comma-separated items in ``info_str``."""
    return len(info_str.split(","))

for column in ["samples", "proteins", "genes", "transcripts"]:
    res_set[column + "_count"] = res_set[column].apply(get_count)

res_set = res_set[["seq","samples","samples_count","proteins","proteins_count","genes","genes_count","transcripts","transcripts_count"]]
res_set.to_csv("count/peps_others_count.tsv", header=True, sep="\t", index=False)

#### 2. info_to_list

In [11]:
# Same aggregation as the set-based variant, but every protein hit is kept
# (no de-duplication), in the order the hits were recorded in peps_prots.
pep_gene_info = {}

for pep in {row[0] for row in filtered_peps[1:]}:
    # A peptide without any protein hit has no entry in peps_prots. Skip it
    # instead of dying with a KeyError; the writer cell reports such peptides
    # as 'not found in any prot file'.
    if pep not in peps_prots:
        continue

    # Each label is 'sample id description...' and is split on single spaces.
    prots = [label.split(' ') for labels in peps_prots[pep] for label in labels]
    samples, proteins, genes, gene_symbols, transcripts, gene_biotypes = [], [], [], [], [], []

    for prot in prots:
        samples.append(prot[0])
        proteins.append(prot[1])

        # Mandatory 'key:value' tokens: take the first token with the prefix
        # and keep the text after the last ':'. A missing token raises
        # IndexError, as before.
        for prefix, bucket in (('gene:', genes),
                               ('transcript:', transcripts),
                               ('gene_biotype:', gene_biotypes)):
            bucket.append([tok for tok in prot if tok.startswith(prefix)][0].split(':')[-1])

        # gene_symbol is optional; records without one are skipped, so this
        # list can be shorter than the others and is not positionally aligned.
        symbol_tokens = [tok for tok in prot if tok.startswith('gene_symbol:')]
        if symbol_tokens:
            gene_symbols.append(symbol_tokens[0].split(':')[-1])

    pep_gene_info[pep] = [','.join(gene_symbols), ','.join(gene_biotypes), ','.join(samples),
                          ','.join(proteins), ','.join(genes), ','.join(transcripts)]

In [12]:
# Re-emit the peptide table with the six list-style annotation columns from
# pep_gene_info appended. Rows whose peptide (first column) has no entry are
# reported on stdout and left out of the file.
annotation_columns = ['gene_symbols', 'gene_biotypes', 'samples', 'proteins', 'genes', 'transcripts']

with open('peps_all_info_to_list.tsv', 'w') as peps_out:
    peps_out.write('\t'.join(header + annotation_columns) + '\n')
    for row in filtered_peps[1:]:
        if row[0] not in pep_gene_info:
            print('{} not found in any prot file'.format(row[0]))
            continue
        peps_out.write('\t'.join(row + pep_gene_info[row[0]]) + '\n')

#### 3. info_to_str

In [13]:
# Third variant: one string per peptide. Every protein hit becomes a
# comma-joined record 'sample,protein,gene,transcript,gene_biotype[,gene_symbol]'
# and the records of a peptide are joined with ';'.
pep_gene_info = {}

for pep in {row[0] for row in filtered_peps[1:]}:
    # A peptide without any protein hit has no entry in peps_prots. Skip it
    # instead of dying with a KeyError; the writer cell reports such peptides
    # as 'not found in any prot file'.
    if pep not in peps_prots:
        continue

    # Each label is 'sample id description...' and is split on single spaces.
    prots = [label.split(' ') for labels in peps_prots[pep] for label in labels]

    infos = []
    for prot in prots:
        info = [prot[0], prot[1]]

        # Mandatory 'key:value' tokens: take the first token with the prefix
        # and keep the text after the last ':'. A missing token raises
        # IndexError, as before.
        for prefix in ('gene:', 'transcript:', 'gene_biotype:'):
            info.append([tok for tok in prot if tok.startswith(prefix)][0].split(':')[-1])

        # gene_symbol is optional: records without one carry five fields
        # instead of six.
        symbol_tokens = [tok for tok in prot if tok.startswith('gene_symbol:')]
        if symbol_tokens:
            info.append(symbol_tokens[0].split(':')[-1])

        infos.append(','.join(info))

    pep_gene_info[pep] = ';'.join(infos)

In [14]:
# Re-emit the peptide table with a single extra 'all_info' column holding the
# per-peptide string from pep_gene_info. Rows whose peptide (first column) has
# no entry are reported on stdout and left out of the file.
with open('peps_all_info_to_str.tsv', 'w') as peps_out:
    peps_out.write('\t'.join(header + ['all_info']) + '\n')
    for row in filtered_peps[1:]:
        if row[0] not in pep_gene_info:
            print('{} not found in any prot file'.format(row[0]))
            continue
        peps_out.write('\t'.join(row + [pep_gene_info[row[0]]]) + '\n')
