-
Notifications
You must be signed in to change notification settings - Fork 2
/
standardSearchResultProcessing.py
118 lines (98 loc) · 4.75 KB
/
standardSearchResultProcessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import sys
import re
import argparse
import pandas as pd
## Read Identified Proteins
def readIdentifiedProteinPeptide(filename,sep):
fileDFObj = pd.read_table(filename, sep=sep, keep_default_na=False, na_values=[''])
return fileDFObj
def pepThresholding(prots, pepTh, protRevStr, protContStr):
print("1. prots dim:"+str(prots.shape))
prots=prots[~prots['protein accession'].str.contains(protRevStr)]
print("2. prots dim:"+str(prots.shape))
prots=prots[~prots['protein accession'].str.contains(protContStr)]
print("3. prots dim:"+str(prots.shape))
prots=prots[prots['distinct peptide sequences']>pepTh]
print("4. prots dim:"+str(prots.shape))
return prots
def identifiedProteinList(prots, columnName):
identifiedProts=prots[columnName].tolist()
return identifiedProts
def extractProtIdsFromDescription(prots, sep):
##prots is a list of strings, generally the description column of protein identification file
protIds=[t.partition(sep)[0] for t in prots]
return protIds
def filterPeptide(peptideObj, protIds, pepColumn):
peptideObj['Is decoy']=peptideObj['Is decoy'].astype(str)
#print(peptideObj['Is decoy'])
protIdsEsc=[re.escape(prot+"_") for prot in protIds]
protStr="|".join(protIdsEsc)
##removing decoy hits
peptideObj=peptideObj[~peptideObj['Is decoy'].str.contains("TRUE",case=False)]
##removing PSMs only mapping to Contaminents
peptideObj=peptideObj[~peptideObj[pepColumn].str.contains("^(CONT.*;?)+$")]
filteredPeptideObj = peptideObj[peptideObj[pepColumn].str.contains(protStr)]
return filteredPeptideObj
def filterFasta(fastaFile, overlapping_ids, outFile):
fastahandle = open(fastaFile, "rU")
outputHandle = open(outFile, "w")
#print(overlapping_ids)
overlapping_ids[:] = [line.strip() for line in overlapping_ids]
overlapping_ids[:] = [line.replace('"','') for line in overlapping_ids]
#print("oids:"+str(len(overlapping_ids)))
#print(overlapping_ids[0])
records = list(SeqIO.parse(fastahandle, "fasta"))
#print("Total=")
#print(len(records))
count1=0
count2=0
for record in records: # SeqIO.parse(handle, "fasta")
#print(record.description)
#print("TESTTEST")
if record.description in overlapping_ids:
overlapping_ids.remove(record.description)
count1=count1+1
outputHandle.write(">"+record.description+"\n")
outputHandle.write(str(record.seq)+"\n")
#continue
#print("found:"+record.description)
fastahandle.close()
outputHandle.close()
def writeResult(filteredObj, filename):
filteredObj.to_csv(filename, index=False)
def main(args):
protColName="description"
pepColName="proteinacc_start_stop_pre_post_;"
protObj1=readIdentifiedProteinPeptide(args.protein,",")
pepObj=readIdentifiedProteinPeptide(args.peptide,",")
protObj=pepThresholding(protObj1, pepTh, protRevStr, protContStr)
#temporarily commented
writeResult(protObj, args.protOut)
### Filter ORFs fasta file
identifiedProt=identifiedProteinList(protObj, protColName)
###Filter PeptideObj
##we are getting the ids again because the list has changed in the filterFasta function
identifiedProt=identifiedProteinList(protObj, protColName)
protForPep=extractProtIdsFromDescription(identifiedProt, " ")
fPeptide=filterPeptide(pepObj, protForPep, pepColName)
writeResult(fPeptide, args.pepOut)
filteredPrt=readIdentifiedProteinPeptide(args.protOut,',')
protIds=filteredPrt["description"].tolist()
filterFasta(args.fasta, protIds, args.fastaOut)
protRevStr="XXX_"
protContStr="CONT_"
pepTh=1
parser = argparse.ArgumentParser(description='MSGF+ protein peptide identification filtering for standard proteome search/correcting protein peptide filtering from a previous run')
parser.add_argument("--protein", required=True, help="Identified Protein file name")
parser.add_argument("--peptide", required=True, help="Identified Peptide file name")
parser.add_argument("--protOut", required=True, help="Identified Protein file name")
parser.add_argument("--pepOut", required=True, help="Identified Peptide file name")
parser.add_argument("--fasta", required=True, help="Standard fasta file name")
parser.add_argument("--fastaOut", required=True, help="Standard identified sequence output fasta file name")
args = parser.parse_args()
main(args)
##Human adeno-virus
#proteinFile="/data/SBCS-BessantLab/shyama/Data/Bristol/Human/adenovirus/identification/PASA/human_adeno_mydb_pasa.standard+fdr+th+grouping+prt.csv"
#peptideFile="/data/SBCS-BessantLab/shyama/Data/Bristol/Human/adenovirus/identification/PASA/human_adeno_mydb_pasa.standard+fdr+th+grouping.csv"