## Spectronaut Output Screening

Now that we know which peptides are problematic (i.e. shared between organisms), we can use them to screen Spectronaut analysis runs and flag any peptide identifications which may be incorrect.

This notebook processes every tab-delimited '.xls' file produced by Spectronaut. The files need to be placed one sub-directory deep inside the 'Spectronaut_results' directory (i.e. `Spectronaut_results/<run>/<file>.xls`).

## Library Imports

In [1]:
import os
import pandas as pd
import dill as pickle
from PepOverlap import *

from Bio import SeqIO
from glob import glob
from yaspin import yaspin

In [2]:
# Load the shared (cross-organism) peptide sequences as a set of plain strings.
# A set gives constant-time membership tests in the per-row screening loop,
# instead of a linear scan over a list of Bio.Seq objects for every row.
# NOTE(review): assumes clean_peptide() returns a plain str - TODO confirm.
shared_peptides={str(record.seq) for record in SeqIO.parse('data/shared_peptides.fa',format='fasta')}

# SECURITY NOTE: unpickling can execute arbitrary code. Only load
# 'data/pep_data.pkl' if it was produced by a trusted run of this project.
with open('data/pep_data.pkl','rb') as fh:
	pep_info=pickle.load(fh)

# Per-protein description tables; the screening loop reads their 'prot_id'
# and 'gene' columns.
mouse_prot_info=parse_fasta_desc('data/mouse.fa')
candida_prot_info=parse_fasta_desc('data/candida.fa')

# Directory holding the Spectronaut exports; the reports are written here too.
# One constant keeps the read and write paths in agreement: previously inputs
# were read from 'Spectronaut_results' but outputs went to 'spectronaut_results',
# which is a different (possibly missing) directory on case-sensitive filesystems.
RESULTS_DIR='Spectronaut_results'

# Column layout of the per-file summary report. Declared up front so that an
# empty summary is still written with a header row.
SUMMARY_COLUMNS=[
	'peptide',
	'mouse_prot',
	'mouse_gene',
	'mouse_total_peptides',
	'candida_prot',
	'candida_gene',
	'candida_total_peptides'
]

# Loop-invariant lookups, hoisted out of the per-file loop.
mouse_df=pep_info.total_dfs['mouse']
candida_df=pep_info.total_dfs['candida']

# Exports are expected one sub-directory deep: <RESULTS_DIR>/<run>/<name>.xls
for result in glob(os.path.join(RESULTS_DIR,'*','*.xls')):
	# splitext removes only the trailing extension; str.replace would also
	# strip any '.xls' occurring elsewhere in the file name.
	file=os.path.splitext(os.path.basename(result))[0]
	df=pd.read_csv(result, sep="\t")

	with yaspin(text=f'Processing {file}...'):
		# Collect every row whose cleaned peptide is one of the shared peptides.
		conflicts=list()
		for index,peprow in df.iterrows():
			peptide=clean_peptide(peprow['EG.PrecursorId'])

			if peptide in shared_peptides:
				# Work on a dict copy rather than mutating the Series that
				# iterrows() yields.
				conflict_row=peprow.to_dict()
				conflict_row['clean_peptide']=peptide
				conflicts.append(conflict_row)

		# Explicit columns guarantee 'clean_peptide' exists even when no
		# conflicts were found (an empty list would otherwise produce a
		# column-less frame and a KeyError further down).
		conflict_columns=[col for col in df.columns if col!='clean_peptide']+['clean_peptide']
		conflicts_df=pd.DataFrame(conflicts,columns=conflict_columns)
		conflicts_df.to_csv(os.path.join(RESULTS_DIR,f'{file}_pepconflicts.txt'),sep="\t",index=False)

		summary_path=os.path.join(RESULTS_DIR,f'{file}_summary.txt')
		if conflicts_df.empty:
			# Nothing to summarise: write a header-only report and move on to
			# the next file instead of aborting the whole run.
			pd.DataFrame(columns=SUMMARY_COLUMNS).to_csv(summary_path,sep="\t",index=False)
			continue

		# add additional sequence metadata....
		# One row per distinct conflicting peptide (first occurrence kept),
		# joined to the per-organism records in pep_info.duplicate_df.
		conflict_summary_df=(
			conflicts_df[['clean_peptide']]
			.drop_duplicates(subset='clean_peptide',keep='first')
			.rename(columns={'clean_peptide':'peptide'})
			.merge(pep_info.duplicate_df,on='peptide')
		)

		unique_peptides=list()
		for name, group in conflict_summary_df.groupby('peptide'):
			# Each group is expected to hold a 'mouse' and a 'candida' row;
			# .values[0] raises IndexError if either one is missing.
			mouse_prot=group.loc[group['organism']=='mouse','protein_id'].values[0]
			candida_prot=group.loc[group['organism']=='candida','protein_id'].values[0]

			# Number of rows recorded for each protein in the per-organism tables.
			mouse_total_peptides=(mouse_df['protein_id']==mouse_prot).sum()
			candida_total_peptides=(candida_df['protein_id']==candida_prot).sum()

			unique_peptides.append({
				'peptide': name,
				'mouse_prot': mouse_prot,
				'mouse_gene': mouse_prot_info.loc[mouse_prot_info['prot_id']==mouse_prot,'gene'].values[0],
				'mouse_total_peptides': mouse_total_peptides,
				'candida_prot': candida_prot,
				'candida_gene': candida_prot_info.loc[candida_prot_info['prot_id']==candida_prot,'gene'].values[0],
				'candida_total_peptides': candida_total_peptides
			})

		unique_peptides_df=pd.DataFrame(unique_peptides,columns=SUMMARY_COLUMNS)
		unique_peptides_df.to_csv(summary_path,sep="\t",index=False)

                                                                               