## Spectronaut Output Screening

See the [README.md](https://github.com/bartongroup/Mouse-Candida-Peptide-Overlaps) file for full instructions on setting up and running these notebooks.

**N.B. The 'Candida_Mouse_Peptide_Intersections.ipynb' notebook must be run prior to this one**

This notebook will find any '.xls' tab-delimited files produced by Spectronaut; these need to be uploaded into the 'Spectronaut_results' directory before running it. Running the notebook will identify peptides from a dual Mouse/Candida MS proteomics analysis which cannot be assigned to a single organism because they are shared between the proteomes of the two organisms.

## Library Imports

In [None]:
import os
import sys
import pandas as pd
import dill as pickle
from PepOverlap import *

from Bio import SeqIO
from glob import glob
from yaspin import yaspin

We will start off by getting the sequences of the shared records and storing these as a list of Seq objects...

In [None]:
# Read the FASTA file of peptides shared between the two proteomes. The records are
# first keyed by their identifier, then reduced to just their sequences.
try:
	shared_records=SeqIO.to_dict(SeqIO.parse('data/shared_peptides.fa',format='fasta'))
except FileNotFoundError:
	# The FASTA file is an output of the companion notebook; tell the user, then re-raise
	print('Please first run the Candida_Mouse_Peptide_Intersections notebook to generate peptide data required for this notebook')
	raise
shared_peptides=[record.seq for record in shared_records.values()]

Reload the parsed peptide data...

In [None]:
# Restore the peptide data object that was pickled to 'data/pep_data.pkl'.
pep_data_path='data/pep_data.pkl'
try:
	with open(pep_data_path,'rb') as pickled:
		pep_info=pickle.load(pickled)
except FileNotFoundError:
	# Without the pickle nothing below can run; tell the user, then re-raise
	print('Please first run the Candida_Mouse_Peptide_Intersections notebook to generate peptide data required for this notebook')
	raise

Parse the fasta description line to extract gene symbols, descriptions etc...

In [None]:
# Parse the description lines of each organism's proteome FASTA file; the results are
# looked up later by protein id to obtain gene symbols and descriptions.
mouse_prot_info,candida_prot_info=(
	parse_fasta_desc(fasta_path) for fasta_path in ('data/mouse.fa','data/candida.fa')
)

Now iterate through any .xls files found in the `Spectronaut_results` directory

In [None]:
# Screen every Spectronaut export for peptides that are shared between the mouse and
# candida proteomes. For each input file two tab-delimited tables are produced:
#   * '<name>_pepconflicts.txt' - every input row whose cleaned peptide is a shared
#     peptide, written next to the input files
#   * 'outputs/<name>_summary.txt' - one row per shared peptide, with the mouse and
#     candida protein it belongs to plus gene/description metadata
results_dir='Spectronaut_results'
os.makedirs('outputs',exist_ok=True)

results=glob(os.path.join(results_dir,'*.xls'))
if not results:
	# Raise a real exception: a bare `raise` outside an `except` block only produces
	# "RuntimeError: No active exception to reraise", which hides the actual problem.
	raise FileNotFoundError(
		"No results found: Spectronaut result files (.xls) should be uploaded to the "
		f"'{results_dir}' folder before running this cell"
	)

# Compare peptides as plain strings held in a set, so each membership test is a hash
# lookup rather than a scan over the whole list of Seq objects.
shared_lookup={str(seq) for seq in shared_peptides}

mouse_df=pep_info.total_dfs['mouse']
candida_df=pep_info.total_dfs['candida']

for result in results:
	# Strip only the trailing extension; the name may contain '.xls' elsewhere
	file=os.path.splitext(os.path.basename(result))[0]
	df=pd.read_csv(result, sep="\t")

	with yaspin(text=f'Processing {file}...'):
		# Peptide sequences appear as e.g. '_C[Carbamidomethyl (C)]QGTFSPEDNSIK_.2', so the
		# surrounding '_' and any modification text within '[]' need removing.
		# The clean_peptide function imported from PepOverlap takes care of this.
		cleaned=[clean_peptide(precursor) for precursor in df['EG.PrecursorId']]
		is_shared=[str(peptide) in shared_lookup for peptide in cleaned]

		# Keep only the rows whose peptide is on our hit-list, with the cleaned
		# sequence added as an extra 'clean_peptide' column
		conflicts_df=df.assign(clean_peptide=cleaned).loc[is_shared]

		if conflicts_df.empty:
			# Nothing to report for this file; carry on with the remaining files
			print(f'No shared peptides found in {file}\n')
			continue

		# Write out the filtered results just containing shared peptides. They go into the
		# directory the inputs were read from (same capitalisation, so this also works on
		# case-sensitive filesystems); the '*.xls' glob above never picks them up again.
		conflicts_path=os.path.join(results_dir,f'{file}_pepconflicts.txt')
		conflicts_df.to_csv(conflicts_path,sep="\t",index=False)
		conflicts_df=conflicts_df.drop_duplicates(subset='clean_peptide',keep='first')
		print(f'Results from {file} written to {conflicts_path}\n')

		# Collect additional sequence metadata for each distinct shared peptide...
		conflict_summary_df=(
			conflicts_df[['clean_peptide']]
			.rename(columns={'clean_peptide':'peptide'})
			.merge(pep_info.duplicate_df,on='peptide')
		)

		# A peptide has one row per organism/protein after the merge, so group by the
		# peptide sequence to get one summary result per peptide
		unique_peptides=list()
		for name, group in conflict_summary_df.groupby('peptide'):
			# First protein recorded for this peptide in each organism
			mouse_prot=group.loc[group['organism']=='mouse','protein_id'].values[0]
			candida_prot=group.loc[group['organism']=='candida','protein_id'].values[0]

			# Look each protein up once and reuse the matching rows for every field
			mouse_info=mouse_prot_info[mouse_prot_info['prot_id']==mouse_prot]
			candida_info=candida_prot_info[candida_prot_info['prot_id']==candida_prot]

			unique_peptides.append({
				'peptide': name,
				'mouse_prot': mouse_prot,
				'mouse_gene': mouse_info['gene'].values[0],
				'mouse_desc': mouse_info['description'].values[0],
				# number of rows recorded for this protein in the per-organism peptide table
				'mouse_total_peptides': (mouse_df['protein_id']==mouse_prot).sum(),
				'candida_prot': candida_prot,
				'candida_gene': candida_info['gene'].values[0],
				'candida_desc': candida_info['description'].values[0],
				'candida_total_peptides': (candida_df['protein_id']==candida_prot).sum(),
			})

		unique_peptides_df=pd.DataFrame(unique_peptides)
		unique_peptides_df.to_csv(f'outputs/{file}_summary.txt',sep="\t",index=False)
		print(f'Peptide summaries from {file} written to outputs/{file}_summary.txt\n')