# HEK293 identification summary

In [None]:
import os
import sys

# Make the project sources in ../src importable from this notebook; the
# directory is appended to the module search path only if it is not already
# listed there.
src_dir = os.path.abspath('../src')
if sys.path.count(src_dir) == 0:
    sys.path.append(src_dir)

In [None]:
%matplotlib inline
import collections
import itertools

import pandas as pd
import pyteomics.auxiliary
import tqdm

from ann_solo import reader, spectrum, util

In [None]:
# Rebind `tqdm.tqdm` to the `tqdm_notebook` variant so that every progress bar
# created through `tqdm.tqdm(...)` further down uses it.
# NOTE(review): `tqdm.tqdm_notebook` appears to be deprecated in recent tqdm
# releases in favour of `tqdm.notebook.tqdm` -- confirm against the installed
# tqdm version.
tqdm.tqdm = tqdm.tqdm_notebook

In [None]:
def _parse_time_field(field):
    """Convert a ``<minutes>m<seconds>s`` duration string to seconds.

    For example ``'1m2.500s'`` gives ``62.5``.

    Raises
    ------
    ValueError
        If `field` does not contain the ``m`` separator or if its parts are
        not numeric.
    """
    minutes, separator, seconds = field.partition('m')
    if not separator:
        raise ValueError('Unsupported time format: {!r}'.format(field))
    return int(minutes) * 60 + float(seconds.rstrip('s'))


def extract_time_from_log(filename):
    """Return the CPU time (user + sys) in seconds recorded in a log file.

    The log is scanned for the first line whose first whitespace-separated
    field is ``user`` and whose second field is a duration such as
    ``1m2.345s``. The line directly after it is taken to be the matching
    ``sys`` line in the same format (presumably both were written by the
    shell's ``time`` builtin).

    Parameters
    ----------
    filename : str
        Path of the log file to read.

    Returns
    -------
    float or None
        The user time plus the sys time in seconds, or ``None`` if the log
        does not contain a ``user`` timing line.

    Raises
    ------
    ValueError
        If a timing field is not formatted as ``<minutes>m<seconds>s``.
    StopIteration
        If the ``user`` line is the last line of the file.
    """
    with open(filename, 'r') as f_in:
        for line in f_in:
            fields = line.split()
            # Only accept real timing lines: other log lines can merely
            # contain the word "user" (e.g. in a path such as /home/user/...)
            # and must not be parsed as a duration.
            if len(fields) < 2 or fields[0] != 'user':
                continue
            usertime = _parse_time_field(fields[1])
            # The sys time is reported on the line right after the user time.
            systime = _parse_time_field(next(f_in).split()[1])

            return usertime + systime

    return None

In [None]:
def read_spectrast_psms(filename):
    """Load a tab-separated SpectraST result file as a table of PSMs.

    The columns ``### Query``, ``ID``, ``Dot``, ``MzDiff`` and ``Proteins`` of
    the input file are translated to the following output columns:

    - ``sequence``: the part of ``ID`` before the first ``/``.
    - ``search_engine_score``: ``Dot`` converted to a number.
    - ``charge``: the part of ``ID`` after the first ``/`` converted to a
      number.
    - ``mass_diff``: ``MzDiff`` converted to a number.
    - ``is_decoy``: whether ``Proteins`` contains ``DECOY_``.

    Parameters
    ----------
    filename : str
        Path of the SpectraST result file (with a header row).

    Returns
    -------
    pd.DataFrame
        The columns listed above, indexed by ``PSM_ID`` (the ``### Query``
        value of each row).
    """
    raw = pd.read_csv(filename, sep='\t', header=0)

    # Both the sequence and the charge come from the `ID` column, so it only
    # needs to be split once.
    id_parts = raw['ID'].str.split('/')
    derived = {
        'sequence': id_parts.str[0],
        'PSM_ID': raw['### Query'],
        'search_engine_score': pd.to_numeric(raw['Dot']),
        'charge': pd.to_numeric(id_parts.str[1]),
        'mass_diff': pd.to_numeric(raw['MzDiff']),
        'is_decoy': raw['Proteins'].str.contains('DECOY_'),
    }
    column_order = ['sequence', 'PSM_ID', 'search_engine_score', 'charge',
                    'mass_diff', 'is_decoy']

    return pd.DataFrame(derived, columns=column_order).set_index('PSM_ID')


def _get_bin(psm, tol_mass):
    """Return the index of the mass-difference bin a PSM belongs to.

    The PSM's ``mass_diff`` value is floor-divided by the bin width
    `tol_mass`, so negative mass differences end up in negative bins.

    Parameters
    ----------
    psm : mapping
        Anything that can be indexed with ``'mass_diff'`` (e.g. a DataFrame
        row).
    tol_mass : float
        Width of a mass-difference bin.

    Returns
    -------
    int
        The bin index.
    """
    bin_index = psm['mass_diff'] // tol_mass
    return int(bin_index)


def _filter_fdr(psms, fdr):
    """Filter PSMs at the given FDR with ``pyteomics.auxiliary.filter``.

    PSMs are scored by their ``search_engine_score`` attribute (passed with
    ``reverse=True``) and recognised as decoys through their ``is_decoy``
    attribute; decoys are removed from the result (``remove_decoy=True``).
    The FDR is computed with ``formula=1`` and ``correction=0``, and
    ``full_output=True`` is requested; see the pyteomics documentation for
    the exact meaning of these settings.

    Parameters
    ----------
    psms : pd.DataFrame
        The PSMs to filter, with ``search_engine_score`` and ``is_decoy``
        columns.
    fdr : float
        The FDR threshold.

    Returns
    -------
    The value returned by ``pyteomics.auxiliary.filter`` for these settings.
    """
    def score_of(psm):
        return psm.search_engine_score

    def decoy_flag_of(psm):
        return psm.is_decoy

    return pyteomics.auxiliary.filter(
        psms,
        fdr=fdr,
        key=score_of,
        reverse=True,
        is_decoy=decoy_flag_of,
        remove_decoy=True,
        formula=1,
        correction=0,
        full_output=True,
    )


def filter_group_fdr_spectrast(psms, fdr, tol_mass, min_group_size):
    """Filter PSMs at the given FDR separately per mass-difference group.

    Each PSM is assigned to a mass bin of width `tol_mass` based on its
    ``mass_diff``. Every bin that contains at least `min_group_size` PSMs
    ("common" bin) is FDR-filtered on its own, whereas all PSMs from smaller
    ("uncommon") bins are pooled and FDR-filtered together.

    The input DataFrame is not modified; the bin assignment is stored in a
    ``mass_bin`` column of the returned data only.

    Parameters
    ----------
    psms : pd.DataFrame
        The PSMs to filter, with at least the ``mass_diff``,
        ``search_engine_score`` and ``is_decoy`` columns.
    fdr : float
        The FDR threshold applied to each group.
    tol_mass : float
        Width of the mass-difference bins.
    min_group_size : int
        Minimum number of PSMs for a bin to be filtered on its own.

    Returns
    -------
    pd.DataFrame
        The PSMs that pass the FDR threshold in their group: first those from
        the pooled uncommon bins (if any), then those from each common bin.
        Empty input yields an empty result.
    """
    if len(psms) == 0:
        # Nothing to group or filter; return an empty table that already has
        # the `mass_bin` column instead of failing further down.
        return psms.assign(
            mass_bin=pd.Series(index=psms.index, dtype='int64'))

    # `assign` returns a new DataFrame, so the caller's table does not
    # silently gain a `mass_bin` column.
    psms = psms.assign(
        mass_bin=psms.apply(_get_bin, axis=1, args=(tol_mass,)))

    groups_common, groups_uncommon = [], []
    for group in psms.groupby('mass_bin').indices.values():
        if len(group) >= min_group_size:
            groups_common.append(psms.iloc[group])
        else:
            groups_uncommon.append(psms.iloc[group])

    # calculate the FDR combined for all uncommon mass bins
    # and separately for each common mass bin
    groups = []
    if groups_uncommon:
        # Only pool the uncommon bins if there are any: `pd.concat` raises a
        # ValueError for an empty list, which happens when all mass bins are
        # common.
        groups.append(pd.concat(groups_uncommon))
    groups.extend(groups_common)

    return pd.concat([_filter_fdr(group, fdr) for group in groups])

In [None]:
# Settings that are passed to `filter_group_fdr_spectrast` when the SpectraST
# results are read further down.
# NOTE(review): `tol_mass` is in the unit of SpectraST's `MzDiff` column
# (presumably m/z) -- confirm.
max_fdr = 0.01    # FDR threshold applied to each group of PSMs
tol_mass = 0.1    # width of the mass-difference bins used to group the PSMs
min_group_size = 5    # bins with fewer PSMs are pooled before FDR filtering

In [None]:
# Directory with the processed HEK293 search results. The summary code below
# lists the `<search engine>/<search mode>/` subdirectories of this directory
# and caches its `stats.txt` and `summary.txt` output files in it.
hek293_dir = '../data/processed/hek293'

In [None]:
# Collect, for each search engine / search mode combination, the number of
# identified PSMs and the CPU time per file (`stats`) as well as the total
# number of PSMs, the mean CPU time in minutes and the number of unique
# peptide sequences (`summary`).
# Both tables are cached as tab-separated files and are only recomputed when
# either of the cache files is missing.
filename_stats = os.path.join(hek293_dir, 'stats.txt')
filename_summary = os.path.join(hek293_dir, 'summary.txt')
if os.path.isfile(filename_stats) and os.path.isfile(filename_summary):
    stats = pd.read_csv(filename_stats, sep='\t')
    summary = pd.read_csv(filename_summary, sep='\t', index_col=[0, 1])
else:
    num_ids = []     # (search engine, search mode, file stem, number of PSMs)
    runtimes = []    # (search engine, search mode, file stem, CPU seconds)
    psms = collections.defaultdict(list)
    total = 24 * 2 * 4    # 24 raw files * 2 (IDs & log) * 4 (search engine combos)
    with tqdm.tqdm(desc='Files processed', unit='files', total=total) as pbar:
        for search_engine in ('ann-solo', 'spectrast'):
            for search_mode in ('oms', 'std'):
                result_dir = os.path.join(hek293_dir, search_engine,
                                          search_mode)
                for filename in os.listdir(result_dir):
                    filename_full = os.path.join(result_dir, filename)
                    filename_stem = os.path.splitext(filename)[0]

                    if filename.endswith('.log'):
                        runtimes.append((
                            search_engine, search_mode, filename_stem,
                            extract_time_from_log(filename_full)))
                        pbar.update(1)
                        continue

                    if filename.endswith('.mztab'):
                        file_psms = reader.read_mztab_psms(filename_full)
                    elif filename.endswith('.txt'):
                        file_psms = filter_group_fdr_spectrast(
                            read_spectrast_psms(filename_full),
                            max_fdr, tol_mass, min_group_size)
                    else:
                        # Skip anything that is not an identification file
                        # (other file types, subdirectories, ...). Without
                        # this, the PSMs of the previously read file would be
                        # recorded once more under this entry's name, or
                        # `file_psms` would be undefined for the first entry.
                        continue

                    psms[(search_engine, search_mode)].append(file_psms)
                    num_ids.append((search_engine, search_mode,
                                    filename_stem, len(file_psms)))
                    pbar.update(1)

    num_ids_df = pd.DataFrame.from_records(
        num_ids, columns=['search_engine', 'search_mode', 'filename', 'psms'])
    time_df = pd.DataFrame.from_records(
        runtimes, columns=['search_engine', 'search_mode', 'filename', 'time'])
    # Inner join: only files that have both identifications and a log are
    # kept in the per-file statistics.
    stats = (pd.merge(num_ids_df, time_df,
                     on=['search_engine', 'search_mode', 'filename'])
             .sort_values(['search_engine', 'search_mode', 'filename'])
             .reset_index(drop=True))

    summary = (stats.groupby(['search_engine', 'search_mode'])
               .agg({'psms': 'sum', 'time': 'mean'}))
    # The log times are in seconds; report the mean time in minutes.
    summary['time'] = summary['time'] / 60

    # Number of unique peptide sequences over all files of each combination.
    psms_df = []
    for (search_engine, search_mode), psm_list in psms.items():
        num_peptides = len(pd.concat(psm_list)['sequence'].unique())
        psms_df.append((search_engine, search_mode, num_peptides))
    psms_df = pd.DataFrame.from_records(
        psms_df, index=['search_engine', 'search_mode'],
        columns=['search_engine', 'search_mode', 'peptides'])
    summary = summary.join(psms_df)

    stats.to_csv(filename_stats, sep='\t', index=False)
    summary.to_csv(filename_summary, sep='\t', index=True)

In [None]:
# Display the summary table, which has one row per search engine / search
# mode combination.
summary