In [1]:
import os
import sys

# Make the project's source directory importable. '../src' is resolved
# against the current working directory, and the entry is only added when it
# is not already on the module search path (so re-running the cell is safe).
src_dir = os.path.abspath('../src')
if not any(entry == src_dir for entry in sys.path):
    sys.path.append(src_dir)

In [2]:
%matplotlib inline
import collections

import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm

import reader
import util

In [3]:
# Use the notebook flavour of the tqdm progress bar for every `tqdm.tqdm` call
# made later in this notebook.
tqdm.tqdm = tqdm.tqdm_notebook

# plot styling
# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-<name>' and later releases dropped the old names, so pick
# whichever spelling the installed Matplotlib actually provides.
plt.style.use([
    style if style in plt.style.available
    else style.replace('seaborn-', 'seaborn-v0_8-', 1)
    for style in ('seaborn-white', 'seaborn-paper')])
plt.rc('font', family='serif')
sns.set_palette('Set1')

In [4]:
def _time_field_to_seconds(line):
    """Convert the duration on one timing line to seconds.

    The duration is read from the second whitespace-separated field of
    `line` and must look like ``<minutes>m<seconds>s`` (e.g. ``1m2.500s``).

    Returns
    -------
    float or None
        The duration in seconds, or ``None`` if the line does not contain a
        duration of that shape.
    """
    fields = line.split()
    if len(fields) < 2:
        return None
    duration = fields[1]
    idx_min, idx_sec = duration.find('m'), duration.rfind('s')
    if idx_min == -1 or idx_sec < idx_min:
        return None
    try:
        return (int(duration[:idx_min]) * 60
                + float(duration[idx_min + 1:idx_sec]))
    except ValueError:
        return None


def extract_time_from_log(filename):
    """Return the CPU time (user + sys) recorded in a log file, in seconds.

    The log is scanned for the first line that mentions ``user`` and carries
    a ``<minutes>m<seconds>s`` duration as its second field; the line directly
    after it is read as the sys time. (This matches the layout of the shell
    ``time`` report -- presumably how the logs were produced.)

    Parameters
    ----------
    filename : str
        Path of the log file to read.

    Returns
    -------
    float or None
        User time plus sys time in seconds, or ``None`` if the log contains
        no user timing line.

    Raises
    ------
    ValueError
        If a user timing line is found but the line after it is missing or
        does not hold a parsable duration.
    """
    with open(filename, 'r') as f_in:
        for line in f_in:
            if 'user' not in line:
                continue
            usertime = _time_field_to_seconds(line)
            if usertime is None:
                # The line merely mentions "user" (e.g. inside a file path or
                # a log message); keep looking for the real timing line.
                continue
            # The sys time is expected on the line right after the user time.
            systime = _time_field_to_seconds(next(f_in, ''))
            if systime is None:
                raise ValueError(
                    'No sys time found after the user time in {}'.format(
                        filename))

            return usertime + systime

    return None

In [5]:
def read_spectrast_psms(filename):
    """Read a tab-separated PSM table and rename it to mzTab-style columns.

    The input must have a header row with the columns ``### Query``, ``ID``
    (formatted as ``<sequence>/<charge>``), ``Dot``, ``MzDiff`` and
    ``Proteins``.

    Parameters
    ----------
    filename : str or file-like
        Anything accepted by `pandas.read_csv`.

    Returns
    -------
    pd.DataFrame
        One row per input row with the columns ``sequence``, ``PSM_ID``,
        ``search_engine_score[1]``, ``charge``, ``exp_mass_to_charge``,
        ``calc_mass_to_charge`` and
        ``opt_ms_run[1]_cv_MS:1002217_decoy_peptide``.
    """
    table = pd.read_csv(filename, sep='\t', header=0)
    # 'ID' holds '<sequence>/<charge>'.
    id_parts = table['ID'].str.split('/')
    derived = {
        'sequence': id_parts.str[0],
        'PSM_ID': table['### Query'],
        'search_engine_score[1]': pd.to_numeric(table['Dot']),
        'charge': pd.to_numeric(id_parts.str[1]),
        # Only the m/z difference is available: it is stored as the
        # calculated m/z while the experimental m/z is fixed at 0.
        # NOTE(review): presumably so downstream code can recover the mass
        # difference from these two columns -- confirm the expected sign.
        'exp_mass_to_charge': 0,
        'calc_mass_to_charge': pd.to_numeric(table['MzDiff']),
        # A PSM is flagged as decoy when its protein entry contains 'DECOY_'.
        'opt_ms_run[1]_cv_MS:1002217_decoy_peptide':
            table['Proteins'].str.contains('DECOY_'),
    }
    for column, values in derived.items():
        table[column] = values

    return table[list(derived)]

In [6]:
# Settings handed to util.filter_group_fdr when the search results are
# filtered further down in this notebook.
# Presumably the maximum false discovery rate that is accepted (1%) -- TODO
# confirm against util.filter_group_fdr.
max_fdr = 0.01
# Tolerance value; presumably a mass tolerance expressed in the unit named by
# `tol_mode` -- TODO confirm.
tol_mass = 0.1
# Unit/mode of `tol_mass` (presumably Dalton).
tol_mode = 'Da'

In [7]:
# Directory with the processed hek293 search results. The processing cell
# reads the `<search_engine>/<search_mode>/` sub-directories from it and keeps
# its cached `stats.txt` / `summary.txt` files directly inside it.
hek293_dir = '../data/processed/hek293'

In [8]:
# The per-file statistics and the per-engine/mode summary are cached as
# tab-separated files inside `hek293_dir`; the search results are only
# processed again when either cache file is missing.
filename_stats = os.path.join(hek293_dir, 'stats.txt')
filename_summary = os.path.join(hek293_dir, 'summary.txt')
if os.path.isfile(filename_stats) and os.path.isfile(filename_summary):
    stats = pd.read_csv(filename_stats, sep='\t')
    summary = pd.read_csv(filename_summary, sep='\t', index_col=[0, 1])
else:
    # Records of (search_engine, search_mode, filename, number of PSMs).
    num_ids = []
    # Records of (search_engine, search_mode, filename, run time).
    runtimes = []
    # (search_engine, search_mode) -> list of filtered PSM tables, one per file.
    psms = collections.defaultdict(list)
    total = 24 * 2 * 4    # 24 raw files * 2 (IDs & log) * 4 (search engine combos)
    with tqdm.tqdm(desc='Files processed', unit='files', total=total) as pbar:
        for search_engine in ('ann-solo', 'spectrast'):
            for search_mode in ('oms', 'closed'):
                result_dir = os.path.join(hek293_dir, search_engine,
                                          search_mode)
                for filename in os.listdir(result_dir):
                    filename_full = os.path.join(result_dir, filename)
                    run_name = os.path.splitext(filename)[0]
                    if filename.endswith('.log'):
                        runtimes.append((
                            search_engine, search_mode, run_name,
                            extract_time_from_log(filename_full)))
                        pbar.update(1)
                        continue

                    # Pick the reader matching the identification file type.
                    if filename.endswith('.mztab'):
                        read_psms = reader.read_mztab_psms
                    elif filename.endswith('.txt'):
                        read_psms = read_spectrast_psms
                    else:
                        # Neither a log nor an identification file. Skip it:
                        # otherwise the PSMs of the previously processed file
                        # would be counted a second time (or `file_psms`
                        # would be undefined for the very first file).
                        continue

                    file_psms = util.filter_group_fdr(
                        read_psms(filename_full),
                        max_fdr, tol_mass, tol_mode)
                    pbar.update(1)
                    psms[(search_engine, search_mode)].append(file_psms)
                    num_ids.append((search_engine, search_mode, run_name,
                                    len(file_psms)))

    num_ids_df = pd.DataFrame.from_records(
        num_ids, columns=['search_engine', 'search_mode', 'filename', 'psms'])
    time_df = pd.DataFrame.from_records(
        runtimes, columns=['search_engine', 'search_mode', 'filename', 'time'])
    # Inner join: only runs that have both an identification file and a log
    # end up in the per-file statistics.
    stats = (pd.merge(num_ids_df, time_df,
                      on=['search_engine', 'search_mode', 'filename'])
             .sort_values(['search_engine', 'search_mode', 'filename'])
             .reset_index(drop=True))

    # Per search engine and mode: total number of PSMs and mean run time.
    summary = (stats.groupby(['search_engine', 'search_mode'])
               .agg({'psms': 'sum', 'time': 'mean'}))
    # extract_time_from_log reports seconds; show the mean in minutes.
    summary['time'] = summary['time'] / 60

    # Number of distinct peptide sequences over all files of each combination.
    peptide_counts = []
    for (search_engine, search_mode), psm_list in psms.items():
        num_peptides = len(pd.concat(psm_list)['sequence'].unique())
        peptide_counts.append((search_engine, search_mode, num_peptides))
    psms_df = pd.DataFrame.from_records(
        peptide_counts, index=['search_engine', 'search_mode'],
        columns=['search_engine', 'search_mode', 'peptides'])
    summary = summary.join(psms_df)

    # Cache the results so the next run can skip the processing above.
    stats.to_csv(filename_stats, sep='\t', index=False)
    summary.to_csv(filename_summary, sep='\t', index=True)

In [9]:
# Display the summary per search engine and search mode: total PSMs, mean run
# time (in minutes) and number of unique peptide sequences.
summary

Unnamed: 0_level_0,Unnamed: 1_level_0,psms,time,peptides
search_engine,search_mode,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ann-solo,closed,353436,24.730299,113189
ann-solo,oms,711939,129.513128,189028
spectrast,closed,369107,5.205294,111738
spectrast,oms,519646,1276.663253,134905
