## script for preprocessing data from orbitrap
<!--    # 1) splits the file based on times provided
#    # 2) deletes unpaired compounds for each scan
# (c) by Anya for Ilya 22.June 2023 with edits by Denis

# run by typing in the command line inside the folder with files



#    # python3 preproc_del_unpaired_isox.py 
# or in python notebook using 
#    # %run preproc_del_unpaired_isox.py 

# script reads all .isox files in the current folder as tables
# cuts tables into 4 separate ones
# resets time and scan number for each of the new tables
# writes down the files in the folder named by the original file

# if necessary:
# adjust cutting times in the def main:
    # df1_split_times
    # df2_split_times
    # df3_split_times
    # df4_split_times  -->

In [2]:
import glob
import os
import pandas as pd

In [3]:
def preproc_isox(path, df1_split_times,df2_split_times,df3_split_times,df4_split_times):
    """Read a whitespace-separated .isox table and cut it into four time windows.

    Parameters
    ----------
    path : str
        Path of the .isox file to read.
    df1_split_times, df2_split_times, df3_split_times, df4_split_times : sequence
        ``[start, stop]`` bounds (in the units of the ``time.min`` column) of
        each window. Both bounds are exclusive: rows whose time equals a bound
        are not included.

    Returns
    -------
    list of pandas.DataFrame
        Four independent tables, in window order. In each table ``time.min``
        is shifted so that it starts at 0.01 and ``scan.no`` is shifted so
        that it starts at 1.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence.
    data_isox = pd.read_table(path, sep=r'\s+')

    # print info
    print('The total length of the file is ' + str(data_isox['time.min'].min())+' min to '+
      str(data_isox['time.min'].max()) + ' min')
    print('____________________________________')
    print('The chosed split of the file is into 4 files \n 1st '
        + str(df1_split_times) +
        ' min \n 2nd ' + str(df2_split_times) +
        ' min \n 3rd ' + str(df3_split_times) +
        ' min \n 4th ' + str(df4_split_times) +' min')

    windows = (df1_split_times, df2_split_times, df3_split_times, df4_split_times)
    times = data_isox['time.min']

    dfs_list = []
    for window in windows:
        inside = (times > window[0]) & (times < window[1])
        # Explicit copy: the columns are rewritten below, and assigning into a
        # boolean-mask slice of data_isox triggers SettingWithCopyWarning.
        part = data_isox[inside].copy()

        # reset times and scan numbers so every new table starts from the beginning
        part['time.min'] = part['time.min'] - part['time.min'].min() + 0.01
        part['scan.no'] = part['scan.no'] - part['scan.no'].min() + 1
        dfs_list.append(part)

    return dfs_list

In [4]:
def percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0 when ``whole`` is zero.

    Both arguments are converted with ``float()`` before the calculation.
    """
    numerator = 100 * float(part)
    denominator = float(whole)
    # Guard against division by zero: a zero total yields 0.
    if denominator == 0:
        return 0
    return numerator / denominator

In [5]:
def find_prob_comp(dfs_list):
    """Print a report of compounds whose row count does not match the scan count.

    For every table in ``dfs_list`` a compound is reported when half of its
    non-null ``isotopolog`` entries differs from the largest ``scan.no`` in
    that table. For each such compound the two numbers and the percentage of
    missing data are printed. Nothing is returned.
    """
    for file_no, table in enumerate(dfs_list, start=1):
        print('___Working on file #' + str(file_no))

        for name in table['compound'].unique():
            # half of the isotopolog rows recorded for this compound
            pair_count = table[table['compound'] == name]['isotopolog'].count() / 2
            total_scans = table['scan.no'].max()
            if pair_count == total_scans:
                continue  # compound is present in every scan

            missing = 100 - percent(pair_count, total_scans)
            print('Problematic compound: ' + str(name) + '; No of scans/compounds: '
                  + str(total_scans) + ' / ' + str(pair_count))
            print(f'   missing {missing:.2f} % of the data')


In [6]:
def delete_unpaired(dfs_list):
    """Drop rows whose (scan, compound) combination does not occur exactly twice.

    Each table in ``dfs_list`` is grouped by ``scan.no`` and ``compound``;
    only the groups of size 2 are kept. Returns a new list of tables in the
    same order as the input.
    """
    print('Starting to delete unpaired lines')
    key_cols = ['scan.no', 'compound']

    def _only_paired(table):
        # number of rows for every (scan, compound) combination
        sizes = table.groupby(key_cols)['isotopolog'].size().reset_index(name='counts')
        paired_keys = sizes.loc[sizes['counts'] == 2, key_cols]
        # the inner merge keeps just the rows whose key is in the paired set
        return table.merge(paired_keys, on=key_cols, how='inner')

    cleaned = []
    for table in dfs_list:
        cleaned.append(_only_paired(table))
        print('Unpaired lines are deleted')
    return cleaned

In [7]:
def save(dfs_list):
    """Write every table in ``dfs_list`` to a tab-separated .isox file.

    The output folder is named after the first value of the table's
    ``filename`` column and is created if missing. The file name is built as
    ``<filename>_<smallest compound>_<largest compound>.isox``. Floats are
    written with three decimals.

    Empty tables (for example a time window that contained no scans) have no
    ``filename`` value to read and are skipped with a notice instead of
    raising ``IndexError``. An empty ``dfs_list`` is reported instead of
    failing on the final summary line.
    """
    print('____________________________________')
    print('Currently saving:')

    outpath = None
    for df in dfs_list:
        if df.empty:
            print('Skipping an empty table (nothing to save)')
            continue

        outpath = df['filename'].iloc[0]
        os.makedirs(outpath, exist_ok=True)
        name = (outpath + '/' + outpath + '_' + str(df['compound'].min())
                + '_' + str(df['compound'].max()) + '.isox')

        print(name)
        df.to_csv(name, sep='\t', float_format='%.3f')

    print('____________________________________')
    if outpath is None:
        # nothing was written, so there is no folder to report
        print('No files were saved')
    else:
        print('New files are saved in ./' + outpath + '/')


In [8]:
def main():
    """Process every ``94*.isox`` file found in the current folder.

    A file is skipped when a folder named like the file (without its
    extension) already exists. Otherwise the file is split into four time
    windows, problematic compounds are reported, unpaired rows are removed
    and the resulting tables are saved.
    """
    # Change the pattern (e.g. to '../data/*.isox') to read files from elsewhere.
    paths = glob.glob("94*.isox")

    # Set split times for the new files
    df1_split_times=[0,120]
    df2_split_times=[120.05,210]
    df3_split_times=[210.05,300]
    df4_split_times=[300.05,330]

    for path in paths:
        # splitext removes only the final extension, so file names that
        # contain additional dots are not truncated at the first one.
        fold_name = os.path.splitext(path)[0]

        # do not process if the folder exists
        if os.path.exists(fold_name):
            print(f"Folder '{fold_name}' already exists")
            continue

        print(f"Folder '{fold_name}' does not exist -> start processing")
        # preprocessing
        dfs_list = preproc_isox(path, df1_split_times,df2_split_times,df3_split_times,df4_split_times)
        print('____________________________________')
        print('Current file is ' + path)

        # report compounds with missing data
        find_prob_comp(dfs_list)
        # delete scans with unpaired isotopologs
        dfs_list = delete_unpaired(dfs_list)

        # save files
        save(dfs_list)

In [9]:
# Entry point: call main() when this file is executed directly rather than imported.
if __name__ == "__main__":
    main() 

Folder '94_Cholesterol_Nice-SBL1-85_vs_Nice-Comm-84_125-400' does not exist -> start processing
The total length of the file is 0.011 min to 329.999 min
____________________________________
The chosed split of the file is into 4 files 
 1st [0, 120] min 
 2nd [120.05, 210] min 
 3rd [210.05, 300] min 
 4th [300.05, 330] min
____________________________________
Current file is 94_Cholesterol_Nice-SBL1-85_vs_Nice-Comm-84_125-400.isox
___Working on file #1
___Working on file #2
Problematic compound: 199; No of scans/compounds: 5561 / 5555.0
   missing 0.11 % of the data
Problematic compound: 261; No of scans/compounds: 5561 / 5560.0
   missing 0.02 % of the data
Problematic compound: 271; No of scans/compounds: 5561 / 5556.0
   missing 0.09 % of the data
___Working on file #3
Problematic compound: 301; No of scans/compounds: 3163 / 3161.0
   missing 0.06 % of the data
Problematic compound: 313; No of scans/compounds: 3163 / 3159.5
   missing 0.11 % of the data
Problematic compound: 325; N