# Prepare the data
* download, extract and prepare peaklists

In [1]:
%%bash
# Download the CASMI 2017 challenge spectra and the solutions table, then turn
# the extracted MS/MS files into peak lists under casmi2017/peaklists.
data_dir=casmi2017/data
peak_dir=casmi2017/peaklists
spectra_zip=challenges-046-243-msms-tsv-20170707.zip

mkdir -p "$data_dir"
wget -N -P "$data_dir" "http://casmi-contest.org/2017/challenges/$spectra_zip"
wget -N -P "$data_dir" "http://casmi-contest.org/2017/CASMI-solutions.csv"
unzip -u -d "$data_dir" "$data_dir/$spectra_zip"

# Remove "-msms" from the extracted file names.
rename -f 's/-msms//' "$data_dir"/casmi2017/challenge*.txt
# Replace the first tab on every line with a space, editing the files in place.
sed -i 's/\t/ /' "$data_dir"/casmi2017/challenge*.txt

mkdir -p "$peak_dir"
mv "$data_dir"/casmi2017/challenge*.txt "$peak_dir"/

--2022-07-21 11:14:52--  http://casmi-contest.org/2017/challenges/challenges-046-243-msms-tsv-20170707.zip
Auflösen des Hostnamens casmi-contest.org (casmi-contest.org) … 216.105.38.10
Verbindungsaufbau zu casmi-contest.org (casmi-contest.org)|216.105.38.10|:80 … verbunden.
HTTP-Anforderung gesendet, auf Antwort wird gewartet … 200 OK
Länge: 116254 (114K) [application/octet-stream]
Wird in »casmi2017/data/challenges-046-243-msms-tsv-20170707.zip« gespeichert.

     0K .......... .......... .......... .......... .......... 44%  125K 1s
    50K .......... .......... .......... .......... .......... 88%  292K 0s
   100K .......... ...                                        100% 17,9M=0,6s

2022-07-21 11:14:53 (199 KB/s) - »casmi2017/data/challenges-046-243-msms-tsv-20170707.zip« gespeichert [116254/116254]

--2022-07-21 11:14:53--  http://casmi-contest.org/2017/CASMI-solutions.csv
Auflösen des Hostnamens casmi-contest.org (casmi-contest.org) … 216.105.38.10
Verbindungsaufbau zu casmi-cont

Archive:  casmi2017/data/challenges-046-243-msms-tsv-20170707.zip
  inflating: casmi2017/data/casmi2017/challenge-046-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-047-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-048-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-049-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-050-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-051-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-052-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-053-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-054-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-055-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-056-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-057-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-058-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-059-msms.txt  
  inflating: casmi2017/data/casmi2017/challenge-060-

* process experimental settings to create run parameter files

In [2]:
import pandas as pd
import csv
import os
import rdkit.Chem.rdMolDescriptors

# Write one MetFrag parameter file per challenge into casmi2017/parameters.
# The neutral precursor mass is computed from the InChI of the known solution,
# and the first block of the solution's InChIKey is appended as a comment line
# so that it can be looked up again later.
outPath='casmi2017/parameters'
os.makedirs(outPath, exist_ok=True)

parameter = pd.read_csv("casmi2017/data/casmi2017/summary-046-243.csv", sep='\t')
solutions = pd.read_csv("casmi2017/data/CASMI-solutions.csv", sep=',')

#loop over all challenges
for index, row in parameter.iterrows():
    challenge = row['challengename']

    #get the InChIkey part one of the solution
    solution = solutions[solutions['Challenge'] == challenge]
    if solution.empty:
        # fail with a message naming the challenge instead of a bare IndexError
        raise ValueError("No solution found for "+challenge)
    inchikey = solution['InChIkey'].iloc[0].split('-')[0]

    #calculate neutral precursor mass from solution
    mol = rdkit.Chem.MolFromInchi(solution['InChI'].iloc[0])
    if mol is None:
        # MolFromInchi returns None when the InChI cannot be parsed
        raise ValueError("Could not parse the solution InChI of "+challenge)
    exactMass = rdkit.Chem.rdMolDescriptors.CalcExactMolWt(mol)

    #every ION_MODE value other than "NEGATIVE" is treated as positive mode
    isNegative = row['ION_MODE'] == "NEGATIVE"

    #compile the parameters
    content = (
        "PeakListPath = /casmi2017/peaklists/"+challenge+".txt\n"
        "MetFragDatabaseType = LocalCSV\n"
        "LocalDatabasePath = /casmi2017/candidates/"+challenge+".csv\n"
        "NeutralPrecursorMass = "+str(exactMass)+"\n"
        "FragmentPeakMatchAbsoluteMassDeviation = 0.001\n"
        "FragmentPeakMatchRelativeMassDeviation = 5\n"
        "DatabaseSearchRelativeMassDeviation = 5\n"
        "PrecursorIonMode = "+("-1" if isNegative else "1")+"\n"
        "IsPositiveIonMode = "+("False" if isNegative else "True")+"\n"
        "MetFragScoreTypes = FragmenterScore,OfflineIndividualMoNAScore\n"
        "MetFragScoreWeights = 1.0,1.0\n"
        "MetFragCandidateWriter = PSV\n"
        "SampleName = "+challenge+"\n"
        "ResultsPath = /casmi2017/results\n"
        "MaximumTreeDepth = 2\n"
        "MetFragPreProcessingCandidateFilter = UnconnectedCompoundFilter\n"
        "MetFragPostProcessingCandidateFilter = InChIKeyFilter\n"
        "OfflineSpectralDatabaseFile = /casmi2017/spectrallibrary.mb\n"
        "NumberThreads = 1\n"
        "# InChIKey = "+inchikey
        )

    #write parameters (print appends the final newline)
    with open(os.path.join(outPath, challenge+".txt"), "w") as text_file:
        print(content, file=text_file)


* prepare candidate lists

In [3]:
%%bash
# Download the category-4 candidate SMILES lists and shorten the extracted file
# names from candidates-challenge-NNN-smiles-nonredundant.txt to challenge-NNN.txt.
data_dir=casmi2017/data
candidates_zip=candidates-category4-smiles-nonredundant-20170621.zip

wget -N -P "$data_dir" "http://casmi-contest.org/2017/$candidates_zip"
unzip -u -d "$data_dir" "$data_dir/$candidates_zip"

# Two passes: drop the "candidates-" prefix first, then the trailing suffix.
rename -f 's/candidates-//' "$data_dir"/candidates/candidates-challenge*.txt
rename -f 's/-smiles-nonredundant//' "$data_dir"/candidates/challenge*.txt

--2022-07-21 11:15:02--  http://casmi-contest.org/2017/candidates-category4-smiles-nonredundant-20170621.zip
Auflösen des Hostnamens casmi-contest.org (casmi-contest.org) … 216.105.38.10
Verbindungsaufbau zu casmi-contest.org (casmi-contest.org)|216.105.38.10|:80 … verbunden.
HTTP-Anforderung gesendet, auf Antwort wird gewartet … 200 OK
Länge: 19136297 (18M) [application/octet-stream]
Wird in »casmi2017/data/candidates-category4-smiles-nonredundant-20170621.zip« gespeichert.

     0K .......... .......... .......... .......... ..........  0%  146K 2m8s
    50K .......... .......... .......... .......... ..........  0%  283K 97s
   100K .......... .......... .......... .......... ..........  0% 32,1M 64s
   150K .......... .......... .......... .......... ..........  1%  290K 64s
   200K .......... .......... .......... .......... ..........  1% 31,4M 51s
   250K .......... .......... .......... .......... ..........  1%  280K 54s
   300K .......... .......... .......... .......... ....

Archive:  casmi2017/data/candidates-category4-smiles-nonredundant-20170621.zip
  inflating: casmi2017/data/candidates/candidates-challenge-046-smiles-nonredundant.txt  
  inflating: casmi2017/data/candidates/candidates-challenge-047-smiles-nonredundant.txt  
  inflating: casmi2017/data/candidates/candidates-challenge-048-smiles-nonredundant.txt  
  inflating: casmi2017/data/candidates/candidates-challenge-049-smiles-nonredundant.txt  
  inflating: casmi2017/data/candidates/candidates-challenge-050-smiles-nonredundant.txt  
  inflating: casmi2017/data/candidates/candidates-challenge-051-smiles-nonredundant.txt  
  inflating: casmi2017/data/candidates/candidates-challenge-052-smiles-nonredundant.txt  
  inflating: casmi2017/data/candidates/candidates-challenge-053-smiles-nonredundant.txt  
  inflating: casmi2017/data/candidates/candidates-challenge-054-smiles-nonredundant.txt  
  inflating: casmi2017/data/candidates/candidates-challenge-055-smiles-nonredundant.txt  
  inflating: casmi201

In [4]:
import pandas as pd
import csv
import os
import rdkit.Chem
# imported explicitly so this cell does not rely on another cell having loaded it
import rdkit.Chem.rdMolDescriptors
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
pd.options.mode.chained_assignment = None

# For every challenge: read the candidate SMILES, keep those whose exact mass
# lies within 5 ppm of the solution's exact mass, add InChI / InChIKey /
# molecular formula and write the result as a CSV to casmi2017/candidates.
outPath='casmi2017/candidates'
os.makedirs(outPath, exist_ok=True)

parameter = pd.read_csv("casmi2017/data/casmi2017/summary-046-243.csv", sep='\t')
solutions = pd.read_csv("casmi2017/data/CASMI-solutions.csv", sep=',')

#loop over all challenges
for index, row in parameter.iterrows():
    challenge = row['challengename']
    print("Processing candidates for challenge: "+challenge)
    candidates = pd.read_csv("casmi2017/data/candidates/"+challenge+".txt",
                             sep="\t", header=None, names=['SMILES'], index_col=False)
    print(str(len(candidates.index))+" candidates found ...")

    #calculate neutral precursor mass from solution
    solution = solutions[solutions['Challenge'] == challenge]
    if solution.empty:
        # fail with a message naming the challenge instead of a bare IndexError
        raise ValueError("No solution found for "+challenge)
    mol = rdkit.Chem.MolFromInchi(solution['InChI'].iloc[0])
    if mol is None:
        # MolFromInchi returns None when the InChI cannot be parsed
        raise ValueError("Could not parse the solution InChI of "+challenge)
    exactMass = rdkit.Chem.rdMolDescriptors.CalcExactMolWt(mol)
    exactMassLow = exactMass*0.999995
    exactMassHigh = exactMass*1.000005

    #parse the SMILES; Series.map keeps working when the frame is empty, whereas
    #a row-wise DataFrame.apply would return a DataFrame instead of a Series
    candidates['mol'] = candidates['SMILES'].map(rdkit.Chem.MolFromSmiles)
    #unparsable SMILES yield None and are dropped here
    candidates = candidates.dropna()

    #filter candidates with 5ppm (both bounds inclusive)
    candidates['MonoisotopicMass'] = candidates['mol'].map(rdkit.Chem.rdMolDescriptors.CalcExactMolWt)
    inMassWindow = candidates['MonoisotopicMass'].between(exactMassLow, exactMassHigh)
    #explicit copy so the new columns below are not written into a slice
    candidates = candidates.loc[inMassWindow].copy()
    print(str(len(candidates.index))+" candidates filtered ...")

    candidates['InChI'] = candidates['mol'].map(rdkit.Chem.MolToInchi)
    candidates['InChIKey'] = candidates['mol'].map(rdkit.Chem.MolToInchiKey)
    candidates['MolecularFormula'] = candidates['mol'].map(rdkit.Chem.rdMolDescriptors.CalcMolFormula)
    candidates = candidates.dropna()
    print(str(len(candidates.index))+" candidates successfully processed ...")

    #"Identifier","CompoundName","MonoisotopicMass","MolecularFormula","SMILES","InChI","InChIKey"
    #the SMILES string doubles as identifier and compound name;
    #MolecularFormula was computed above but previously left out of the file
    candidatesToWrite = pd.DataFrame({
        'Identifier': candidates['SMILES'],
        'CompoundName': candidates['SMILES'],
        'MonoisotopicMass': candidates['MonoisotopicMass'],
        'MolecularFormula': candidates['MolecularFormula'],
        'SMILES': candidates['SMILES'],
        'InChI': candidates['InChI'],
        'InChIKey': candidates['InChIKey'],
    })
    candidatesToWrite.to_csv(os.path.join(outPath, challenge+".csv"), index=False,
                             quoting=csv.QUOTE_NONNUMERIC)
    

Processing candidates for challenge: challenge-046
1925 candidates found ...
377 candidates filtered ...
377 candidates successfully processed ...
Processing candidates for challenge: challenge-047
516 candidates found ...
119 candidates filtered ...
119 candidates successfully processed ...
Processing candidates for challenge: challenge-048
10107 candidates found ...
2891 candidates filtered ...
2891 candidates successfully processed ...
Processing candidates for challenge: challenge-049
5399 candidates found ...
2368 candidates filtered ...
2368 candidates successfully processed ...
Processing candidates for challenge: challenge-050
11201 candidates found ...
3252 candidates filtered ...
3252 candidates successfully processed ...
Processing candidates for challenge: challenge-051
162 candidates found ...
35 candidates filtered ...
35 candidates successfully processed ...
Processing candidates for challenge: challenge-052
6267 candidates found ...
2698 candidates filtered ...
2698 can

* check if all solutions are in the candidate lists

In [5]:
%%bash
# For every parameter file, read the InChIKey block stored in its
# "# InChIKey = ..." comment line and report the challenge if that block does
# not occur in the matching candidate CSV.
for x in casmi2017/parameters/challenge-* ; do
    filename=$(basename "$x")
    inchikey=$(grep "InChIKey =" "$x" | awk '{print $4}')
    if [ -z "$inchikey" ]; then
        # without this guard an empty pattern would match every candidate file
        echo "${x%.txt}: no InChIKey found in parameter file"
        continue
    fi
    # -F: match the key literally; --: never parse the key as an option
    if ! grep -qF -- "$inchikey" "casmi2017/candidates/${filename%txt}csv"; then
        echo "${x%.txt}: solution NOT in candidates"
    fi
done

* clean up data

In [6]:
%%bash
# Remove the download/extraction working directory; the peak lists, parameter
# files and candidate lists were written to sibling directories of it.
data_dir=casmi2017/data
rm -rf "$data_dir"