Apply LDA to classify spectral embeddings:
    
    Tools: scikit-learn 0.24.1


##  LDA

In [1]:
import os
from matplotlib import pyplot as plt
import pickle
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt 
from collections import Counter
import pandas as pd

## Define functions

In [2]:
def define_data(X_embedding, y_label_list, min_group_size=1, remove_unclassified = True):
    """Keep only the spectra whose label belongs to a sufficiently large group.

    Parameters
    ----------
    X_embedding : 2-D array, indexed as ``X_embedding[rows, :]``; row ``i`` is
        paired with ``y_label_list[i]``.
    y_label_list : sequence of labels, one per row of ``X_embedding``.
    min_group_size : a label is kept only if it occurs at least this often.
    remove_unclassified : when True, the 'Unclassified' label is dropped even
        if its group is large enough.

    Returns
    -------
    (X_embedding_2, y_label_2) : the selected rows and their labels, in the
        original row order.
    """
    y_label = np.array(y_label_list)

    # Labels whose group is large enough to keep.
    group_sizes = Counter(y_label)
    kept_labels = {name for name, size in group_sizes.items() if size >= min_group_size}
    if remove_unclassified == True:
        kept_labels.discard('Unclassified')
    print(f'{len(kept_labels)} groups were selected.')

    # Ascending row positions whose label survived the filter.
    keep_mask = np.array([name in kept_labels for name in y_label], dtype=bool)
    row_positions = list(np.flatnonzero(keep_mask))

    y_label_2 = np.array(y_label[row_positions])
    print(y_label_2.shape)
    X_embedding_2 = X_embedding[row_positions, :]
    print(X_embedding_2.shape)
    print(f'Selecting {len(row_positions)} of {len(y_label_list)} spectra.')
    return X_embedding_2, y_label_2


def perform_lda(X_embedding, y_label, testing_size = 0.25, stratification=True):
    """Train an LDA classifier on a train/test split and report its test accuracy.

    The data is split with a fixed ``random_state`` of 0, standardised with a
    scaler fitted on the training part only, and classified with
    ``LinearDiscriminantAnalysis``. The fit time and the test accuracy are
    printed, followed by the confusion matrix.

    Parameters
    ----------
    X_embedding : feature matrix, one row per sample.
    y_label : labels, one per row of ``X_embedding``.
    testing_size : passed to ``train_test_split`` as ``test_size``.
    stratification : when True, the split is stratified on ``y_label``.

    Returns
    -------
    (conf_mat, y_test, y_train, y_pred, labels) where ``labels`` lists the
    unique values found in ``y_test`` and ``y_pred`` and fixes the row/column
    order of ``conf_mat``.
    """
    # stratify=None is train_test_split's default, i.e. an unstratified split.
    stratify_on = y_label if stratification == True else None
    X_train, X_test, y_train, y_test = train_test_split(
        X_embedding, y_label, test_size=testing_size, random_state=0, stratify=stratify_on)

    # Standardise features; the scaler only ever sees the training rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Fit LDA. Note that process_time() reports CPU time, not wall-clock time.
    fit_start = time.process_time()
    lda_model = LinearDiscriminantAnalysis()
    lda_model.fit(X_train, y_train)
    fit_seconds = time.process_time() - fit_start
    print('Time elapsed: %.3f' % fit_seconds)

    # Score the held-out part.
    y_pred = lda_model.predict(X_test)
    accuracy_lda = accuracy_score(y_test, y_pred)
    print('Accuracy: %.3f' % accuracy_lda)

    # Every label seen in either the truth or the predictions, sorted by np.unique.
    labels = list(np.unique(list(y_test) + list(y_pred)))
    conf_mat = confusion_matrix(y_test, y_pred, labels=labels)
    print(conf_mat)
    return conf_mat, y_test, y_train, y_pred, labels


def plot_confusion_matrix(confusion_mat, labels, save_path, heatmap=False):
    """Wrap a confusion matrix in a labelled DataFrame, pickle it, optionally plot it.

    Parameters
    ----------
    confusion_mat : square 2-D array of counts.
    labels : sequence of labels; ``labels[i]`` names row ``i`` and column ``i``
        of ``confusion_mat`` (the order used when the matrix was computed).
    save_path : path prefix; the DataFrame is written to ``save_path + '.pkl'``
        and, when ``heatmap`` is true, the figure to ``save_path + '.svg'``.
    heatmap : draw and save a seaborn heatmap. Off by default because the plot
        becomes very large when there are many labels.

    Returns
    -------
    pandas.DataFrame with ``labels`` as both index and columns.
    """
    # Use the labels in the order given. Re-sorting them (np.unique) would
    # attach the wrong names to rows/columns whenever the caller's order is
    # not already sorted.
    axis_labels = list(labels)

    # Figure width grows with the number of labels once there are 10 or more.
    fig_width = len(axis_labels)//5 + 5 if len(axis_labels) >= 10 else 5

    df_cm = pd.DataFrame(confusion_mat, columns=axis_labels, index=axis_labels)
    df_cm.to_pickle(save_path+'.pkl')
    if heatmap:
        plt.figure(figsize=(fig_width, fig_width*4//5))
        ax = plt.subplot()
        df_cm.index.name = 'Actual labels'
        df_cm.columns.name = 'Predicted labels'
        sns.set(font_scale=1.8)
        sns.heatmap(df_cm, annot=True, ax=ax, cmap="YlGnBu", fmt='d', annot_kws={"size": 10})
        plt.savefig(save_path+'.svg', dpi=300, bbox_inches='tight')
    return df_cm
  
    
def calculate_accuracy_per_label(label, conf_matrix, y_train, save_path):
    """Compute per-label statistics from a confusion matrix and pickle them.

    'Unclassified' is skipped. The caller's ``label`` sequence is left
    untouched; the returned dict carries its own filtered list.

    Parameters
    ----------
    label : sequence of labels; ``label[i]`` names row/column ``i`` of
        ``conf_matrix``.
    conf_matrix : square 2-D array of counts, indexed ``[row, column]``.
    y_train : training labels, used only to count the training size per label.
    save_path : file the resulting dict is pickled to.

    Returns
    -------
    dict with parallel lists (one entry per kept label):
        'labels'               kept labels
        'accuracy'             diagonal count / column sum (0 if the column sum is 0)
        'predict_vs_test_size' column sum / row sum (column sum + 1 if the row sum is 0)
        'group_size'           row sum + training count
        'group_size_pred'      column sum
        'group_size_test'      row sum
        'group_size_train'     training count
    """
    conf_matrix = np.asarray(conf_matrix)
    # Sum once instead of once per label.
    column_totals = conf_matrix.sum(axis=0)
    row_totals = conf_matrix.sum(axis=1)
    train_counts = Counter(y_train)  # a missing label counts as 0

    kept_labels = []
    accuracy = []
    predict_vs_test_size = []
    group_size = []
    group_size_pred = []
    group_size_test = []
    group_size_train = []

    for i, name in enumerate(label):
        if name == 'Unclassified':
            continue
        n_pred = column_totals[i]
        n_test = row_totals[i]
        n_train = train_counts[name]

        kept_labels.append(name)
        group_size_pred.append(n_pred)
        group_size_test.append(n_test)
        group_size_train.append(n_train)
        group_size.append(n_test + n_train)

        # Guard both ratios against an empty column / row.
        accuracy.append(0 if n_pred == 0 else conf_matrix[i, i] / n_pred)
        predict_vs_test_size.append(n_pred + 1 if n_test == 0 else n_pred / n_test)

    if len(kept_labels) != len(label):
        print('Removing Unclassified groups')

    accuracy_dict = {'labels': kept_labels, 'accuracy': accuracy,
                     'predict_vs_test_size': predict_vs_test_size, 'group_size': group_size,
                     'group_size_pred': group_size_pred, 'group_size_test': group_size_test,
                     'group_size_train': group_size_train}
    with open(save_path, 'wb') as f:
        pickle.dump(accuracy_dict, f)
    return accuracy_dict


def scatter_plot_accuracy_per_label(accuracy_dict,show_annot_min_x, x_label, datatype, save_dir):
    """Scatter per-label accuracy against another per-label quantity.

    Parameters
    ----------
    accuracy_dict : dict with an 'accuracy' list, a 'labels' list and the list
        stored under ``x_label``; the lists are read position by position.
    show_annot_min_x : points whose x value is at least this are annotated
        with their label.
    x_label : key of ``accuracy_dict`` plotted on the x axis; also the axis title.
    datatype : text inserted into the plot title.
    save_dir : path the figure is saved to (passed to ``plt.savefig``).
    """
    x_values = accuracy_dict[x_label]
    accuracies = accuracy_dict['accuracy']

    # Sets the default figure size globally (rcParams), then draws.
    plt.rcParams.update({'figure.figsize':(10,7)})
    plt.scatter(x_values, accuracies)

    plt.title('Accuracy of predicted results vs. %s in %s' % (x_label, datatype))
    plt.xlabel(x_label)
    plt.ylabel('Accuracy per group')

    # Name only the points far enough along the x axis.
    for pos, name in enumerate(accuracy_dict['labels']):
        if x_values[pos] >= show_annot_min_x:
            plt.annotate(name, (x_values[pos], accuracies[pos]), fontsize=10)

    plt.savefig(save_dir, dpi=100, bbox_inches='tight')
    plt.show()
    

    

In [3]:
def define_data_give_indices(X_embedding, y_label_list, min_group_size=1, remove_unclassified = True):
    """Filter spectra by label group size and also report which rows were kept.

    Same selection as ``define_data``: a label is kept when it occurs at least
    ``min_group_size`` times, and 'Unclassified' is dropped when
    ``remove_unclassified`` is True.

    Returns
    -------
    (X_embedding_2, y_label_2, select_index) : the selected rows, their labels,
        and the ascending list of positions of those rows in the original input.
    """
    y_label = np.array(y_label_list)

    # Which labels have enough members?
    occurrences = Counter(y_label)
    select_lbs = [lb for lb, n_members in occurrences.items() if n_members >= min_group_size]
    if remove_unclassified == True and 'Unclassified' in select_lbs:
        select_lbs.remove('Unclassified')
    print(f'{len(select_lbs)} groups were selected.')

    # Positions of every spectrum carrying one of the kept labels, ascending.
    wanted = set(select_lbs)
    is_wanted = np.array([lb in wanted for lb in y_label], dtype=bool)
    select_index = list(np.flatnonzero(is_wanted))

    y_label_2 = np.array(y_label[select_index])
    print(y_label_2.shape)
    X_embedding_2 = X_embedding[select_index, :]
    print(X_embedding_2.shape)
    print(f'Selecting {len(select_index)} of {len(y_label_list)} spectra.')
    return X_embedding_2, y_label_2, select_index

## Loading array from files

In [4]:
## Loading embedding arrays
# Read the comma-separated embedding matrix into a numpy array and show its shape.
# NOTE(review): presumably one row per spectrum, aligned with the label lists loaded later — confirm.
with open('/mnt/scratch/ding013/MS2ChemClass/embedding_arrays_for_classification/X_spectral_embeddings.csv') as file_name:
    X_embedding = np.loadtxt(file_name, delimiter=",")
print(X_embedding.shape)

(16360, 300)


## LDA for ClassyFire all data

### Class level in ClassyFire

In [182]:
# Find the indices of test and training sets
data_type='cf_class_lda'
with open('/mnt/scratch/ding013/MS2ChemClass/embedding_arrays_for_classification/y_cf_class_labels.pickle', 'rb') as file_name:
    y_label_list = pickle.load(file_name)
y_label = np.array(y_label_list)
print(y_label.shape)
# Keep groups with at least 4 members; select_index maps the filtered rows back to the full data.
X_embedding_2, y_label_2,select_index = define_data_give_indices(X_embedding, y_label_list, min_group_size=4, remove_unclassified = True)
# Positions into the filtered arrays. Derived from the data so the cell keeps
# working if the number of selected spectra changes (it was hard-coded to 15034).
indices = range(len(y_label_2))
# Splitting `indices` alongside the data records which filtered rows end up in the test set.
X_train, X_test, y_train, y_test,indices_train,indices_test = train_test_split(X_embedding_2, y_label_2,indices, test_size = 0.25, random_state = 0, stratify=y_label_2)
# perform_lda splits again internally with the same test_size, random_state and
# stratification, so its y_test is expected to line up with indices_test above.
conf_mat,y_test,y_train,y_pred,labels= perform_lda(X_embedding_2, y_label_2, testing_size = 0.25, stratification=True)


(16360,)
183 groups were selected.
(15034,)
(15034, 300)
Selecting 15034 of 16360 spectra.
Time elapsed: 8.989
Accuracy: 0.351
[[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 2]]


In [183]:
# Load the per-spectrum mass list and identifier list from pickle files.
# NOTE(review): presumably both are aligned with the rows of X_embedding — confirm.
with open('/mnt/scratch/ding013/MS2ChemClass/embedding_arrays_for_classification/mass_da_ui.pickle', 'rb') as file_name:
    mass_list = pickle.load(file_name)
with open('/mnt/scratch/ding013/MS2ChemClass/embedding_arrays_for_classification/inchi_list_ui.pickle', 'rb') as file_name:
    inchi_list = pickle.load(file_name)

In [184]:
# Step 1: keep the entries of the spectra that survived the group-size filter
# (select_index holds positions into the full, unfiltered lists).
mass_selected = [mass_list[i] for i in select_index]
inchi_selected = [inchi_list[i] for i in select_index]
# Step 2: of those, keep the test-set entries (indices_test holds positions
# into the filtered data, hence the *_selected lists are indexed here).
mass_test = [mass_selected[i] for i in indices_test]
inchi_test = [inchi_selected[i] for i in indices_test]

In [185]:
# glycerolipids
# Positions in the test set whose true label is "Glycerolipids".
glycero_indices = [i for i, x in enumerate(y_test) if x == "Glycerolipids"]
mass_selected_glycero_test = [mass_test[i] for i in glycero_indices]
inchi_selected_glycero_test= [inchi_test[i] for i in glycero_indices]
# Save the identifiers and masses of these test spectra (two separate pickle files)
with open('/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/Glycerolipids_test_inchikey.pickle','wb') as f:
    pickle.dump(inchi_selected_glycero_test,f)
with open('/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/Glycerolipids_test_mass.pickle','wb') as f:
    pickle.dump(mass_selected_glycero_test,f)

In [186]:
import glob

In [187]:
# get the list of canopus results path
# A result directory is looked up under the .mgf file's path minus its extension.
txtfiles = []
for file in glob.glob('/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/glycerolipids/*.mgf'):
    # splitext removes only the final extension, so a dot elsewhere in the
    # path cannot truncate the directory name.
    txtfiles.append(os.path.splitext(file)[0])

# collect predicted results from canopus results
# Take the 'class' value of the first row of each canopus_summary.tsv;
# spectra without a summary file are skipped.
pred_com_list=[]
for i in txtfiles:
    summary_path = f'{i}/canopus_summary.tsv'
    if not os.path.isfile(summary_path):
        continue
    with open(summary_path, 'r') as file_name:
        cano_out = pd.read_csv(file_name, delimiter="\t")
    try:
        pred_com = cano_out['class'].iloc[0]
    except (KeyError, IndexError):
        # No 'class' column or no rows: nothing to record for this spectrum.
        continue
    print(pred_com)
    pred_com_list.append(pred_com)

print(len(pred_com_list),pred_com_list)

Glycerolipids
Glycerolipids
Sphingolipids
Sphingolipids
Glycerolipids
Glycerolipids
Glycerolipids
Glycerolipids
Glycerolipids
Glycerolipids
Glycerolipids
11 ['Glycerolipids', 'Glycerolipids', 'Sphingolipids', 'Sphingolipids', 'Glycerolipids', 'Glycerolipids', 'Glycerolipids', 'Glycerolipids', 'Glycerolipids', 'Glycerolipids', 'Glycerolipids']


In [188]:
# Other compound
# `com` is both the label matched against y_test and the prefix of the output file names.
com = 'Peptidomimetics'
# Positions in the test set whose true label equals `com`.
compound_indices = [i for i, x in enumerate(y_test) if x == com]
mass_selected_compound_test = [mass_test[i] for i in compound_indices]
inchi_selected_compound_test= [inchi_test[i] for i in compound_indices]
# Save the identifiers and masses of these test spectra (two separate pickle files)
with open(f'/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/{com}_test_inchikey.pickle','wb') as f:
    pickle.dump(inchi_selected_compound_test,f)
with open(f'/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/{com}_test_mass.pickle','wb') as f:
    pickle.dump(mass_selected_compound_test,f)
print(len(inchi_selected_compound_test))
#print(inchi_selected_compound_test)
# print(mass_selected_compound_test)

122


In [189]:
# get the list of canopus results path
# A result directory is looked up under the .mgf file's path minus its extension.
txtfiles = []
for file in glob.glob('/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/Peptidomimetics/*.mgf'):
    # splitext removes only the final extension, so a dot elsewhere in the
    # path cannot truncate the directory name.
    txtfiles.append(os.path.splitext(file)[0])

# collect predicted results from canopus results
# Take the 'class' value of the first row of each canopus_summary.tsv;
# spectra without a summary file are skipped.
pred_com_list=[]
for i in txtfiles:
    summary_path = f'{i}/canopus_summary.tsv'
    if not os.path.isfile(summary_path):
        continue
    with open(summary_path, 'r') as file_name:
        cano_out = pd.read_csv(file_name, delimiter="\t")
    try:
        pred_com = cano_out['class'].iloc[0]
    except (KeyError, IndexError):
        # No 'class' column or no rows: nothing to record for this spectrum.
        continue
    pred_com_list.append(pred_com)

print(len(pred_com_list),pred_com_list)

44 ['Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Carboxylic acids and derivatives', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Carboxylic acids and derivatives', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Carboxylic acids and derivatives', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics', 'Peptidomimetics']


In [190]:
# Read a single tab-separated CANOPUS summary table into a DataFrame.
with open('/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/peptidomimetics_canopus_out/canopus_summary.tsv', 'r') as file_name:
    cano_out = pd.read_csv(file_name, delimiter="\t")


In [191]:
# Other compound
# `com` is both the label matched against y_test and the prefix of the output file names.
com = 'Tetracyclines'
# Positions in the test set whose true label equals `com`.
compound_indices = [i for i, x in enumerate(y_test) if x == com]
mass_selected_compound_test = [mass_test[i] for i in compound_indices]
inchi_selected_compound_test= [inchi_test[i] for i in compound_indices]
# Save the identifiers and masses of these test spectra (two separate pickle files)
with open(f'/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/{com}_test_inchikey.pickle','wb') as f:
    pickle.dump(inchi_selected_compound_test,f)
with open(f'/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/{com}_test_mass.pickle','wb') as f:
    pickle.dump(mass_selected_compound_test,f)
print(len(inchi_selected_compound_test))
print(inchi_selected_compound_test)
print(mass_selected_compound_test)

8
['CCMSLIB00000085443', 'CCMSLIB00000077057', 'CCMSLIB00000085161', 'CCMSLIB00000078515', 'CCMSLIB00000086191', 'CCMSLIB00000077091', 'CCMSLIB00005723226', 'CCMSLIB00005734807']
[444.156724, 464.101724, 457.187724, 444.153724, 444.15772400000003, 444.156724, 442.138724, 457.184724]


In [192]:
# Other compound
# `com` (underscore spelling) is used only in the output file names; the label
# matched against y_test is the space-separated 'Prenol lipids'.
com = 'Prenol_lipids'
compound_indices = [i for i, x in enumerate(y_test) if x == 'Prenol lipids']
mass_selected_compound_test = [mass_test[i] for i in compound_indices]
inchi_selected_compound_test= [inchi_test[i] for i in compound_indices]
# Save the identifiers and masses of these test spectra (two separate pickle files)
with open(f'/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/{com}_test_inchikey.pickle','wb') as f:
    pickle.dump(inchi_selected_compound_test,f)
with open(f'/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/{com}_test_mass.pickle','wb') as f:
    pickle.dump(mass_selected_compound_test,f)
print(len(inchi_selected_compound_test))
# print(inchi_selected_compound_test)
# print(mass_selected_compound_test)

459


In [193]:
# Other compound
# `com` (underscore spelling) is used only in the output file names; the label
# matched against y_test is the space-separated 'Organooxygen compounds'.
com = 'Organooxygen_compounds'
compound_indices = [i for i, x in enumerate(y_test) if x == 'Organooxygen compounds']
mass_selected_compound_test = [mass_test[i] for i in compound_indices]
inchi_selected_compound_test= [inchi_test[i] for i in compound_indices]
# Save the identifiers and masses of these test spectra (two separate pickle files)
with open(f'/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/{com}_test_inchikey.pickle','wb') as f:
    pickle.dump(inchi_selected_compound_test,f)
with open(f'/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/{com}_test_mass.pickle','wb') as f:
    pickle.dump(mass_selected_compound_test,f)
print(len(inchi_selected_compound_test))
#print(inchi_selected_compound_test)
# Show only the first mass as a quick sanity check.
print(mass_selected_compound_test[0])

228
458.278724


In [194]:
# get the list of canopus results path
# A result directory is looked up under the .mgf file's path minus its extension.
txtfiles = []
for file in glob.glob('/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/Organooxygen_compounds/*.mgf'):
    # splitext removes only the final extension, so a dot elsewhere in the
    # path cannot truncate the directory name.
    txtfiles.append(os.path.splitext(file)[0])

# collect predicted results from canopus results
# Take the 'class' value of the first row of each canopus_summary.tsv;
# spectra without a summary file are skipped.
pred_com_list=[]
for i in txtfiles:
    summary_path = f'{i}/canopus_summary.tsv'
    if not os.path.isfile(summary_path):
        continue
    with open(summary_path, 'r') as file_name:
        cano_out = pd.read_csv(file_name, delimiter="\t")
    try:
        pred_com = cano_out['class'].iloc[0]
    except (KeyError, IndexError):
        # No 'class' column or no rows: nothing to record for this spectrum.
        continue
    pred_com_list.append(pred_com)

print(len(pred_com_list),pred_com_list[0])

90 Organooxygen compounds


In [195]:
# CANOPUS: how many collected predictions are NOT "Organooxygen compounds".
# NOTE(review): if every collected spectrum truly belongs to that class, these
# are misses (false negatives) rather than false positives — confirm.
fn = sum(1 for predicted in pred_com_list if predicted != "Organooxygen compounds")
print(fn)
print(len(pred_com_list))

19
90


In [196]:
# MS2ChemClass (LDA): predictions for the test spectra selected by
# compound_indices, and how many of them are NOT "Organooxygen compounds".
# NOTE(review): compound_indices selects by true label, so these look like
# misses (false negatives) rather than false positives — confirm.
com_pred = list(y_pred[compound_indices])
fn = sum(1 for predicted in com_pred if predicted != "Organooxygen compounds")
print(fn)
print(len(com_pred))

181
228


## Save the selected spectra as MGF files for CANOPUS

In [197]:
from matchms.exporting import save_as_mgf
from pathlib import Path
import os
import gensim
import pickle
import sys
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from collections import OrderedDict, Counter

In [199]:
# Locations of the processed spectra, the spec2vec model and the output folder.
data_path = "/mnt/LTR_userdata/hooft001/mass_spectral_embeddings/datasets/ALL_GNPS_210409_positive/"
base = "ALL_GNPS_210409_positive_cleaned"
spectra_data = "_peaks_processed_s2v_only_annotated.pickle"

embedding_path = "/mnt/LTR_userdata/hooft001/mass_spectral_embeddings/embeddings/ALL_GNPS_210409_positive/ALL_GNPS_210409_positive_cleaned_spec2vec_embedding_iter_15.model"
save_path = "/mnt/scratch/ding013/MS2ChemClass/hooft_data/embedding_visualization_out/"

# The sanity checks in this cell are printed: as bare expressions in the
# middle of a cell their results were computed and then thrown away.
spectrum_file = os.path.join(data_path, base+".pickle")
print('Spectrum file exists:', os.path.exists(spectrum_file))

processed_spectrums_file = os.path.join(data_path, base+spectra_data)
with open(processed_spectrums_file, 'rb') as inf:
    spectrums_processed = pickle.load(inf)

print('Processed spectra:', len(spectrums_processed))

# TEST DATA
# spectrums_processed = spectrums_processed[1:100]
# len(spectrums_processed)


# ### Keep annotated spectra
# Annotated with inchikey: keep spectra whose metadata has a non-empty "inchikey".
annot_spectrums_processed = [
    spec for spec in spectrums_processed if spec.metadata.get("inchikey")
]
print('Annotated spectra:', len(annot_spectrums_processed))

# spectrum_id of every annotated spectrum; position i matches annot_spectrums_processed[i].
spectrums_ids = [s.metadata.get("spectrum_id") for s in annot_spectrums_processed]

print('Any annotated spectrum without spectrum_id:', any(s is None for s in spectrums_ids))

print(spectrums_processed[1])


<matchms.Spectrum.Spectrum object at 0x7fcb63b93d60>


In [200]:
inchi_selected_glycero_test=['CCMSLIB00000563345', 'CCMSLIB00005724151', 'CCMSLIB00003129211', 'CCMSLIB00003093478', 'CCMSLIB00006109919', 'CCMSLIB00003100705', 'CCMSLIB00003088299', 'CCMSLIB00003106655', 'CCMSLIB00003093986', 'CCMSLIB00003092599', 'CCMSLIB00003093304', 'CCMSLIB00003093815', 'CCMSLIB00000849055', 'CCMSLIB00003093912', 'CCMSLIB00003092296', 'CCMSLIB00003094220', 'CCMSLIB00003091642', 'CCMSLIB00000563325', 'CCMSLIB00003094925', 'CCMSLIB00005723381', 'CCMSLIB00000563319', 'CCMSLIB00003128263', 'CCMSLIB00003139659', 'CCMSLIB00003138839', 'CCMSLIB00000563317', 'CCMSLIB00003087863', 'CCMSLIB00003109606', 'CCMSLIB00003095340', 'CCMSLIB00003119228', 'CCMSLIB00003088561', 'CCMSLIB00000563312', 'CCMSLIB00005724131', 'CCMSLIB00000563321', 'CCMSLIB00003129207', 'CCMSLIB00005724150', 'CCMSLIB00000563346', 'CCMSLIB00003094106', 'CCMSLIB00003100671', 'CCMSLIB00005436305', 'CCMSLIB00003090393', 'CCMSLIB00005724144', 'CCMSLIB00000563358', 'CCMSLIB00003110521', 'CCMSLIB00003096235', 'CCMSLIB00005724130']
# Write one .mgf file per identifier. The entries are matched against
# spectrum_id, whatever the "inchi" in the variable name suggests.
glycero_dir = '/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/glycerolipids'
# Create the folder up front, as the other export cells do.
Path(glycero_dir).mkdir(parents=True, exist_ok=True)
for s in inchi_selected_glycero_test:
    # spectrums_ids was built from annot_spectrums_processed, so pair the two
    # directly; indexing spectrums_processed by these positions would pick the
    # wrong spectra whenever an un-annotated spectrum had been filtered out.
    selected_spec=[spec for spec, spec_id in zip(annot_spectrums_processed, spectrums_ids) if spec_id==s]
    save_as_mgf(selected_spec,f'{glycero_dir}/{s}test_inchikey.mgf')


In [201]:
inchi_test_compound=['CCMSLIB00005436092', 'CCMSLIB00000577703', 'CCMSLIB00005723403', 'CCMSLIB00000001638', 'CCMSLIB00000577749', 'CCMSLIB00000086129', 'CCMSLIB00000001650', 'CCMSLIB00000577808', 'CCMSLIB00000577842', 'CCMSLIB00000854741', 'CCMSLIB00000001790', 'CCMSLIB00003742128', 'CCMSLIB00000577624', 'CCMSLIB00000577571', 'CCMSLIB00000846978', 'CCMSLIB00000577756', 'CCMSLIB00000577747', 'CCMSLIB00005723406', 'CCMSLIB00000071755', 'CCMSLIB00000577587', 'CCMSLIB00000478654', 'CCMSLIB00005724028', 'CCMSLIB00000001815', 'CCMSLIB00000075068', 'CCMSLIB00005720239', 'CCMSLIB00000001642', 'CCMSLIB00000577819', 'CCMSLIB00000853980', 'CCMSLIB00000577834', 'CCMSLIB00000853336', 'CCMSLIB00000848828', 'CCMSLIB00000577671', 'CCMSLIB00001059667', 'CCMSLIB00004679304', 'CCMSLIB00000577662', 'CCMSLIB00000001608', 'CCMSLIB00000577508', 'CCMSLIB00000577789', 'CCMSLIB00000079981', 'CCMSLIB00000577639', 'CCMSLIB00000577503', 'CCMSLIB00000424841', 'CCMSLIB00000080590', 'CCMSLIB00000085222', 'CCMSLIB00000577642', 'CCMSLIB00000839208', 'CCMSLIB00000001792', 'CCMSLIB00000577577', 'CCMSLIB00004722213', 'CCMSLIB00000071756', 'CCMSLIB00001059678', 'CCMSLIB00000079862', 'CCMSLIB00000577863', 'CCMSLIB00000081215', 'CCMSLIB00000577581', 'CCMSLIB00000578044', 'CCMSLIB00000577682', 'CCMSLIB00005885157', 'CCMSLIB00005463473', 'CCMSLIB00000080545', 'CCMSLIB00000577678', 'CCMSLIB00000079888', 'CCMSLIB00005435752', 'CCMSLIB00000478102', 'CCMSLIB00005436483', 'CCMSLIB00000577751', 'CCMSLIB00000853573', 'CCMSLIB00000577858', 'CCMSLIB00000577831', 'CCMSLIB00005463474', 'CCMSLIB00000577586', 'CCMSLIB00000001601', 'CCMSLIB00005723394', 'CCMSLIB00000079782', 'CCMSLIB00005435723', 'CCMSLIB00000577588', 'CCMSLIB00005691876', 'CCMSLIB00000577657', 'CCMSLIB00000577573', 'CCMSLIB00004679300', 'CCMSLIB00000577598', 'CCMSLIB00000079361', 'CCMSLIB00000577783', 'CCMSLIB00005720211', 'CCMSLIB00000577520', 'CCMSLIB00004679299', 'CCMSLIB00000577569', 'CCMSLIB00005723386', 'CCMSLIB00000577764', 
'CCMSLIB00000577585', 'CCMSLIB00000848328', 'CCMSLIB00000080038', 'CCMSLIB00000577777', 'CCMSLIB00000079837', 'CCMSLIB00000577840', 'CCMSLIB00005724299', 'CCMSLIB00005435658', 'CCMSLIB00000577628', 'CCMSLIB00000577651', 'CCMSLIB00000577617', 'CCMSLIB00000566712', 'CCMSLIB00000577514', 'CCMSLIB00000079661', 'CCMSLIB00000577645', 'CCMSLIB00000577635', 'CCMSLIB00000577688', 'CCMSLIB00000577512', 'CCMSLIB00000001683', 'CCMSLIB00001059628', 'CCMSLIB00000577823', 'CCMSLIB00000854691', 'CCMSLIB00000080533', 'CCMSLIB00000577661', 'CCMSLIB00000577687', 'CCMSLIB00000577731', 'CCMSLIB00000847844', 'CCMSLIB00000079728', 'CCMSLIB00000577794', 'CCMSLIB00000577716', 'CCMSLIB00000223994', 'CCMSLIB00004722106', 'CCMSLIB00000221414']
# Write one .mgf file per identifier into the Peptidomimetics folder
# (the folder is created if needed).
path1='/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/Peptidomimetics'
Path(path1).mkdir(parents=True, exist_ok=True)

for s in inchi_test_compound:
    # spectrums_ids was built from annot_spectrums_processed, so pair the two
    # directly; indexing spectrums_processed by these positions would pick the
    # wrong spectra whenever an un-annotated spectrum had been filtered out.
    selected_spec=[spec for spec, spec_id in zip(annot_spectrums_processed, spectrums_ids) if spec_id==s]
    save_as_mgf(selected_spec, f'{path1}/{s}test_inchikey.mgf')

In [202]:
inchi_test_compound=['CCMSLIB00003135802', 'CCMSLIB00000846231', 'CCMSLIB00000578009', 'CCMSLIB00000848312', 'CCMSLIB00005738684', 'CCMSLIB00000847793', 'CCMSLIB00005883997', 'CCMSLIB00000846633', 'CCMSLIB00000079034', 'CCMSLIB00006105372', 'CCMSLIB00005978342', 'CCMSLIB00004685140', 'CCMSLIB00005738463', 'CCMSLIB00004692117', 'CCMSLIB00005765551', 'CCMSLIB00006089151', 'CCMSLIB00000579640', 'CCMSLIB00000848082', 'CCMSLIB00000578092', 'CCMSLIB00000085753', 'CCMSLIB00000424801', 'CCMSLIB00005738496', 'CCMSLIB00000425810', 'CCMSLIB00005975346', 'CCMSLIB00000849988', 'CCMSLIB00000845020', 'CCMSLIB00003135730', 'CCMSLIB00000214796', 'CCMSLIB00000072523', 'CCMSLIB00000846122', 'CCMSLIB00005954846', 'CCMSLIB00004690281', 'CCMSLIB00001058649', 'CCMSLIB00005758001', 'CCMSLIB00000852432', 'CCMSLIB00000205849', 'CCMSLIB00000846802', 'CCMSLIB00006063904', 'CCMSLIB00004694816', 'CCMSLIB00005955373', 'CCMSLIB00000850395', 'CCMSLIB00003134680', 'CCMSLIB00005463617', 'CCMSLIB00000005006', 'CCMSLIB00000853966', 'CCMSLIB00004713669', 'CCMSLIB00000849505', 'CCMSLIB00000077069', 'CCMSLIB00005723253', 'CCMSLIB00000855752', 'CCMSLIB00000079091', 'CCMSLIB00000425678', 'CCMSLIB00000848091', 'CCMSLIB00000855533', 'CCMSLIB00006084383', 'CCMSLIB00000856011', 'CCMSLIB00000854639', 'CCMSLIB00005489615', 'CCMSLIB00000085602', 'CCMSLIB00000848273', 'CCMSLIB00003134937', 'CCMSLIB00006103907', 'CCMSLIB00000079166', 'CCMSLIB00000219050', 'CCMSLIB00000847399', 'CCMSLIB00000426012', 'CCMSLIB00000855412', 'CCMSLIB00005884281', 'CCMSLIB00000855499', 'CCMSLIB00000855222', 'CCMSLIB00005435627', 'CCMSLIB00000078995', 'CCMSLIB00000085326', 'CCMSLIB00000855463', 'CCMSLIB00000214603', 'CCMSLIB00000001649', 'CCMSLIB00003142425', 'CCMSLIB00000205794', 'CCMSLIB00004703046', 'CCMSLIB00000849766', 'CCMSLIB00004719768', 'CCMSLIB00005435769', 'CCMSLIB00000855898', 'CCMSLIB00000855511', 'CCMSLIB00005719846', 'CCMSLIB00005435944', 'CCMSLIB00000577890', 'CCMSLIB00005772734', 'CCMSLIB00005435939', 
'CCMSLIB00005774173', 'CCMSLIB00005720531', 'CCMSLIB00000848590', 'CCMSLIB00003142429', 'CCMSLIB00000085593', 'CCMSLIB00005719784', 'CCMSLIB00000085325', 'CCMSLIB00000848630', 'CCMSLIB00000214816', 'CCMSLIB00003135492', 'CCMSLIB00000854529', 'CCMSLIB00004679309', 'CCMSLIB00006038441', 'CCMSLIB00000214921', 'CCMSLIB00000085654', 'CCMSLIB00001059011', 'CCMSLIB00000425943', 'CCMSLIB00004694514', 'CCMSLIB00004691379', 'CCMSLIB00000079836', 'CCMSLIB00000848774', 'CCMSLIB00000218044', 'CCMSLIB00000853803', 'CCMSLIB00000847572', 'CCMSLIB00000855351', 'CCMSLIB00000579687', 'CCMSLIB00003134817', 'CCMSLIB00000426803', 'CCMSLIB00000205792', 'CCMSLIB00000853476', 'CCMSLIB00000854809', 'CCMSLIB00005877203', 'CCMSLIB00000848364', 'CCMSLIB00005463708', 'CCMSLIB00000851585', 'CCMSLIB00004709062', 'CCMSLIB00000848747', 'CCMSLIB00000854647', 'CCMSLIB00000852263', 'CCMSLIB00000852843', 'CCMSLIB00005719884', 'CCMSLIB00000851357', 'CCMSLIB00000845675', 'CCMSLIB00000425220', 'CCMSLIB00006107295', 'CCMSLIB00000425114', 'CCMSLIB00004716366', 'CCMSLIB00000574567', 'CCMSLIB00000855115', 'CCMSLIB00000084800', 'CCMSLIB00004718678', 'CCMSLIB00000848716', 'CCMSLIB00003135255', 'CCMSLIB00005435770', 'CCMSLIB00000078406', 'CCMSLIB00000839294', 'CCMSLIB00005725426', 'CCMSLIB00000214968', 'CCMSLIB00000850138', 'CCMSLIB00004711959', 'CCMSLIB00000853567', 'CCMSLIB00005966779', 'CCMSLIB00000078993', 'CCMSLIB00000215071', 'CCMSLIB00005467665', 'CCMSLIB00000853035', 'CCMSLIB00005749428', 'CCMSLIB00004719352', 'CCMSLIB00000425466', 'CCMSLIB00000214541', 'CCMSLIB00005760292', 'CCMSLIB00000219905', 'CCMSLIB00006024814', 'CCMSLIB00006105014', 'CCMSLIB00000847343', 'CCMSLIB00000854811', 'CCMSLIB00000853698', 'CCMSLIB00005721079', 'CCMSLIB00000214985', 'CCMSLIB00001058957', 'CCMSLIB00000847688', 'CCMSLIB00004702794', 'CCMSLIB00000847531', 'CCMSLIB00000079405', 'CCMSLIB00000845516', 'CCMSLIB00000217781', 'CCMSLIB00000214573', 'CCMSLIB00000853730', 'CCMSLIB00000426344', 'CCMSLIB00000848650', 
'CCMSLIB00000214952', 'CCMSLIB00003142416', 'CCMSLIB00000078572', 'CCMSLIB00005735790', 'CCMSLIB00000850752', 'CCMSLIB00000577925', 'CCMSLIB00000085517', 'CCMSLIB00004712560', 'CCMSLIB00004751444', 'CCMSLIB00005764009', 'CCMSLIB00000221394', 'CCMSLIB00000214706', 'CCMSLIB00000854514', 'CCMSLIB00005723220', 'CCMSLIB00005953875', 'CCMSLIB00000848632', 'CCMSLIB00004704522', 'CCMSLIB00000850693', 'CCMSLIB00000574569', 'CCMSLIB00000004647', 'CCMSLIB00000080277', 'CCMSLIB00000855669', 'CCMSLIB00000079412', 'CCMSLIB00000577981', 'CCMSLIB00004696188', 'CCMSLIB00003135964', 'CCMSLIB00000855480', 'CCMSLIB00006065669', 'CCMSLIB00006043723', 'CCMSLIB00003135520', 'CCMSLIB00000840420', 'CCMSLIB00006061115', 'CCMSLIB00005467663', 'CCMSLIB00000849996', 'CCMSLIB00005734034', 'CCMSLIB00000853316', 'CCMSLIB00005435626', 'CCMSLIB00005982469', 'CCMSLIB00005972202', 'CCMSLIB00000079988', 'CCMSLIB00000850399', 'CCMSLIB00005757138', 'CCMSLIB00005725152', 'CCMSLIB00000855916', 'CCMSLIB00000213672', 'CCMSLIB00005435628', 'CCMSLIB00000853697', 'CCMSLIB00000851691', 'CCMSLIB00000848429']
# Write one .mgf file per identifier into the Organooxygen_compounds folder
# (the folder is created if needed).
path1='/mnt/scratch/ding013/MS2ChemClass/test_data_for_canopus/Organooxygen_compounds'
Path(path1).mkdir(parents=True, exist_ok=True)

for s in inchi_test_compound:
    # spectrums_ids was built from annot_spectrums_processed, so pair the two
    # directly; indexing spectrums_processed by these positions would pick the
    # wrong spectra whenever an un-annotated spectrum had been filtered out.
    selected_spec=[spec for spec, spec_id in zip(annot_spectrums_processed, spectrums_ids) if spec_id==s]
    save_as_mgf(selected_spec, f'{path1}/{s}test_inchikey.mgf')
