## Alignment of peaks from several _Vitis_ cultivars

Alignment of peak lists based on m/z relative differences (below a ppm threshold). 

Requirements:

- metabolinks


In [1]:
from collections import OrderedDict
from metabolinks import align, read_aligned_spectra, read_spectra_from_xcel
from metabolinks.peak_alignment import save_aligned_to_excel
from metabolinks.similarity import mz_similarity

### Set up metadata descriptions

In [2]:
# Folder holding the input workbooks, and the value passed to the Excel reader
# as its `header_row` argument.
data_folder = 'data'
header_row = 4

# One entry per cultivar, keyed by the cultivar code:
#   filename -> workbook to read (relative to `data_folder`)
#   names    -> keyword arguments ('sample_names', 'labels') forwarded to the reader
data = {
    'CAN': dict(
        filename='CAN (14, 15, 16).xlsx',
        names=dict(sample_names=['14', '15', '16'], labels='CAN'),
    ),
    'CS': dict(
        filename='CS (29, 30, 31).xlsx',
        names=dict(sample_names=['29', '30', '31'], labels='CS'),
    ),
    'LAB': dict(
        filename='LAB (8, 9, 10).xlsx',
        names=dict(sample_names=['8', '9', '10'], labels='LAB'),
    ),
    'PN': dict(
        filename='PN (23, 24, 25).xlsx',
        names=dict(sample_names=['23', '24', '25'], labels='PN'),
    ),
    'REG': dict(
        filename='REG (38, 39, 40).xlsx',
        names=dict(sample_names=['38', '39', '40'], labels='REG'),
    ),
    'RIP': dict(
        filename='RIP (17, 18, 19).xlsx',
        names=dict(sample_names=['17', '18', '19'], labels='RIP'),
    ),
    'RL': dict(
        filename='RL (26, 27, 28).xlsx',
        names=dict(sample_names=['26', '27', '28'], labels='RL'),
    ),
    'ROT': dict(
        filename='ROT (20, 21, 22).xlsx',
        names=dict(sample_names=['20', '21', '22'], labels='ROT'),
    ),
    'RU': dict(
        filename='RU (35, 36, 37).xlsx',
        names=dict(sample_names=['35', '36', '37'], labels='RU'),
    ),
    'SYL': dict(
        filename='SYL (11, 12, 13).xlsx',
        names=dict(sample_names=['11', '12', '13'], labels='SYL'),
    ),
    'TRI': dict(
        filename='TRI (32, 33, 34).xlsx',
        names=dict(sample_names=['32', '33', '34'], labels='TRI'),
    ),
    # these are the new cultivars
    # (their sample names use '-', while the workbook file names use '_')
    'CFN': dict(
        filename='CFN (10713_1, 10713_2, 10713_3).xlsx',
        names=dict(sample_names=['10713-1', '10713-2', '10713-3'], labels='CFN'),
    ),
    'CHT': dict(
        filename='CHT (13514_1, 13514_2, 13514_3).xlsx',
        names=dict(sample_names=['13514-1', '13514-2', '13514-3'], labels='CHT'),
    ),
    'SB': dict(
        filename='SB (53211_1, 53211_2, 53211_3).xlsx',
        names=dict(sample_names=['53211-1', '53211-2', '53211-3'], labels='SB'),
    ),
}


### Read spectra from Excel files

In [3]:
# Gather the peak lists of every workbook into a single ordered mapping,
# keyed by the Excel sheet each one was read from.
all_spectra = OrderedDict()

for description in data.values():
    workbook_path = '/'.join((data_folder, description['filename']))
    sheet_spectra = read_spectra_from_xcel(workbook_path,
                                           header_row=header_row,
                                           **description['names'])
    # Sheet names are the keys, so a repeated sheet name would replace an earlier entry.
    all_spectra.update(sheet_spectra.items())

print('Peak list names (identical to Excel sheet names)')
for sheet_name in all_spectra.keys():
    print(sheet_name)

------ Reading MS-Excel file - data/CAN (14, 15, 16).xlsx
- 3 spectra found in sheet "CAN - NEGATIVO":
  581 peaks in sample 14, with label CAN
  590 peaks in sample 15, with label CAN
  515 peaks in sample 16, with label CAN
- 3 spectra found in sheet "CAN - POSITIVO":
 1159 peaks in sample 14, with label CAN
 1424 peaks in sample 15, with label CAN
 1193 peaks in sample 16, with label CAN
------ Reading MS-Excel file - data/CS (29, 30, 31).xlsx
- 3 spectra found in sheet "CS - NEGATIVO":
  768 peaks in sample 29, with label CS
  838 peaks in sample 30, with label CS
  710 peaks in sample 31, with label CS
- 3 spectra found in sheet "CS - Positivo":
 1642 peaks in sample 29, with label CS
 1599 peaks in sample 30, with label CS
 1631 peaks in sample 31, with label CS
------ Reading MS-Excel file - data/LAB (8, 9, 10).xlsx
- 3 spectra found in sheet "LAB - NEGATIVO":
  450 peaks in sample 8, with label LAB
  499 peaks in sample 9, with label LAB
  541 peaks in sample 10, with label LAB

### Alignment of peak lists

#### Align for each mode and cultivar (keep if peak appears in at least 2 samples)

In [4]:
# First-stage settings: m/z tolerance (ppm) and the minimum number of samples
# a peak must appear in to be kept (2 of the 3 samples in each sheet).
ppmtol, min_samples = 1.0, 2

# One alignment per Excel sheet, i.e. per cultivar and ionization mode.
aligned = {}
for sheet_name, sheet_spectra in all_spectra.items():
    print('=======================================', sheet_name, sep='\n')
    aligned[sheet_name] = align(sheet_spectra, ppmtol, min_samples)

CAN - NEGATIVO
------ Aligning spectra -------------
  Sample names: [['14'], ['15'], ['16']]
  Labels: ['CAN', 'CAN', 'CAN']
- Joining data... done, (total 1686 peaks in 3 spectra)
- Aligning... done, 1022 aligned peaks
Elapsed time: 00m 03.440s
Number of peaks: 475
  442 peaks in sample 14, with label CAN
  319 peaks in sample 15, with label CAN
  378 peaks in sample 16, with label CAN
- 547 peaks were discarded (#samples < 2)
Sample coverage of peaks
  286 peaks in 2 samples
  189 peaks in 3 samples
m/z range distribution
  [0.0,0.1[ : 17
  [0.1,0.2[ : 19
  [0.2,0.3[ : 25
  [0.3,0.4[ : 45
  [0.4,0.5[ : 76
  [0.5,0.6[ : 119
  [0.6,0.7[ : 81
  [0.7,0.8[ : 33
  [0.8,0.9[ : 30
  [0.9,1.0[ : 30
No peaks found with m/z range > 1.0
CAN - POSITIVO
------ Aligning spectra -------------
  Sample names: [['14'], ['15'], ['16']]
  Labels: ['CAN', 'CAN', 'CAN']
- Joining data... done, (total 3776 peaks in 3 spectra)
- Aligning... done, 3144 aligned peaks
Elapsed time: 00m 08.746s
Number of peaks

- Aligning... done, 3189 aligned peaks
Elapsed time: 00m 08.399s
Number of peaks: 338
  307 peaks in sample 17, with label RIP
  291 peaks in sample 18, with label RIP
  208 peaks in sample 19, with label RIP
- 2851 peaks were discarded (#samples < 2)
Sample coverage of peaks
  208 peaks in 2 samples
  130 peaks in 3 samples
m/z range distribution
  [0.0,0.1[ : 42
  [0.1,0.2[ : 45
  [0.2,0.3[ : 34
  [0.3,0.4[ : 47
  [0.4,0.5[ : 27
  [0.5,0.6[ : 34
  [0.6,0.7[ : 32
  [0.7,0.8[ : 32
  [0.8,0.9[ : 26
  [0.9,1.0[ : 19
No peaks found with m/z range > 1.0
RL - NEGATIVO
------ Aligning spectra -------------
  Sample names: [['26'], ['27'], ['28']]
  Labels: ['RL', 'RL', 'RL']
- Joining data... done, (total 2297 peaks in 3 spectra)
- Aligning... done, 1287 aligned peaks
Elapsed time: 00m 04.133s
Number of peaks: 655
  586 peaks in sample 26, with label RL
  487 peaks in sample 27, with label RL
  592 peaks in sample 28, with label RL
- 632 peaks were discarded (#samples < 2)
Sample coverage of

- Aligning... done, 356 aligned peaks
Elapsed time: 00m 01.108s
Number of peaks: 149
  105 peaks in sample 10713-1, with label CFN
  108 peaks in sample 10713-2, with label CFN
  144 peaks in sample 10713-3, with label CFN
- 207 peaks were discarded (#samples < 2)
Sample coverage of peaks
   90 peaks in 2 samples
   59 peaks in 3 samples
m/z range distribution
  [0.0,0.1[ : 13
  [0.1,0.2[ : 8
  [0.2,0.3[ : 15
  [0.3,0.4[ : 25
  [0.4,0.5[ : 20
  [0.5,0.6[ : 13
  [0.6,0.7[ : 14
  [0.7,0.8[ : 15
  [0.8,0.9[ : 14
  [0.9,1.0[ : 12
No peaks found with m/z range > 1.0
CFN - POSITIVO
------ Aligning spectra -------------
  Sample names: [['10713-1'], ['10713-2'], ['10713-3']]
  Labels: ['CFN', 'CFN', 'CFN']
- Joining data... done, (total 3337 peaks in 3 spectra)
- Aligning... done, 2981 aligned peaks
Elapsed time: 00m 07.854s
Number of peaks: 263
  204 peaks in sample 10713-1, with label CFN
  196 peaks in sample 10713-2, with label CFN
  219 peaks in sample 10713-3, with label CFN
- 2718 peak

#### Save, just in case, separating modes

In [5]:
# Split the per-sheet alignments by ionization mode. The sheet names end in
# 'POSITIVO' / 'NEGATIVO' with inconsistent casing, hence the upper-casing.
aligned_pos, aligned_neg = {}, {}
for sheet_name, alignment in aligned.items():
    normalized_name = sheet_name.upper()
    if normalized_name.endswith('POSITIVO'):
        aligned_pos[sheet_name] = alignment
    elif normalized_name.endswith('NEGATIVO'):
        aligned_neg[sheet_name] = alignment

# One workbook per mode.
for workbook_name, alignments in (
        ('aligned_cultivars_positive_1ppm_min2.xlsx', aligned_pos),
        ('aligned_cultivars_negative_1ppm_min2.xlsx', aligned_neg)):
    save_aligned_to_excel(workbook_name, alignments)

Created file
aligned_cultivars_positive_1ppm_min2.xlsx
Created file
aligned_cultivars_negative_1ppm_min2.xlsx


#### Align globally (for each mode), starting from previously aligned peak lists

In [6]:
# The per-cultivar alignments of each mode are the inputs of the global alignment.
positive, negative = aligned_pos.values(), aligned_neg.values()

# Second-stage settings: m/z tolerance (ppm) and minimum-samples threshold.
ppmtol, min_samples = 1.0, 2

# One global alignment per ionization mode.
aligned_all_pos = align(positive,
                        ppmtol=ppmtol,
                        min_samples=min_samples)
aligned_all_neg = align(negative,
                        ppmtol=ppmtol,
                        min_samples=min_samples)

------ Aligning spectra -------------
  Sample names: [['14', '15', '16'], ['29', '30', '31'], ['8', '9', '10'], ['23', '24', '25'], ['38', '39', '40'], ['17', '18', '19'], ['26', '27', '28'], ['20', '21', '22'], ['35', '36', '37'], ['32', '33', '34'], ['10713-1', '10713-2', '10713-3'], ['13514-1', '13514-2', '13514-3'], ['53211-1', '53211-2', '53211-3']]
  Labels: ['CAN', 'CAN', 'CAN', 'CS', 'CS', 'CS', 'LAB', 'LAB', 'LAB', 'PN', 'PN', 'PN', 'REG', 'REG', 'REG', 'RIP', 'RIP', 'RIP', 'RL', 'RL', 'RL', 'ROT', 'ROT', 'ROT', 'RU', 'RU', 'RU', 'TRI', 'TRI', 'TRI', 'CFN', 'CFN', 'CFN', 'CHT', 'CHT', 'CHT', 'SB', 'SB', 'SB']
- Joining data... done, (total 10997 peaks in 13 spectra)
- Aligning... done, 5163 aligned peaks
Elapsed time: 00m 18.883s
Number of peaks: 5163
  330 peaks in sample 14, with label CAN
  379 peaks in sample 15, with label CAN
  355 peaks in sample 16, with label CAN
  667 peaks in sample 29, with label CS
  612 peaks in sample 30, with label CS
  624 peaks in sample 31,

#### Save alignments in Excel and CSV files

In [7]:
# Both global alignments go into a single workbook, one entry per mode.
outdict = dict(POSITIVE=aligned_all_pos, NEGATIVE=aligned_all_neg)
save_aligned_to_excel('aligned_1ppm_min2_1ppm.xlsx', outdict)

# The same tables are also written as comma-separated text, one file per mode.
csv_targets = (
    (aligned_all_pos, 'aligned_1ppm_min2_1ppm_positive.csv'),
    (aligned_all_neg, 'aligned_1ppm_min2_1ppm_negative.csv'),
)
for table, csv_name in csv_targets:
    table.to_csv(csv_name, with_labels=True, sep=',')

Created file
aligned_1ppm_min2_1ppm.xlsx


### REPEAT alignments, this time requiring the presence of a peak in all replicates within each label

In [4]:
# Same first-stage alignment as before, but a peak is now kept only when it
# appears in all 3 samples of a sheet.
ppmtol, min_samples = 1.0, 3

# One alignment per Excel sheet; this replaces the earlier `aligned` results.
aligned = {}
for sheet_name, sheet_spectra in all_spectra.items():
    print('=======================================', sheet_name, sep='\n')
    aligned[sheet_name] = align(sheet_spectra, ppmtol, min_samples)

CAN - NEGATIVO
------ Aligning spectra -------------
  Sample names: [['14'], ['15'], ['16']]
  Labels: ['CAN', 'CAN', 'CAN']
- Joining data... done, (total 1686 peaks in 3 spectra)
- Aligning... done, 1022 aligned peaks
Elapsed time: 00m 03.357s
Number of peaks: 189
  189 peaks in sample 14, with label CAN
  189 peaks in sample 15, with label CAN
  189 peaks in sample 16, with label CAN
- 833 peaks were discarded (#samples < 3)
Sample coverage of peaks
  189 peaks in 3 samples
m/z range distribution
  [0.0,0.1[ : 1
  [0.1,0.2[ : 2
  [0.2,0.3[ : 6
  [0.3,0.4[ : 11
  [0.4,0.5[ : 47
  [0.5,0.6[ : 54
  [0.6,0.7[ : 37
  [0.7,0.8[ : 8
  [0.8,0.9[ : 12
  [0.9,1.0[ : 11
No peaks found with m/z range > 1.0
CAN - POSITIVO
------ Aligning spectra -------------
  Sample names: [['14'], ['15'], ['16']]
  Labels: ['CAN', 'CAN', 'CAN']
- Joining data... done, (total 3776 peaks in 3 spectra)
- Aligning... done, 3144 aligned peaks
Elapsed time: 00m 08.982s
Number of peaks: 200
  200 peaks in sample 14

- Aligning... done, 3189 aligned peaks
Elapsed time: 00m 09.070s
Number of peaks: 130
  130 peaks in sample 17, with label RIP
  130 peaks in sample 18, with label RIP
  130 peaks in sample 19, with label RIP
- 3059 peaks were discarded (#samples < 3)
Sample coverage of peaks
  130 peaks in 3 samples
m/z range distribution
  [0.0,0.1[ : 13
  [0.1,0.2[ : 25
  [0.2,0.3[ : 16
  [0.3,0.4[ : 18
  [0.4,0.5[ : 9
  [0.5,0.6[ : 11
  [0.6,0.7[ : 13
  [0.7,0.8[ : 15
  [0.8,0.9[ : 7
  [0.9,1.0[ : 3
No peaks found with m/z range > 1.0
RL - NEGATIVO
------ Aligning spectra -------------
  Sample names: [['26'], ['27'], ['28']]
  Labels: ['RL', 'RL', 'RL']
- Joining data... done, (total 2297 peaks in 3 spectra)
- Aligning... done, 1287 aligned peaks
Elapsed time: 00m 04.355s
Number of peaks: 355
  355 peaks in sample 26, with label RL
  355 peaks in sample 27, with label RL
  355 peaks in sample 28, with label RL
- 932 peaks were discarded (#samples < 3)
Sample coverage of peaks
  355 peaks in 3 samp

- Aligning... done, 356 aligned peaks
Elapsed time: 00m 01.134s
Number of peaks: 59
   59 peaks in sample 10713-1, with label CFN
   59 peaks in sample 10713-2, with label CFN
   59 peaks in sample 10713-3, with label CFN
- 297 peaks were discarded (#samples < 3)
Sample coverage of peaks
   59 peaks in 3 samples
m/z range distribution
  [0.0,0.1[ : 2
  [0.1,0.2[ : 1
  [0.2,0.3[ : 8
  [0.3,0.4[ : 11
  [0.4,0.5[ : 9
  [0.5,0.6[ : 7
  [0.6,0.7[ : 5
  [0.7,0.8[ : 6
  [0.8,0.9[ : 4
  [0.9,1.0[ : 6
No peaks found with m/z range > 1.0
CFN - POSITIVO
------ Aligning spectra -------------
  Sample names: [['10713-1'], ['10713-2'], ['10713-3']]
  Labels: ['CFN', 'CFN', 'CFN']
- Joining data... done, (total 3337 peaks in 3 spectra)
- Aligning... done, 2981 aligned peaks
Elapsed time: 00m 08.424s
Number of peaks: 93
   93 peaks in sample 10713-1, with label CFN
   93 peaks in sample 10713-2, with label CFN
   93 peaks in sample 10713-3, with label CFN
- 2888 peaks were discarded (#samples < 3)
Sam

In [5]:
# Split the per-sheet alignments by ionization mode. The sheet names end in
# 'POSITIVO' / 'NEGATIVO' with inconsistent casing, hence the upper-casing.
aligned_pos, aligned_neg = {}, {}
for sheet_name, alignment in aligned.items():
    normalized_name = sheet_name.upper()
    if normalized_name.endswith('POSITIVO'):
        aligned_pos[sheet_name] = alignment
    elif normalized_name.endswith('NEGATIVO'):
        aligned_neg[sheet_name] = alignment

# One workbook per mode.
for workbook_name, alignments in (
        ('aligned_cultivars_positive_1ppm_min3.xlsx', aligned_pos),
        ('aligned_cultivars_negative_1ppm_min3.xlsx', aligned_neg)):
    save_aligned_to_excel(workbook_name, alignments)

Created file
aligned_cultivars_positive_1ppm_min3.xlsx
Created file
aligned_cultivars_negative_1ppm_min3.xlsx


In [6]:
# The per-cultivar alignments of each mode are the inputs of the global alignment.
positive, negative = aligned_pos.values(), aligned_neg.values()

# Second-stage settings: m/z tolerance (ppm) and minimum-samples threshold.
ppmtol, min_samples = 1.0, 3

# One global alignment per ionization mode.
aligned_all_pos = align(positive,
                        ppmtol=ppmtol,
                        min_samples=min_samples)
aligned_all_neg = align(negative,
                        ppmtol=ppmtol,
                        min_samples=min_samples)

------ Aligning spectra -------------
  Sample names: [['14', '15', '16'], ['29', '30', '31'], ['8', '9', '10'], ['23', '24', '25'], ['38', '39', '40'], ['17', '18', '19'], ['26', '27', '28'], ['20', '21', '22'], ['35', '36', '37'], ['32', '33', '34'], ['10713-1', '10713-2', '10713-3'], ['13514-1', '13514-2', '13514-3'], ['53211-1', '53211-2', '53211-3']]
  Labels: ['CAN', 'CAN', 'CAN', 'CS', 'CS', 'CS', 'LAB', 'LAB', 'LAB', 'PN', 'PN', 'PN', 'REG', 'REG', 'REG', 'RIP', 'RIP', 'RIP', 'RL', 'RL', 'RL', 'ROT', 'ROT', 'ROT', 'RU', 'RU', 'RU', 'TRI', 'TRI', 'TRI', 'CFN', 'CFN', 'CFN', 'CHT', 'CHT', 'CHT', 'SB', 'SB', 'SB']
- Joining data... done, (total 5112 peaks in 13 spectra)
- Aligning... done, 2091 aligned peaks
Elapsed time: 00m 08.208s
Number of peaks: 2091
  200 peaks in sample 14, with label CAN
  200 peaks in sample 15, with label CAN
  200 peaks in sample 16, with label CAN
  373 peaks in sample 29, with label CS
  373 peaks in sample 30, with label CS
  373 peaks in sample 31, 

In [7]:
# Both global alignments go into a single workbook, one entry per mode.
outdict = dict(POSITIVE=aligned_all_pos, NEGATIVE=aligned_all_neg)
save_aligned_to_excel('aligned_1ppm_min3_1ppm.xlsx', outdict)

# The same tables are also written as comma-separated text, one file per mode.
csv_targets = (
    (aligned_all_pos, 'aligned_1ppm_min3_1ppm_positive.csv'),
    (aligned_all_neg, 'aligned_1ppm_min3_1ppm_negative.csv'),
)
for table, csv_name in csv_targets:
    table.to_csv(csv_name, with_labels=True, sep=',')

Created file
aligned_1ppm_min3_1ppm.xlsx


In [8]:
# The per-cultivar alignments of each mode are the inputs of the global alignment.
positive, negative = aligned_pos.values(), aligned_neg.values()

# Global alignment repeated with a wider tolerance (2 ppm instead of 1 ppm);
# the results replace the previous `aligned_all_pos` / `aligned_all_neg`.
ppmtol, min_samples = 2.0, 3

# One global alignment per ionization mode.
aligned_all_pos = align(positive,
                        ppmtol=ppmtol,
                        min_samples=min_samples)
aligned_all_neg = align(negative,
                        ppmtol=ppmtol,
                        min_samples=min_samples)

------ Aligning spectra -------------
  Sample names: [['14', '15', '16'], ['29', '30', '31'], ['8', '9', '10'], ['23', '24', '25'], ['38', '39', '40'], ['17', '18', '19'], ['26', '27', '28'], ['20', '21', '22'], ['35', '36', '37'], ['32', '33', '34'], ['10713-1', '10713-2', '10713-3'], ['13514-1', '13514-2', '13514-3'], ['53211-1', '53211-2', '53211-3']]
  Labels: ['CAN', 'CAN', 'CAN', 'CS', 'CS', 'CS', 'LAB', 'LAB', 'LAB', 'PN', 'PN', 'PN', 'REG', 'REG', 'REG', 'RIP', 'RIP', 'RIP', 'RL', 'RL', 'RL', 'ROT', 'ROT', 'ROT', 'RU', 'RU', 'RU', 'TRI', 'TRI', 'TRI', 'CFN', 'CFN', 'CFN', 'CHT', 'CHT', 'CHT', 'SB', 'SB', 'SB']
- Joining data... done, (total 5112 peaks in 13 spectra)
- Aligning... done, 1895 aligned peaks
Elapsed time: 00m 07.827s
Number of peaks: 1895
  200 peaks in sample 14, with label CAN
  200 peaks in sample 15, with label CAN
  200 peaks in sample 16, with label CAN
  373 peaks in sample 29, with label CS
  373 peaks in sample 30, with label CS
  373 peaks in sample 31, 

In [9]:
# Both global alignments go into a single workbook, one entry per mode.
outdict = dict(POSITIVE=aligned_all_pos, NEGATIVE=aligned_all_neg)
save_aligned_to_excel('aligned_1ppm_min3_2ppm.xlsx', outdict)

# The same tables are also written as comma-separated text, one file per mode.
csv_targets = (
    (aligned_all_pos, 'aligned_1ppm_min3_2ppm_positive.csv'),
    (aligned_all_neg, 'aligned_1ppm_min3_2ppm_negative.csv'),
)
for table, csv_name in csv_targets:
    table.to_csv(csv_name, with_labels=True, sep=',')

Created file
aligned_1ppm_min3_2ppm.xlsx
