## Alignment of peaks from several _Vitis_ cultivars

Alignment of peak lists based on m/z relative differences (below a ppm threshold). 

Requirements:

- metabolinks


In [1]:
import pandas as pd
from collections import OrderedDict
from pathlib import Path

from metabolinks import align, read_data_from_xcel
from metabolinks.similarity import mz_similarity
import metabolinks as mtl

## Set up a store to hold all alignments

In [2]:
# Make 'table' the default HDF5 format used by pandas when none is given.
pd.set_option('io.hdf.default_format', 'table')

# On-disk store meant to hold the alignments computed in this notebook.
# NOTE(review): no visible cell closes this store; consider calling
# `alignments.close()` once everything has been written.
alignments = pd.HDFStore('alignments.h5')

### Set up metadata descriptions

In [3]:
# Folder that holds the Excel workbooks, and the header row of the sheets.
data_folder, header_row = 'data', 3

# Metadata for every cultivar, keyed by cultivar label. Each entry gives the
# workbook file name, the names of its three samples and the label that is
# attached to all of them.
data = {
    label: {'filename': filename,
            'names': {'sample_names': samples.split(), 'labels': label}}
    for label, filename, samples in (
        ('CAN', 'CAN (14, 15, 16).xlsx', '14 15 16'),
        ('CS', 'CS (29, 30, 31).xlsx', '29 30 31'),
        ('LAB', 'LAB (8, 9, 10).xlsx', '8 9 10'),
        ('PN', 'PN (23, 24, 25).xlsx', '23 24 25'),
        ('REG', 'REG (38, 39, 40).xlsx', '38 39 40'),
        ('RIP', 'RIP (17, 18, 19).xlsx', '17 18 19'),
        ('RL', 'RL (26, 27, 28).xlsx', '26 27 28'),
        ('ROT', 'ROT (20, 21, 22).xlsx', '20 21 22'),
        ('RU', 'RU (35, 36, 37).xlsx', '35 36 37'),
        ('SYL', 'SYL (11, 12, 13).xlsx', '11 12 13'),
        ('TRI', 'TRI (32, 33, 34).xlsx', '32 33 34'),
        # these are the new cultivars
        ('CFN', 'CFN (10713_1, 10713_2, 10713_3).xlsx', '10713-1 10713-2 10713-3'),
        ('CHT', 'CHT (13514_1, 13514_2, 13514_3).xlsx', '13514-1 13514-2 13514-3'),
        ('SB', 'SB (53211_1, 53211_2, 53211_3).xlsx', '53211-1 53211-2 53211-3'),
    )
}

### Read spectra from Excel files

In [5]:
def read_vitis_data(filename, metadata, header_row=3):
    """Read one cultivar workbook and name and label each of its spectra.

    Parameters
    ----------
    filename : str or path-like
        Excel workbook handed to ``read_data_from_xcel``.
    metadata : dict
        Description of the cultivar. ``metadata['names']['sample_names']``
        lists the column name given to each spectrum of a sheet (in order)
        and ``metadata['names']['labels']`` is the label attached to all of
        them.
    header_row : int, optional
        Row passed (as a one-element list) as ``header`` to the reader.
        Defaults to 3, the value that used to be hard-coded.

    Returns
    -------
    The mapping returned by ``read_data_from_xcel`` (sheet name -> list of
    frames), where every frame has a single column named after its sample,
    an index named ``'m/z'`` and the cultivar label added by
    ``mtl.add_labels``.

    Raises
    ------
    ValueError
        If a sheet does not contain exactly one spectrum per sample name.
        Without this check ``zip`` would silently leave spectra unnamed.
    """
    sample_names = metadata['names']['sample_names']
    label2assign = metadata['names']['labels']

    exp = read_data_from_xcel(filename, header=[header_row])
    for sname in exp:
        dfs = exp[sname]
        if len(dfs) != len(sample_names):
            raise ValueError(
                f'Sheet {sname!r} of {filename} has {len(dfs)} spectra, '
                f'but {len(sample_names)} sample names were given'
            )
        for name, df in zip(sample_names, dfs):
            df.columns = [name]
            df.index.name = 'm/z'
        # Only the values are replaced (no keys added or removed), so
        # assigning while iterating over the mapping is safe.
        exp[sname] = [mtl.add_labels(df, labels=label2assign) for df in dfs]
    return exp
# Smoke test: read a single workbook (CAN) before looping over all cultivars.
# The path is built from `data_folder` rather than a hard-coded "data/" so it
# stays consistent with the loading loop.
exp = read_vitis_data(Path(data_folder, data['CAN']['filename']), data['CAN'])
#exp # seems ok!

In [6]:
# Spectra of every worksheet of every workbook, keyed by worksheet name
# (insertion order follows the order of `data`).
all_spectra = OrderedDict()

for desc in data.values():
    fpath = Path(data_folder) / desc['filename']
    sheets = read_vitis_data(fpath, desc)
    for sheet, spectra in sheets.items():
        print('Sheet {} contains {} spectra'.format(sheet, len(spectra)))
    all_spectra.update(sheets)

Sheet CAN - NEGATIVO contains 3 spectra
Sheet CAN - POSITIVO contains 3 spectra
Sheet CS - NEGATIVO contains 3 spectra
Sheet CS - Positivo contains 3 spectra
Sheet LAB - NEGATIVO contains 3 spectra
Sheet LAB - POSITIVO contains 3 spectra
Sheet PN - NEGATIVO contains 3 spectra
Sheet PN - POSITIVO contains 3 spectra
Sheet REG - NEGATIVO contains 3 spectra
Sheet REG - POSITIVO contains 3 spectra
Sheet RIP - NEGATIVO contains 3 spectra
Sheet RIP - POSITIVO contains 3 spectra
Sheet RL - NEGATIVO contains 3 spectra
Sheet RL - POSITIVO contains 3 spectra
Sheet ROT - NEGATIVO contains 3 spectra
Sheet ROT - POSITIVO contains 3 spectra
Sheet RU - NEGATIVO contains 3 spectra
Sheet RU - POSITIVO contains 3 spectra
Sheet SYL - NEGATIVE contains 3 spectra
Sheet SYL - POSITIVE contains 3 spectra
Sheet TRI - NEGATIVO contains 3 spectra
Sheet TRI - POSITIVO contains 3 spectra
Sheet CFN - NEGATIVO contains 3 spectra
Sheet CFN - POSITIVO contains 3 spectra
Sheet CHT - NEGATIVO contains 3 spectra
Sheet CH

### Alignment of peak lists

#### Align for each mode and cultivar (keep if peak appears in at least 2 samples)

In [7]:
# Alignment settings for the replicates of one worksheet: m/z tolerance (ppm)
# and the minimum number of samples in which a peak must appear to be kept.
ppmtol, min_samples = 1.0, 2

# One alignment per worksheet (cultivar and mode), keyed by worksheet name.
aligned = {}
for k, s in all_spectra.items():
    print('=======================================', k, sep='\n')
    aligned[k] = align(s, ppmtol, min_samples)

CAN - NEGATIVO
------ Aligning tables -------------
 Samples to align: [[('CAN', '14')], [('CAN', '15')], [('CAN', '16')]]
- Extracting all features...
  Done, (total 1686 features in 3 samples)
- Grouping and joining...
  Done, 1022 groups found
Elapsed time: 00m 00.553s

- 547 groups were discarded (#samples < 2)
Sample coverage of features
  286 features in 2 samples
  189 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 17
  [0.1,0.2[ : 19
  [0.2,0.3[ : 25
  [0.3,0.4[ : 45
  [0.4,0.5[ : 76
  [0.5,0.6[ : 119
  [0.6,0.7[ : 81
  [0.7,0.8[ : 33
  [0.8,0.9[ : 30
  [0.9,1.0[ : 30
  > 1.0     : 0
CAN - POSITIVO
------ Aligning tables -------------
 Samples to align: [[('CAN', '14')], [('CAN', '15')], [('CAN', '16')]]
- Extracting all features...
  Done, (total 3776 features in 3 samples)
- Grouping and joining...
  Done, 3144 groups found
Elapsed time: 00m 01.609s

- 2712 groups were discarded (#samples < 2)
Sample coverage of features
  232 features in 2 samples
  200 fea

  Done, 3808 groups found
Elapsed time: 00m 01.968s

- 2941 groups were discarded (#samples < 2)
Sample coverage of features
  460 features in 2 samples
  407 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 98
  [0.1,0.2[ : 110
  [0.2,0.3[ : 109
  [0.3,0.4[ : 73
  [0.4,0.5[ : 86
  [0.5,0.6[ : 81
  [0.6,0.7[ : 77
  [0.7,0.8[ : 88
  [0.8,0.9[ : 75
  [0.9,1.0[ : 70
  > 1.0     : 0
ROT - NEGATIVO
------ Aligning tables -------------
 Samples to align: [[('ROT', '20')], [('ROT', '21')], [('ROT', '22')]]
- Extracting all features...
  Done, (total 1809 features in 3 samples)
- Grouping and joining...
  Done, 946 groups found
Elapsed time: 00m 00.562s

- 439 groups were discarded (#samples < 2)
Sample coverage of features
  151 features in 2 samples
  356 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 38
  [0.1,0.2[ : 76
  [0.2,0.3[ : 103
  [0.3,0.4[ : 111
  [0.4,0.5[ : 29
  [0.5,0.6[ : 38
  [0.6,0.7[ : 45
  [0.7,0.8[ : 25
  [0.8,0.9[ : 21
  [0.9,1.0[ : 21
 

  Done, 823 groups found
Elapsed time: 00m 00.437s

- 633 groups were discarded (#samples < 2)
Sample coverage of features
   98 features in 2 samples
   92 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 15
  [0.1,0.2[ : 24
  [0.2,0.3[ : 31
  [0.3,0.4[ : 23
  [0.4,0.5[ : 15
  [0.5,0.6[ : 21
  [0.6,0.7[ : 14
  [0.7,0.8[ : 17
  [0.8,0.9[ : 13
  [0.9,1.0[ : 17
  > 1.0     : 0
SB - POSITIVO
------ Aligning tables -------------
 Samples to align: [[('SB', '53211-1')], [('SB', '53211-2')], [('SB', '53211-3')]]
- Extracting all features...
  Done, (total 3183 features in 3 samples)
- Grouping and joining...
  Done, 2911 groups found
Elapsed time: 00m 01.422s

- 2710 groups were discarded (#samples < 2)
Sample coverage of features
  130 features in 2 samples
   71 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 14
  [0.1,0.2[ : 16
  [0.2,0.3[ : 15
  [0.3,0.4[ : 29
  [0.4,0.5[ : 18
  [0.5,0.6[ : 19
  [0.6,0.7[ : 27
  [0.7,0.8[ : 26
  [0.8,0.9[ : 12
  [0.9,1.0[

#### Separate modes

In [8]:
# Split the alignments by ionisation mode. The last character of the sheet
# name is ignored so that both the '...POSITIVO' and '...POSITIVE' spellings
# (and likewise for the negative mode) are matched, whatever their case.
aligned_pos = {
    name: value
    for name, value in aligned.items()
    if name.upper()[:-1].endswith('POSITIV')
}
aligned_neg = {
    name: value
    for name, value in aligned.items()
    if name.upper()[:-1].endswith('NEGATIV')
}

#save_aligned_to_excel('aligned_cultivars_positive_1ppm_min2.xlsx', aligned_pos)
#save_aligned_to_excel('aligned_cultivars_negative_1ppm_min2.xlsx', aligned_neg)

#### Align globally the previously obtained alignments (for each mode).

In [9]:
# Second-stage alignment: the per-cultivar alignments of each mode are aligned
# against each other. min_samples is 1 here so that peaks found in a single
# cultivar are not discarded at this stage.
ppmtol, min_samples = 1.0, 1
positive, negative = aligned_pos.values(), aligned_neg.values()

aligned_all_pos = align(positive, ppmtol=ppmtol, min_samples=min_samples)
aligned_all_neg = align(negative, ppmtol=ppmtol, min_samples=min_samples)

------ Aligning tables -------------
 Samples to align: [[('CAN', '14'), ('CAN', '15'), ('CAN', '16')], [('CS', '29'), ('CS', '30'), ('CS', '31')], [('LAB', '8'), ('LAB', '9'), ('LAB', '10')], [('PN', '23'), ('PN', '24'), ('PN', '25')], [('REG', '38'), ('REG', '39'), ('REG', '40')], [('RIP', '17'), ('RIP', '18'), ('RIP', '19')], [('RL', '26'), ('RL', '27'), ('RL', '28')], [('ROT', '20'), ('ROT', '21'), ('ROT', '22')], [('RU', '35'), ('RU', '36'), ('RU', '37')], [('SYL', '11'), ('SYL', '12'), ('SYL', '13')], [('TRI', '32'), ('TRI', '33'), ('TRI', '34')], [('CFN', '10713-1'), ('CFN', '10713-2'), ('CFN', '10713-3')], [('CHT', '13514-1'), ('CHT', '13514-2'), ('CHT', '13514-3')], [('SB', '53211-1'), ('SB', '53211-2'), ('SB', '53211-3')]]
- Extracting all features...
  Done, (total 11426 features in 14 samples)
- Grouping and joining...
  Done, 5225 groups found
Elapsed time: 00m 02.983s

Sample coverage of features
 2902 features in 1 samples
  930 features in 2 samples
  567 features in 3 

### Test HDF5 store (writing and reading back, using `put` and `get`)

Other functions are `df.to_hdf(store)` and `store.append(key, df)`

In [14]:
# Write the global positive-mode alignment to the HDF5 store under a
# descriptive key.
alignments.put('groups_1ppm_min2_all_1ppm_pos', aligned_all_pos)
# Key nomenclature: first the per-cultivar grouping settings (1 ppm, min 2
# samples), then the settings of the global alignment (1 ppm).

In [15]:
# `alignments.keys()` can be used to list the keys present in the store.
# Read the frame back with `get` to check the round trip; it seems to work.
bigalignment = alignments.get('groups_1ppm_min2_all_1ppm_pos')
# Summarise index, columns and non-null counts instead of dumping the frame.
bigalignment.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 5225 entries, 97.03106 to 907.35394
Data columns (total 42 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   (CAN, 14)       330 non-null    float64
 1   (CAN, 15)       379 non-null    float64
 2   (CAN, 16)       355 non-null    float64
 3   (CS, 29)        667 non-null    float64
 4   (CS, 30)        612 non-null    float64
 5   (CS, 31)        624 non-null    float64
 6   (LAB, 8)        489 non-null    float64
 7   (LAB, 9)        501 non-null    float64
 8   (LAB, 10)       471 non-null    float64
 9   (PN, 23)        1487 non-null   float64
 10  (PN, 24)        1609 non-null   float64
 11  (PN, 25)        1458 non-null   float64
 12  (REG, 38)       1577 non-null   float64
 13  (REG, 39)       1620 non-null   float64
 14  (REG, 40)       1580 non-null   float64
 15  (RIP, 17)       307 non-null    float64
 16  (RIP, 18)       291 non-null    float64
 17  (RIP, 19)       208

#### Save alignments in Excel and CSV files

In [None]:
# Both global alignments (1 ppm / min 2 / 1 ppm), keyed by mode, ready to be
# passed to an Excel writer.
outdict = dict(POSITIVE=aligned_all_pos, NEGATIVE=aligned_all_neg)
#save_aligned_to_excel('aligned_1ppm_min2_1ppm.xlsx', outdict)

#aligned_all_pos.to_csv('aligned_1ppm_min2_1ppm_positive.csv', with_labels=True, sep=',')
#aligned_all_neg.to_csv('aligned_1ppm_min2_1ppm_negative.csv', with_labels=True, sep=',')

### REPEAT alignments, this time requiring the presence of a peak in all replicates within each label

In [None]:
# Same per-worksheet alignment as before, but a peak is now kept only when it
# appears in at least 3 samples of the worksheet.
ppmtol, min_samples = 1.0, 3

# NOTE: this rebinds `aligned`, replacing the min_samples=2 alignments.
aligned = {}
for k, s in all_spectra.items():
    print('=======================================', k, sep='\n')
    aligned[k] = align(s, ppmtol, min_samples)

In [None]:
# Split the alignments by ionisation mode. A plain
# `.endswith('POSITIVO')` / `.endswith('NEGATIVO')` test is not used because
# it would miss sheets spelled '... - POSITIVE' / '... - NEGATIVE'; dropping
# the last character before the test accepts both spellings.
aligned_pos = {
    name: value
    for name, value in aligned.items()
    if name.upper()[:-1].endswith('POSITIV')
}
aligned_neg = {
    name: value
    for name, value in aligned.items()
    if name.upper()[:-1].endswith('NEGATIV')
}

#save_aligned_to_excel('aligned_cultivars_positive_1ppm_min3.xlsx', aligned_pos)
#save_aligned_to_excel('aligned_cultivars_negative_1ppm_min3.xlsx', aligned_neg)

To be continued ...

In [None]:
# Global alignment of the per-cultivar alignments, one per mode. min_samples
# is 1 so that peaks found in a single cultivar are kept.
ppmtol, min_samples = 1.0, 1
positive, negative = aligned_pos.values(), aligned_neg.values()

aligned_all_pos = align(positive, ppmtol=ppmtol, min_samples=min_samples)
aligned_all_neg = align(negative, ppmtol=ppmtol, min_samples=min_samples)

In [None]:
# Both global alignments (1 ppm / min 3 / 1 ppm), keyed by mode, ready to be
# passed to an Excel writer.
outdict = dict(POSITIVE=aligned_all_pos, NEGATIVE=aligned_all_neg)
#save_aligned_to_excel('aligned_1ppm_min3_1ppm.xlsx', outdict)

#aligned_all_pos.to_csv('aligned_1ppm_min3_1ppm_positive.csv', with_labels=True, sep=',')
#aligned_all_neg.to_csv('aligned_1ppm_min3_1ppm_negative.csv', with_labels=True, sep=',')

In [None]:
# Global alignment with a wider tolerance (2 ppm), keeping only the features
# present in at least 3 of the per-cultivar alignments.
# NOTE: this rebinds `aligned_all_pos` / `aligned_all_neg`.
ppmtol, min_samples = 2.0, 3
positive, negative = aligned_pos.values(), aligned_neg.values()

aligned_all_pos = align(positive, ppmtol=ppmtol, min_samples=min_samples)
aligned_all_neg = align(negative, ppmtol=ppmtol, min_samples=min_samples)

In [None]:
# Both global alignments (1 ppm / min 3 / 2 ppm), keyed by mode, ready to be
# passed to an Excel writer.
outdict = dict(POSITIVE=aligned_all_pos, NEGATIVE=aligned_all_neg)
#save_aligned_to_excel('aligned_1ppm_min3_2ppm.xlsx', outdict)

#aligned_all_pos.to_csv('aligned_1ppm_min3_2ppm_positive.csv', with_labels=True, sep=',')
#aligned_all_neg.to_csv('aligned_1ppm_min3_2ppm_negative.csv', with_labels=True, sep=',')

### Aligning all 39 samples together (not aligning replicates first)

First, putting all samples in the same list with the correct sample_names

In [None]:
# Flat lists with every individual spectrum of each mode, each one carrying a
# single column named after its sample.
posi = []
nega = []
for k, s in all_spectra.items():
    sheet_name = k.upper()
    if sheet_name.endswith('POSITIVO'):
        target = posi
    elif sheet_name.endswith('NEGATIVO'):
        target = nega
    else:
        # NOTE(review): sheets spelled '... - POSITIVE' / '... - NEGATIVE'
        # (the SYL workbook) are skipped here, which matches the "39 samples"
        # of the section title. Confirm that excluding SYL is intended.
        continue

    # The cultivar label is the first word of the sheet name,
    # e.g. 'CAN - POSITIVO' -> 'CAN'.
    sample_names = data[k.split()[0]]['names']['sample_names']
    for position, sample_name in enumerate(sample_names):
        # Rename the column on a copy: `all_spectra` keeps its original
        # columns, so the earlier alignment cells can be re-run after this one.
        spectrum = s[position].copy()
        spectrum.columns = [sample_name]
        target.append(spectrum)

In [None]:
# Align all individual samples of each mode in a single step, keeping the
# peaks that appear in at least 2 samples.
ppmtol, min_samples = 1.0, 2

aligned_all_positive = align(posi, ppmtol, min_samples)
aligned_all_negative = align(nega, ppmtol, min_samples)

In [None]:
# Cultivar labels attached to the aligned samples, in the order in which the
# spectra were collected. 'SYL' is not part of this list.
# NOTE: this cell rebinds its own inputs, so it should be run only once after
# the alignment cell above.
cultivar_labels = ['CAN', 'CS', 'LAB', 'PN', 'REG', 'RIP', 'RL', 'ROT', 'RU',
                   'TRI', 'CFN', 'CHT', 'SB']

# Each call receives its own copy of the list, as in the original code.
aligned_all_positive = mtl.add_labels(aligned_all_positive, list(cultivar_labels))
aligned_all_negative = mtl.add_labels(aligned_all_negative, list(cultivar_labels))

In [None]:
#aligned_all_positive.to_csv('aligned_1ppm_min1-2_1ppm_positive.csv', with_labels=True, sep=',')
#aligned_all_negative.to_csv('aligned_1ppm_min1-2_1ppm_negative.csv', with_labels=True, sep=',')

### Comparing with the same alignment made with the previous version of align

In [None]:
def read_aligned_files(filename):
    """Read an aligned-peaks CSV file whose first two rows are column headers.

    The file is parsed twice: once without a header, so that the two header
    rows can be picked up as ordinary rows, and once with ``header=[0, 1]`` to
    get the values. The columns of the returned frame are replaced by a
    MultiIndex built from (second header row, first header row), that is, with
    the two levels swapped, which is the order wanted for the CDL accessor.
    """
    headerless = pd.read_csv(filename, index_col=[0], header=None)
    # NOTE(review): this names the index of the intermediate frame only; the
    # index name of the returned frame is whatever read_csv infers. Confirm
    # whether 'm/z' was meant to be set on the result.
    headerless.index.name = 'm/z'

    # Second header row first, first header row second.
    swapped_levels = pd.concat(
        [headerless.iloc[1, :], headerless.iloc[0, :]],
        axis='columns',
    )

    final_file = pd.read_csv(filename, index_col=[0], header=[0, 1])
    final_file.columns = pd.MultiIndex.from_frame(swapped_levels)
    return final_file
import pandas as pd

In [None]:
# Load the reference alignments to compare against.
# NOTE(review): these CSV files must already exist in the working directory;
# the only `to_csv` calls that use these names are commented out in the cell
# above, presumably so the reference files are not overwritten. Confirm.
aligned_all_pos2 = read_aligned_files('aligned_1ppm_min1-2_1ppm_positive.csv')
aligned_all_neg2 = read_aligned_files('aligned_1ppm_min1-2_1ppm_negative.csv')

In [None]:
# Name the two column levels before the frame comparison below.
# NOTE(review): only the negative-mode frame is renamed (and compared);
# the positive-mode frame is left as it is.
aligned_all_negative.columns.names = ['Label','Sample']

In [None]:
# Raises AssertionError if the new negative-mode alignment differs from the
# one read back from the reference CSV file.
pd.testing.assert_frame_equal(aligned_all_negative, aligned_all_neg2)

### It gives the same results, should it?