## Alignment of peaks from several _Vitis_ cultivars

Alignment of peak lists based on m/z relative differences (below a ppm threshold). 

Requirements:

- metabolinks


In [1]:
import pandas as pd
from collections import OrderedDict
from pathlib import Path

from metabolinks import align, read_data_from_xcel
from metabolinks.similarity import mz_similarity
import metabolinks as mtl

## Set up a store to hold all alignments

In [2]:
# Use the PyTables 'table' layout as the default format for everything stored.
pd.set_option('io.hdf.default_format', 'table')
# HDF5 store that collects every alignment produced in this notebook.
alignments = pd.HDFStore('alignments.h5')

In [3]:
# List the keys of the alignments already present in the HDF5 store.
alignments.keys()

['/all_1ppm_min13_neg',
 '/all_1ppm_min13_pos',
 '/all_1ppm_min2_neg',
 '/all_1ppm_min2_pos',
 '/all_1ppm_min6_neg',
 '/all_1ppm_min6_pos',
 '/groups_1ppm_min2_all_1ppm_neg',
 '/groups_1ppm_min2_all_1ppm_pos',
 '/groups_1ppm_min3_all_1ppm_neg',
 '/groups_1ppm_min3_all_1ppm_pos',
 '/groups_2ppm_min3_all_2ppm_neg',
 '/groups_2ppm_min3_all_2ppm_pos']

### Set up metadata descriptions

In [4]:
# Folder that holds the Excel workbooks (one workbook per cultivar).
data_folder = 'data'
# presumably the header row of the Excel sheets — TODO confirm where it is consumed
header_row = 3


def _cultivar_entry(label, samples):
    """Build the metadata entry (workbook file name + naming info) for one cultivar.

    The workbook name lists the sample ids separated by ', ' with any '-'
    written as '_', e.g. 'CFN (10713_1, 10713_2, 10713_3).xlsx'.
    """
    sample_names = samples.split()
    ids_in_filename = ', '.join(sample.replace('-', '_') for sample in sample_names)
    return {
        'filename': f'{label} ({ids_in_filename}).xlsx',
        'names': {'sample_names': sample_names, 'labels': label},
    }


# (cultivar label, whitespace-separated sample names); order defines the reading order.
_cultivar_samples = [
    ('CAN', '14 15 16'),
    ('CS', '29 30 31'),
    ('LAB', '8 9 10'),
    ('PN', '23 24 25'),
    ('REG', '38 39 40'),
    ('RIP', '17 18 19'),
    ('RL', '26 27 28'),
    ('ROT', '20 21 22'),
    ('RU', '35 36 37'),
    ('SYL', '11 12 13'),
    ('TRI', '32 33 34'),
    # these are the new cultivars
    ('CFN', '10713-1 10713-2 10713-3'),
    ('CHT', '13514-1 13514-2 13514-3'),
    ('SB', '53211-1 53211-2 53211-3'),
]

data = {label: _cultivar_entry(label, samples) for label, samples in _cultivar_samples}

### Read spectra from Excel files

In [5]:
def read_vitis_data(filename, metadata, header_row=3):
    """Read all sheets of one cultivar workbook and name/label their spectra.

    Parameters
    ----------
    filename : str or path-like
        Excel workbook, handed to ``read_data_from_xcel``.
    metadata : dict
        One entry of ``data``. ``metadata['names']['sample_names']`` supplies
        the column name for each spectrum of a sheet (in order) and
        ``metadata['names']['labels']`` the label attached to all of them.
    header_row : int, default 3
        Row forwarded to ``read_data_from_xcel`` as ``header=[header_row]``.

    Returns
    -------
    The mapping returned by ``read_data_from_xcel`` (keyed by sheet name),
    with each value replaced by the list of labelled spectra.
    """
    exp = read_data_from_xcel(filename, header=[header_row])
    # Both lookups are the same for every sheet, so resolve them once.
    sample_names = metadata['names']['sample_names']
    label2assign = metadata['names']['labels']
    for sname in exp:
        dfs = exp[sname]
        # zip stops at the shorter sequence: spectra beyond the listed
        # sample names keep whatever column name they were read with.
        for name, df in zip(sample_names, dfs):
            df.columns = [name]
            df.index.name = 'm/z'
        exp[sname] = [mtl.add_labels(df, labels=label2assign) for df in dfs]
    return exp
# Smoke test on a single workbook (CAN); the path is built from `data_folder`
# so it stays consistent with the loop that reads every cultivar.
exp = read_vitis_data(Path(data_folder, data['CAN']['filename']), data['CAN'])
#exp # seems ok!

In [6]:
# Sheet name -> list of spectra, filled in the order the workbooks are listed in `data`.
all_spectra = OrderedDict()

for cultivar_desc in data.values():
    workbook = Path(data_folder) / cultivar_desc['filename']
    for sheet_name, sheet_spectra in read_vitis_data(workbook, cultivar_desc).items():
        print(f'Sheet {sheet_name} contains {len(sheet_spectra)} spectra')
        all_spectra[sheet_name] = sheet_spectra

Sheet CAN - NEGATIVO contains 3 spectra
Sheet CAN - POSITIVO contains 3 spectra
Sheet CS - NEGATIVO contains 3 spectra
Sheet CS - Positivo contains 3 spectra
Sheet LAB - NEGATIVO contains 3 spectra
Sheet LAB - POSITIVO contains 3 spectra
Sheet PN - NEGATIVO contains 3 spectra
Sheet PN - POSITIVO contains 3 spectra
Sheet REG - NEGATIVO contains 3 spectra
Sheet REG - POSITIVO contains 3 spectra
Sheet RIP - NEGATIVO contains 3 spectra
Sheet RIP - POSITIVO contains 3 spectra
Sheet RL - NEGATIVO contains 3 spectra
Sheet RL - POSITIVO contains 3 spectra
Sheet ROT - NEGATIVO contains 3 spectra
Sheet ROT - POSITIVO contains 3 spectra
Sheet RU - NEGATIVO contains 3 spectra
Sheet RU - POSITIVO contains 3 spectra
Sheet SYL - NEGATIVE contains 3 spectra
Sheet SYL - POSITIVE contains 3 spectra
Sheet TRI - NEGATIVO contains 3 spectra
Sheet TRI - POSITIVO contains 3 spectra
Sheet CFN - NEGATIVO contains 3 spectra
Sheet CFN - POSITIVO contains 3 spectra
Sheet CHT - NEGATIVO contains 3 spectra
Sheet CH

### Alignment of peak lists

#### Align for each mode and cultivar (keep if peak appears in at least 2 samples)

In [7]:
# Settings for the per-sheet (cultivar x mode) alignment of replicates.
# ppmtol: 2.0 for groups_2ppm_min3_all_2ppm_neg/pos; 1.0 for every other stored alignment.
ppmtol = 1.0
# min_samples: 2 for groups_1ppm_min2_all_1ppm_neg/pos;
#              3 for groups_1ppm_min3_all_1ppm_neg/pos and groups_2ppm_min3_all_2ppm_neg/pos.
min_samples = 2

# Sheet name -> aligned table of that sheet's replicate spectra.
aligned = {}
for sheet_name, replicate_spectra in all_spectra.items():
    print('=======================================')
    print(sheet_name)
    aligned[sheet_name] = align(replicate_spectra, ppmtol, min_samples)

CAN - NEGATIVO
------ Aligning tables -------------
 Samples to align: [[('CAN', '14')], [('CAN', '15')], [('CAN', '16')]]
- Extracting all features...
  Done, (total 1686 features in 3 samples)
- Grouping and joining...
  Done, 1022 groups found
Elapsed time: 00m 00.612s

- 547 groups were discarded (#samples < 2)
Sample coverage of features
  286 features in 2 samples
  189 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 17
  [0.1,0.2[ : 19
  [0.2,0.3[ : 25
  [0.3,0.4[ : 45
  [0.4,0.5[ : 76
  [0.5,0.6[ : 119
  [0.6,0.7[ : 81
  [0.7,0.8[ : 33
  [0.8,0.9[ : 30
  [0.9,1.0[ : 30
  > 1.0     : 0
CAN - POSITIVO
------ Aligning tables -------------
 Samples to align: [[('CAN', '14')], [('CAN', '15')], [('CAN', '16')]]
- Extracting all features...
  Done, (total 3776 features in 3 samples)
- Grouping and joining...
  Done, 3144 groups found
Elapsed time: 00m 01.712s

- 2712 groups were discarded (#samples < 2)
Sample coverage of features
  232 features in 2 samples
  200 fea

  Done, 3808 groups found
Elapsed time: 00m 02.101s

- 2941 groups were discarded (#samples < 2)
Sample coverage of features
  460 features in 2 samples
  407 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 98
  [0.1,0.2[ : 110
  [0.2,0.3[ : 109
  [0.3,0.4[ : 73
  [0.4,0.5[ : 86
  [0.5,0.6[ : 81
  [0.6,0.7[ : 77
  [0.7,0.8[ : 88
  [0.8,0.9[ : 75
  [0.9,1.0[ : 70
  > 1.0     : 0
ROT - NEGATIVO
------ Aligning tables -------------
 Samples to align: [[('ROT', '20')], [('ROT', '21')], [('ROT', '22')]]
- Extracting all features...
  Done, (total 1809 features in 3 samples)
- Grouping and joining...
  Done, 946 groups found
Elapsed time: 00m 00.609s

- 439 groups were discarded (#samples < 2)
Sample coverage of features
  151 features in 2 samples
  356 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 38
  [0.1,0.2[ : 76
  [0.2,0.3[ : 103
  [0.3,0.4[ : 111
  [0.4,0.5[ : 29
  [0.5,0.6[ : 38
  [0.6,0.7[ : 45
  [0.7,0.8[ : 25
  [0.8,0.9[ : 21
  [0.9,1.0[ : 21
 

  Done, 823 groups found
Elapsed time: 00m 00.482s

- 633 groups were discarded (#samples < 2)
Sample coverage of features
   98 features in 2 samples
   92 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 15
  [0.1,0.2[ : 24
  [0.2,0.3[ : 31
  [0.3,0.4[ : 23
  [0.4,0.5[ : 15
  [0.5,0.6[ : 21
  [0.6,0.7[ : 14
  [0.7,0.8[ : 17
  [0.8,0.9[ : 13
  [0.9,1.0[ : 17
  > 1.0     : 0
SB - POSITIVO
------ Aligning tables -------------
 Samples to align: [[('SB', '53211-1')], [('SB', '53211-2')], [('SB', '53211-3')]]
- Extracting all features...
  Done, (total 3183 features in 3 samples)
- Grouping and joining...
  Done, 2911 groups found
Elapsed time: 00m 01.548s

- 2710 groups were discarded (#samples < 2)
Sample coverage of features
  130 features in 2 samples
   71 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 14
  [0.1,0.2[ : 16
  [0.2,0.3[ : 15
  [0.3,0.4[ : 29
  [0.4,0.5[ : 18
  [0.5,0.6[ : 19
  [0.6,0.7[ : 27
  [0.7,0.8[ : 26
  [0.8,0.9[ : 12
  [0.9,1.0[

#### Separate modes

In [8]:
# Split the per-sheet alignments by ionisation mode. Sheet names end in
# POSITIVO/POSITIVE/Positivo (resp. NEGATIVO/NEGATIVE), so match the common
# stem case-insensitively. A substring test accepts every name the former
# fixed-width slice [-8:-1] accepted and additionally tolerates trailing
# whitespace in a sheet name instead of silently dropping that sheet.
aligned_pos = {name: value for name, value in aligned.items() if 'POSITIV' in name.upper()}
aligned_neg = {name: value for name, value in aligned.items() if 'NEGATIV' in name.upper()}

#save_aligned_to_excel('aligned_cultivars_positive_1ppm_min2.xlsx', aligned_pos)
#save_aligned_to_excel('aligned_cultivars_negative_1ppm_min2.xlsx', aligned_neg)

#### Align globally the previously obtained alignments (for each mode).

In [9]:
# Global alignment of the per-cultivar tables, run once per ionisation mode.
# ppmtol is 2.0 only for groups_2ppm_min3_all_2ppm_neg/pos; min_samples has to be 1 now.
ppmtol, min_samples = 1.0, 1
positive, negative = aligned_pos.values(), aligned_neg.values()

aligned_all_pos = align(positive, ppmtol=ppmtol, min_samples=min_samples)
aligned_all_neg = align(negative, ppmtol=ppmtol, min_samples=min_samples)

------ Aligning tables -------------
 Samples to align: [[('CAN', '14'), ('CAN', '15'), ('CAN', '16')], [('CS', '29'), ('CS', '30'), ('CS', '31')], [('LAB', '8'), ('LAB', '9'), ('LAB', '10')], [('PN', '23'), ('PN', '24'), ('PN', '25')], [('REG', '38'), ('REG', '39'), ('REG', '40')], [('RIP', '17'), ('RIP', '18'), ('RIP', '19')], [('RL', '26'), ('RL', '27'), ('RL', '28')], [('ROT', '20'), ('ROT', '21'), ('ROT', '22')], [('RU', '35'), ('RU', '36'), ('RU', '37')], [('SYL', '11'), ('SYL', '12'), ('SYL', '13')], [('TRI', '32'), ('TRI', '33'), ('TRI', '34')], [('CFN', '10713-1'), ('CFN', '10713-2'), ('CFN', '10713-3')], [('CHT', '13514-1'), ('CHT', '13514-2'), ('CHT', '13514-3')], [('SB', '53211-1'), ('SB', '53211-2'), ('SB', '53211-3')]]
- Extracting all features...
  Done, (total 11426 features in 14 samples)
- Grouping and joining...
  Done, 5225 groups found
Elapsed time: 00m 03.214s

Sample coverage of features
 2902 features in 1 samples
  930 features in 2 samples
  567 features in 3 

### Test the HDF5 store (writing and reading back, using `put` and `get`)

Other functions are `df.to_hdf(store)` and `store.append(key, df)`

In [10]:
# Store the global positive-mode alignment.
# Key nomenclature: groups_<ppm>_min<n> = tolerance and minimum number of samples of the
# per-cultivar alignment; all_<ppm> = tolerance of the global alignment; pos/neg = mode.
# NOTE(review): the key is hard-coded — keep it in sync with the settings actually used above.
alignments.put('groups_1ppm_min2_all_1ppm_pos', aligned_all_pos)

In [11]:
# Store the global negative-mode alignment.
# Key nomenclature: groups_<ppm>_min<n> = tolerance and minimum number of samples of the
# per-cultivar alignment; all_<ppm> = tolerance of the global alignment; pos/neg = mode.
# NOTE(review): the key is hard-coded — keep it in sync with the settings actually used above.
alignments.put('groups_1ppm_min2_all_1ppm_neg', aligned_all_neg)

In [12]:
#alignments.keys()
# Round-trip check: read one stored alignment back from the store and
# print its summary (index range, columns, non-null counts).
bigalignment = alignments.get('groups_1ppm_min2_all_1ppm_neg')
bigalignment.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 3369 entries, 97.58869 to 977.1140350000001
Data columns (total 42 columns):
(CAN, 14)         442 non-null float64
(CAN, 15)         319 non-null float64
(CAN, 16)         378 non-null float64
(CS, 29)          674 non-null float64
(CS, 30)          677 non-null float64
(CS, 31)          628 non-null float64
(LAB, 8)          370 non-null float64
(LAB, 9)          402 non-null float64
(LAB, 10)         410 non-null float64
(PN, 23)          350 non-null float64
(PN, 24)          376 non-null float64
(PN, 25)          388 non-null float64
(REG, 38)         594 non-null float64
(REG, 39)         896 non-null float64
(REG, 40)         915 non-null float64
(RIP, 17)         462 non-null float64
(RIP, 18)         446 non-null float64
(RIP, 19)         469 non-null float64
(RL, 26)          586 non-null float64
(RL, 27)          487 non-null float64
(RL, 28)          592 non-null float64
(ROT, 20)         469 non-null float64
(ROT, 21)    

#### Save alignments in Excel and CSV files

In [13]:
# Both global alignments keyed by mode, ready for the (currently disabled) exports below.
outdict = dict(POSITIVE=aligned_all_pos, NEGATIVE=aligned_all_neg)
#save_aligned_to_excel('aligned_1ppm_min2_1ppm.xlsx', outdict)

#aligned_all_pos.to_csv('aligned_1ppm_min2_1ppm_positive.csv', with_labels=True, sep=',')
#aligned_all_neg.to_csv('aligned_1ppm_min2_1ppm_negative.csv', with_labels=True, sep=',')

### Repeat the alignments, this time requiring a peak to be present in all replicates within each label

In [14]:
# Per-sheet alignment again, now keeping only peaks found in all 3 replicates.
ppmtol, min_samples = 1.0, 3

# Sheet name -> aligned table of that sheet's replicate spectra.
aligned = {}
for sheet_name, replicate_spectra in all_spectra.items():
    print('=======================================')
    print(sheet_name)
    aligned[sheet_name] = align(replicate_spectra, ppmtol, min_samples)

CAN - NEGATIVO
------ Aligning tables -------------
 Samples to align: [[('CAN', '14')], [('CAN', '15')], [('CAN', '16')]]
- Extracting all features...
  Done, (total 1686 features in 3 samples)
- Grouping and joining...
  Done, 1022 groups found
Elapsed time: 00m 00.608s

- 833 groups were discarded (#samples < 3)
Sample coverage of features
  189 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 1
  [0.1,0.2[ : 2
  [0.2,0.3[ : 6
  [0.3,0.4[ : 11
  [0.4,0.5[ : 47
  [0.5,0.6[ : 54
  [0.6,0.7[ : 37
  [0.7,0.8[ : 8
  [0.8,0.9[ : 12
  [0.9,1.0[ : 11
  > 1.0     : 0
CAN - POSITIVO
------ Aligning tables -------------
 Samples to align: [[('CAN', '14')], [('CAN', '15')], [('CAN', '16')]]
- Extracting all features...
  Done, (total 3776 features in 3 samples)
- Grouping and joining...
  Done, 3144 groups found
Elapsed time: 00m 01.689s

- 2944 groups were discarded (#samples < 3)
Sample coverage of features
  200 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ :

  Done, 3808 groups found
Elapsed time: 00m 02.091s

- 3401 groups were discarded (#samples < 3)
Sample coverage of features
  407 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 40
  [0.1,0.2[ : 51
  [0.2,0.3[ : 55
  [0.3,0.4[ : 42
  [0.4,0.5[ : 47
  [0.5,0.6[ : 39
  [0.6,0.7[ : 33
  [0.7,0.8[ : 31
  [0.8,0.9[ : 37
  [0.9,1.0[ : 32
  > 1.0     : 0
ROT - NEGATIVO
------ Aligning tables -------------
 Samples to align: [[('ROT', '20')], [('ROT', '21')], [('ROT', '22')]]
- Extracting all features...
  Done, (total 1809 features in 3 samples)
- Grouping and joining...
  Done, 946 groups found
Elapsed time: 00m 00.583s

- 590 groups were discarded (#samples < 3)
Sample coverage of features
  356 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 15
  [0.1,0.2[ : 48
  [0.2,0.3[ : 79
  [0.3,0.4[ : 94
  [0.4,0.5[ : 20
  [0.5,0.6[ : 29
  [0.6,0.7[ : 29
  [0.7,0.8[ : 17
  [0.8,0.9[ : 14
  [0.9,1.0[ : 11
  > 1.0     : 0
ROT - POSITIVO
------ Aligning tables -------

  Done, 823 groups found
Elapsed time: 00m 00.465s

- 731 groups were discarded (#samples < 3)
Sample coverage of features
   92 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 5
  [0.1,0.2[ : 10
  [0.2,0.3[ : 17
  [0.3,0.4[ : 16
  [0.4,0.5[ : 7
  [0.5,0.6[ : 9
  [0.6,0.7[ : 6
  [0.7,0.8[ : 11
  [0.8,0.9[ : 3
  [0.9,1.0[ : 8
  > 1.0     : 0
SB - POSITIVO
------ Aligning tables -------------
 Samples to align: [[('SB', '53211-1')], [('SB', '53211-2')], [('SB', '53211-3')]]
- Extracting all features...
  Done, (total 3183 features in 3 samples)
- Grouping and joining...
  Done, 2911 groups found
Elapsed time: 00m 01.541s

- 2840 groups were discarded (#samples < 3)
Sample coverage of features
   71 features in 3 samples
m/z range (ppm) distribution
  [0.0,0.1[ : 4
  [0.1,0.2[ : 4
  [0.2,0.3[ : 5
  [0.3,0.4[ : 10
  [0.4,0.5[ : 5
  [0.5,0.6[ : 9
  [0.6,0.7[ : 8
  [0.7,0.8[ : 9
  [0.8,0.9[ : 5
  [0.9,1.0[ : 12
  > 1.0     : 0


In [15]:
# Split the per-sheet alignments by ionisation mode. Sheet names end in
# POSITIVO/POSITIVE/Positivo (resp. NEGATIVO/NEGATIVE), so match the common
# stem case-insensitively. A substring test accepts every name the former
# fixed-width slice [-8:-1] accepted and additionally tolerates trailing
# whitespace in a sheet name instead of silently dropping that sheet.
aligned_pos = {name: value for name, value in aligned.items() if 'POSITIV' in name.upper()}
aligned_neg = {name: value for name, value in aligned.items() if 'NEGATIV' in name.upper()}

#save_aligned_to_excel('aligned_cultivars_positive_1ppm_min3.xlsx', aligned_pos)
#save_aligned_to_excel('aligned_cultivars_negative_1ppm_min3.xlsx', aligned_neg)

To be continued ...

In [16]:
# Global alignment of the min-3 per-cultivar tables, once per ionisation mode.
# min_samples has to be 1 now.
ppmtol, min_samples = 1.0, 1
positive, negative = aligned_pos.values(), aligned_neg.values()

aligned_all_pos = align(positive, ppmtol=ppmtol, min_samples=min_samples)
aligned_all_neg = align(negative, ppmtol=ppmtol, min_samples=min_samples)

------ Aligning tables -------------
 Samples to align: [[('CAN', '14'), ('CAN', '15'), ('CAN', '16')], [('CS', '29'), ('CS', '30'), ('CS', '31')], [('LAB', '8'), ('LAB', '9'), ('LAB', '10')], [('PN', '23'), ('PN', '24'), ('PN', '25')], [('REG', '38'), ('REG', '39'), ('REG', '40')], [('RIP', '17'), ('RIP', '18'), ('RIP', '19')], [('RL', '26'), ('RL', '27'), ('RL', '28')], [('ROT', '20'), ('ROT', '21'), ('ROT', '22')], [('RU', '35'), ('RU', '36'), ('RU', '37')], [('SYL', '11'), ('SYL', '12'), ('SYL', '13')], [('TRI', '32'), ('TRI', '33'), ('TRI', '34')], [('CFN', '10713-1'), ('CFN', '10713-2'), ('CFN', '10713-3')], [('CHT', '13514-1'), ('CHT', '13514-2'), ('CHT', '13514-3')], [('SB', '53211-1'), ('SB', '53211-2'), ('SB', '53211-3')]]
- Extracting all features...
  Done, (total 5288 features in 14 samples)
- Grouping and joining...
  Done, 2105 groups found
Elapsed time: 00m 01.388s

Sample coverage of features
 1096 features in 1 samples
  334 features in 2 samples
  219 features in 3 s

In [17]:
# Both global alignments keyed by mode, ready for the (currently disabled) exports below.
outdict = dict(POSITIVE=aligned_all_pos, NEGATIVE=aligned_all_neg)
#save_aligned_to_excel('aligned_1ppm_min3_1ppm.xlsx', outdict)

#aligned_all_pos.to_csv('aligned_1ppm_min3_1ppm_positive.csv', with_labels=True, sep=',')
#aligned_all_neg.to_csv('aligned_1ppm_min3_1ppm_negative.csv', with_labels=True, sep=',')

In [18]:
# Global alignment of the min-3 per-cultivar tables at a wider 2 ppm tolerance.
# NOTE(review): min_samples is 3 here although the other global alignments in this
# notebook use 1 ("Now it has to be 1") — confirm this is intended.
ppmtol, min_samples = 2.0, 3
positive, negative = aligned_pos.values(), aligned_neg.values()

aligned_all_pos = align(positive, ppmtol=ppmtol, min_samples=min_samples)
aligned_all_neg = align(negative, ppmtol=ppmtol, min_samples=min_samples)

------ Aligning tables -------------
 Samples to align: [[('CAN', '14'), ('CAN', '15'), ('CAN', '16')], [('CS', '29'), ('CS', '30'), ('CS', '31')], [('LAB', '8'), ('LAB', '9'), ('LAB', '10')], [('PN', '23'), ('PN', '24'), ('PN', '25')], [('REG', '38'), ('REG', '39'), ('REG', '40')], [('RIP', '17'), ('RIP', '18'), ('RIP', '19')], [('RL', '26'), ('RL', '27'), ('RL', '28')], [('ROT', '20'), ('ROT', '21'), ('ROT', '22')], [('RU', '35'), ('RU', '36'), ('RU', '37')], [('SYL', '11'), ('SYL', '12'), ('SYL', '13')], [('TRI', '32'), ('TRI', '33'), ('TRI', '34')], [('CFN', '10713-1'), ('CFN', '10713-2'), ('CFN', '10713-3')], [('CHT', '13514-1'), ('CHT', '13514-2'), ('CHT', '13514-3')], [('SB', '53211-1'), ('SB', '53211-2'), ('SB', '53211-3')]]
- Extracting all features...
  Done, (total 5288 features in 14 samples)
- Grouping and joining...
  Done, 1899 groups found
Elapsed time: 00m 01.332s

- 1217 groups were discarded (#samples < 3)
Sample coverage of features
  189 features in 3 samples
  147

In [19]:
# Both global alignments keyed by mode, ready for the (currently disabled) exports below.
outdict = dict(POSITIVE=aligned_all_pos, NEGATIVE=aligned_all_neg)
#save_aligned_to_excel('aligned_1ppm_min3_2ppm.xlsx', outdict)

#aligned_all_pos.to_csv('aligned_1ppm_min3_2ppm_positive.csv', with_labels=True, sep=',')
#aligned_all_neg.to_csv('aligned_1ppm_min3_2ppm_negative.csv', with_labels=True, sep=',')

### Aligning all 42 samples together (not aligning replicates first)

First, putting all samples in the same list with the correct sample_names

In [20]:
# Flatten the per-sheet replicate lists into one flat list of spectra per
# ionisation mode. The spectra already carry their sample names and labels
# (assigned in read_vitis_data), so no renaming is needed here.
posi = []
nega = []
for k, s in all_spectra.items():
    mode = k.upper()
    # Match the POSITIV*/NEGATIV* stem anywhere in the sheet name; `extend`
    # takes every replicate of the sheet instead of assuming exactly three.
    if 'POSITIV' in mode:
        posi.extend(s)
    elif 'NEGATIV' in mode:
        nega.extend(s)

In [21]:
# Align all individual samples of a mode in one step (no per-cultivar pre-alignment).
# min_samples: 2 for all_1ppm_min2_neg/pos, 6 for all_1ppm_min6_neg/pos,
#              13 for all_1ppm_min13_neg/pos.
ppmtol, min_samples = 1.0, 2

aligned_all_positive = align(posi, ppmtol, min_samples)
aligned_all_negative = align(nega, ppmtol, min_samples)

------ Aligning tables -------------
 Samples to align: [[('CAN', '14')], [('CAN', '15')], [('CAN', '16')], [('CS', '29')], [('CS', '30')], [('CS', '31')], [('LAB', '8')], [('LAB', '9')], [('LAB', '10')], [('PN', '23')], [('PN', '24')], [('PN', '25')], [('REG', '38')], [('REG', '39')], [('REG', '40')], [('RIP', '17')], [('RIP', '18')], [('RIP', '19')], [('RL', '26')], [('RL', '27')], [('RL', '28')], [('ROT', '20')], [('ROT', '21')], [('ROT', '22')], [('RU', '35')], [('RU', '36')], [('RU', '37')], [('SYL', '11')], [('SYL', '12')], [('SYL', '13')], [('TRI', '32')], [('TRI', '33')], [('TRI', '34')], [('CFN', '10713-1')], [('CFN', '10713-2')], [('CFN', '10713-3')], [('CHT', '13514-1')], [('CHT', '13514-2')], [('CHT', '13514-3')], [('SB', '53211-1')], [('SB', '53211-2')], [('SB', '53211-3')]]
- Extracting all features...
  Done, (total 71660 features in 42 samples)
- Grouping and joining...
  Done, 39690 groups found
Elapsed time: 00m 23.583s

- 30548 groups were discarded (#samples < 2)
Sa

In [22]:
#aligned_all_positive = mtl.add_labels(aligned_all_positive,
#                                      ['CAN','CS','LAB','PN','REG','RIP','RL','ROT','RU','SYL','TRI','CFN','CHT','SB'])
#aligned_all_negative = mtl.add_labels(aligned_all_negative,
#                                      ['CAN','CS','LAB','PN','REG','RIP','RL','ROT','RU','SYL','TRI','CFN','CHT','SB'])

### Storing unbiased alignments into the hdf5store file

In [23]:
# Store the positive-mode alignment of all individual samples.
# Key nomenclature: all_<ppm>_min<n>_<mode> = all samples aligned at <ppm> tolerance,
# keeping peaks present in at least <n> samples.
# NOTE(review): the key is hard-coded — keep it in sync with min_samples used above.
alignments.put('all_1ppm_min2_pos', aligned_all_positive)

In [24]:
# Store the negative-mode alignment of all individual samples.
# Key nomenclature: all_<ppm>_min<n>_<mode> = all samples aligned at <ppm> tolerance,
# keeping peaks present in at least <n> samples.
# NOTE(review): the key is hard-coded — keep it in sync with min_samples used above.
alignments.put('all_1ppm_min2_neg', aligned_all_negative)

### Comparing with the same alignment made with the previous (older) version of `align` (without SYL)

In [25]:
def read_aligned_files(filename):
    """Read an aligned-peaks CSV that has two header rows and an m/z index column.

    The two header rows are swapped when building the column MultiIndex
    (second row becomes the outer level, first row the inner level), which is
    the order expected by the CDL accessor.

    Parameters
    ----------
    filename : str or path-like
        CSV file whose first column is the m/z index and whose first two rows
        are the column header levels.

    Returns
    -------
    pandas.DataFrame
        The data with the reordered two-level columns and the index named 'm/z'.
    """
    # First pass without header parsing, so both header rows stay available
    # as ordinary rows and can be recombined in the desired order.
    raw = pd.read_csv(filename, header=None, index_col=[0])
    swapped_levels = pd.concat([raw.iloc[1, :], raw.iloc[0, :]], axis='columns')
    columns = pd.MultiIndex.from_frame(swapped_levels)
    # Second pass parses the data proper; its columns are then replaced.
    final_file = pd.read_csv(filename, header=[0, 1], index_col=[0])
    final_file.columns = columns
    # The index name must be set on the frame that is returned; setting it on
    # the header-only frame (as before) had no effect on the result.
    final_file.index.name = 'm/z'
    return final_file
import pandas as pd

In [32]:
#aligned_all_pos2 = read_aligned_files('aligned_1ppm_min1-2_1ppm_positive.csv')
#aligned_all_neg2 = read_aligned_files('aligned_1ppm_min1-2_1ppm_negative.csv')

In [33]:
#aligned_all_negative.columns.names = ['Label','Sample']

In [34]:
#from pandas.testing import assert_frame_equal
#assert_frame_equal(aligned_all_negative, aligned_all_neg2)

### It gives the same results, should it?