Exploratory code to figure out the best way to bin the mz's.

- Rounding to the nearest integer (neg, pos, all)
- Defining bins of equal mz width (neg, pos, all)
- Defining bins with equal numbers of fragments (neg, pos, all)
  - (i.e. partitioning mz’s based on quantiles)


In [1]:
import pandas as pd
import json
import copy

In [2]:
fname = '../../data/clean/clean_spectra.json'
with open(fname, 'r') as f:
    all_spectra = json.load(f)

In [3]:
len(all_spectra)

4899

In [4]:
all_spectra.items()[0]

(u'KDXKERNSBIXSRK-YFKPBYRVSA-N_MS2-12_Negative',
 {u'class': u'Carboxylic acids and derivatives;',
  u'inchi': u'KDXKERNSBIXSRK-YFKPBYRVSA-N',
  u'ionization': u'Negative',
  u'kingdom': u'Organic compounds',
  u'parentmass': 146.105527702,
  u'peaks': [[41.1, 0.004004004],
   [57.0, 0.001001001],
   [58.9, 0.023023023],
   [69.2, 0.1911911912],
   [70.8, 0.007007007],
   [73.2, 0.044044044],
   [83.3, 0.005005005],
   [97.1, 0.025025025],
   [99.0, 0.0590590591],
   [101.1, 0.02002002],
   [113.2, 0.0750750751],
   [114.6, 0.002002002],
   [145.1, 1.0]],
  u'sub_class': u'Amino acids, peptides, and analogues'})

In [5]:
# Make tidy dataframe with spectra-related metadata, mz, intensity
dflst = []

spec_keys = ['inchi', 'ionization', 'kingdom', 'sub_class', 'class', 'parentmass']

for spec_id, spectrum in all_spectra.iteritems():
    # Get the spectrum-related metadata
    spec_metadata = [spectrum[k] for k in spec_keys]
    # Get the spectra number from the label
    spec_metadata += [spec_id.split('_')[1].split('-')[1]]
    # Add each mz, int pair as its own entry
    dflst += [spec_metadata + [p[0], p[1]] for p in spectrum['peaks']]

In [6]:
df = pd.DataFrame(dflst, columns=spec_keys + ['scan_id', 'mz', 'intensity'])

In [7]:
df.head()

Unnamed: 0,inchi,ionization,kingdom,sub_class,class,parentmass,scan_id,mz,intensity
0,KDXKERNSBIXSRK-YFKPBYRVSA-N,Negative,Organic compounds,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives;,146.105528,12,41.1,0.004004
1,KDXKERNSBIXSRK-YFKPBYRVSA-N,Negative,Organic compounds,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives;,146.105528,12,57.0,0.001001
2,KDXKERNSBIXSRK-YFKPBYRVSA-N,Negative,Organic compounds,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives;,146.105528,12,58.9,0.023023
3,KDXKERNSBIXSRK-YFKPBYRVSA-N,Negative,Organic compounds,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives;,146.105528,12,69.2,0.191191
4,KDXKERNSBIXSRK-YFKPBYRVSA-N,Negative,Organic compounds,"Amino acids, peptides, and analogues",Carboxylic acids and derivatives;,146.105528,12,70.8,0.007007


Before we start binning things, we're going to need to remove duplicates peaks. Since the presence of duplicates might depend on ionization mode, we need to do this for each of the modes.

In [8]:
df['ionization'].unique()

array([u'Negative', u'Positive', u'N/A'], dtype=object)

In [9]:
df.query('ionization == "N/A"')['inchi'].unique().shape

(67,)

In [10]:
pos = df.query('ionization == "Positive"')
neg = df.query('ionization == "Negative"')
# and df has all of the scans, including the ones labeled N/A

In [11]:
# If there are duplicate peaks, we'll just pick the one with the highest intensity
def remove_dup_mzs(df):
    """
    Remove duplicate mz's in df, keeping only the ones with the highest intensity.
    """
    return (df
        .sort_values(by='intensity', ascending=False)
        .drop_duplicates(subset=['inchi', 'mz'], keep='first'))

In [12]:
pos = remove_dup_mzs(pos)
neg = remove_dup_mzs(neg)
both = remove_dup_mzs(df)

In [24]:
tmp = both.query('inchi == "ACFIXJIJDZMPPO-NCHANQSKSA-N"')
tmp.head()

Unnamed: 0,inchi,ionization,kingdom,sub_class,class,parentmass,scan_id,mz,intensity
76127,ACFIXJIJDZMPPO-NCHANQSKSA-N,Positive,Chemical entities,(5'->5')-dinucleotides,"Nucleosides, nucleotides, and analogues;",745.091102,1,509.097,100.0
65140,ACFIXJIJDZMPPO-NCHANQSKSA-N,Positive,Chemical entities,(5'->5')-dinucleotides,"Nucleosides, nucleotides, and analogues;",745.091102,2,435.28,100.0
102882,ACFIXJIJDZMPPO-NCHANQSKSA-N,Positive,Chemical entities,(5'->5')-dinucleotides,"Nucleosides, nucleotides, and analogues;",745.091102,0,169.605,100.0
76115,ACFIXJIJDZMPPO-NCHANQSKSA-N,Positive,Chemical entities,(5'->5')-dinucleotides,"Nucleosides, nucleotides, and analogues;",745.091102,1,135.889,96.11
102901,ACFIXJIJDZMPPO-NCHANQSKSA-N,Positive,Chemical entities,(5'->5')-dinucleotides,"Nucleosides, nucleotides, and analogues;",745.091102,0,345.77,93.015


In [62]:
print(pos.shape, neg.shape, both.shape)

((70624, 9), (50428, 9), (121780, 9))


In [65]:
pos.head()

Unnamed: 0,inchi,ionization,kingdom,sub_class,class,parentmass,scan_id,mz,intensity
21568,AGPKZVBTJJNPAG-UHNVWZDZSA-N,Positive,Chemical entities,Carboxylic acids and derivatives,Organic acids and derivatives;,131.094629,2,41.353,100.0
56738,IERHLVCPSMICTF-XVFCMESISA-N,Positive,Chemical entities,Pyrimidine nucleotides,"Nucleosides, nucleotides, and analogues;",323.051851,2,112.0,100.0
46666,FBZONXHGGPHHIY-UHFFFAOYSA-N,Positive,Chemical entities,Quinolines and derivatives,Organoheterocyclic compounds;,205.037508,0,205.971,100.0
111753,POZRVZJJTULAOH-LHZXLZLDSA-N,Positive,Organic compounds,Estrane steroids,Steroids and steroid derivatives;,337.204179,2,90.971,100.0
98210,XKMLYUALXHKNFT-UUOKFMHZSA-N,Positive,Chemical entities,Purine nucleotides,"Nucleosides, nucleotides, and analogues;",522.99066,0,445.0,100.0


# Raw mz's

In [75]:
widepos = pos.pivot(index='inchi', columns='mz', values='intensity')
widepos = widepos.fillna(0.0)
widepos.shape

(712, 34551)

In [76]:
pd.merge(widepos, pos[['inchi', 'kingdom', 'sub_class', 'class']].drop_duplicates(),
         left_index=True, right_on='inchi', how='left').shape

(712, 34555)

# Round mz's to the nearest integer

In [64]:
posint = copy.deepcopy(pos)
print(posint.shape)
posint['mz'] = posint['mz'].astype(int)
posint = remove_dup_mzs(posint)
print(posint.shape)

(70624, 9)
(38740, 9)
