# NMR Assisted MS Metabolite Identification

Author: Olatomiwa Bifarin<br>
Department of Biochemistry and Molecular Biology<br>
University of Georgia<br>
Edison Lab<br>

Last edited: 15MAR2020

_This is a static version of a Jupyter notebook, and work (documentation) is still in progress_ 

In [90]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import style
import scipy
#For Seaborn plots
import seaborn as sns; sns.set(style='white')
#To ignore warning
import warnings
warnings.filterwarnings('ignore')

# More sharp and legible graphics
%config InlineBackend.figure_format = 'retina'

# Set seaborn figure labels to 'talk', to be more visible. 
sns.set_context('talk', font_scale=0.8)

## 1. Import and Set up all (MS) features with < 0.05 _q_-values. 

_Import all 472 features_

In [91]:
# Load the table of MS features that passed the q-value filter; its 'ID' column is
# used further down to pick the matching feature columns.
# NOTE(review): unlike the other inputs, this path has no 'data/' prefix - confirm the file location.
qfeatures = pd.read_excel('RCC_472features.xlsx')
#qfeatures.head()

_Import all MS features_

In [92]:
# Read the combined MS table and flip it so that every original row becomes a column.
dfMS = pd.read_excel('data/RCC_S280_combinedData_7097.xlsx')
dfMS_trans = dfMS.transpose()

# Placeholder labels 1..n, one for each column of the transposed frame.
n_columns = dfMS_trans.shape[1]
IDs = pd.Series(np.arange(1, n_columns + 1))

# Row position 4 is treated as the name row (TODO confirm against the spreadsheet layout).
# fillna aligns on the index, so each missing name receives the placeholder
# that carries the same label.
name_row = dfMS_trans.iloc[4]
dfMS_trans.iloc[4] = name_row.fillna(IDs)

In [93]:
# Rows of the transposed table that are removed before building the sample table.
annotation_rows = ['Mode', 'Molecular Weight', 'Name', 'RT [min]', 'Formula']

# Drop those rows and expose the former row labels as an 'MS_ID' column.
MS = dfMS_trans.drop(annotation_rows)
MS.index.name = 'MS_ID'
MS = MS.reset_index()

# The first remaining row supplies the column names; everything after it is data.
header = MS.iloc[0]
MS = MS.iloc[1:].rename(columns=header)
MS = MS.rename(columns={'ID': 'Sample ID'})

# Shift the row labels down by one so that they start at 0 again.
MS.index = MS.index - 1
#MS.head()

_Import dataframe containing clinical ID_

In [94]:
# Sample sheet that links every MS 'Sample ID' to a 'Patient ID'.
path = "data/Sample_List_MS.xlsx"
fields = ['Sample ID', 'Patient ID']

# Read the whole sheet, then keep only the two linking columns (in this order).
sample_sheet = pd.read_excel(path)
MSpatid = sample_sheet.loc[:, fields]
#MSpatid.head()

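`MSData` is the dataframe of MS peak intensities with the <mark>Patient ID</mark> column added (only `Sample ID` and `Patient ID` are read from the sample list, so no group labels are merged in at this step).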

In [95]:
# Attach the patient identifier to every MS sample (inner join on 'Sample ID').
MSData = MS.merge(MSpatid, on='Sample ID', how='inner')
#MSData.head()

In [96]:
# Column labels to keep: every selected feature ID, followed by the patient identifier.
lst = [qfeatures['ID'].tolist(), ['Patient ID']]
flat_list = []
for sublist in lst:
    flat_list.extend(sublist)
len(flat_list)

473

In [97]:
# Keep only the requested columns; DataFrame.filter silently skips labels that are absent.
MS_qfeatures = MSData.filter(items=flat_list)
#MS_qfeatures.head()

In [98]:
# (rows, columns) of the filtered MS table.
MS_qfeatures.shape

(280, 473)

## 2. Import and Set up all NMR features. 

In [99]:
# Load the binned NMR feature table (sample metadata plus one column per NMR feature).
dfNMR = pd.read_excel('data/binned_NMRfeatures_22JAN2020.xlsx')
#dfNMR.head()

In [100]:
# List the column labels to see which ones are metadata and which are NMR features.
dfNMR.columns

Index(['Run_ID', 'Patient ID', 'Collection', 'Gender', 'Race', 'BMI', 'Smoker',
       'Age', 'Sample_description', 'Groups', 'Yvec', 'Run_IDOrig', 'unk1',
       'unk2', 'bile_acid1', 'bile_acid2', 'HIVA', 'lactate', 'unk3',
       'acetate', 'acetone', 'unk4', 'acetoacetate', 'unk5', 'pyruvate',
       'citrate', 'DMA', 'unk6', 'methylguanidine', 'unk7', 'choline',
       'scylloinositol', 'taurine', 'acetoacetate_4HPA', 'glycine',
       'mannitol_1', 'mannitol_2', 'creatine', 'glycolate',
       'hippurate_hydroxyhippurate', 'Tatrate', 'unk8', 'unk9', 'fumarate',
       '4HPA', 'hippurate_2', 'hippurate_3', 'aminohippurate',
       'indoxylsulfate', 'hippurate_4', 'hypoxanthine_1', 'hypoxanthine_2',
       'formate', 'unk10', 'Trigonelline_1', 'Trigonellinamide_1',
       'Trigonellinamide_2', 'Trigonelline_2', 'Trigonellinamide_3', 'unk11'],
      dtype='object')

In [101]:
# Metadata columns that are discarded here; 'Patient ID' is kept as the join key.
metadata_columns = [
    'Run_ID', 'Collection', 'Gender', 'Race', 'BMI', 'Smoker', 'Age',
    'Sample_description', 'Groups', 'Yvec', 'Run_IDOrig',
]

# Reload from disk so the cell can be re-run, then strip the metadata columns.
dfNMR = pd.read_excel('data/binned_NMRfeatures_22JAN2020.xlsx').drop(columns=metadata_columns)
#dfNMR.head()

In [102]:
# Make sure the identifier column is called 'Patient ID'.
# NOTE(review): the column listing above shows 'Patient ID' and no 'Sample_ID', in which
# case this rename leaves the frame unchanged - confirm whether the cell is still needed.
dfNMR = dfNMR.rename(columns={'Sample_ID': 'Patient ID'})
#dfNMR.head()

## 3. Select Samples Common to both MS and NMR

In [103]:
# Report (rows, columns) for the NMR table first, then for the MS table.
for frame in (dfNMR, MS_qfeatures):
    print(frame.shape)

(284, 49)
(280, 473)


_Merge NMR and MS dataframes  on Patient ID_

In [104]:
# Inner join on 'Patient ID': only patients present in both the MS and the NMR table survive.
combined = MS_qfeatures.merge(dfNMR, on='Patient ID', how='inner')
combined.shape

(256, 521)

_Select Samples Common to both MS and NMR_

In [215]:
# Take both feature tables from `combined` rather than filtering each source frame
# with isin(). A filter keeps every frame in its own row order, whereas the
# correlations computed below pair values by position, so row i has to be the same
# patient in final_NMR and in final_MS. Rows of the merged frame satisfy that by
# construction. (Relies on 'Patient ID' being the only label the two frames share,
# so the merge did not add _x/_y suffixes.)
final_NMR = combined[dfNMR.columns.tolist()].copy()
final_MS = combined[MS_qfeatures.columns.tolist()].copy()
print(final_NMR.shape)
print(final_MS.shape)

(256, 49)
(256, 473)


## 4. Correlations

In [216]:
# Remove the identifier so that only feature columns remain for the correlations.
# Rebinding (instead of inplace=True) avoids mutating a frame derived from another one,
# and errors='ignore' keeps the cell re-runnable once the column is already gone.
final_NMR = final_NMR.drop(columns=['Patient ID'], errors='ignore')
final_MS = final_MS.drop(columns=['Patient ID'], errors='ignore')

In [221]:
# Minimum absolute Pearson correlation for an (NMR, MS) feature pair to be reported.
CORR_THRESHOLD = 0.30

# Cast once up front instead of once per pair inside the inner loop.
nmr_values = final_NMR.astype(float)
ms_values = final_MS.astype(float)

# Parallel lists: entry k of each list names the two features of the k-th reported pair.
NMRcorr_compd, MScorr_compd = [], []
for column in nmr_values.columns:
    for column2 in ms_values.columns:
        # np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is r.
        # Values are paired by row position, so both frames must list samples in the same order.
        r = np.corrcoef(nmr_values[column], ms_values[column2])[0, 1]
        # A NaN correlation (e.g. a constant column) fails the comparison and is skipped.
        if abs(r) >= CORR_THRESHOLD:
            NMRcorr_compd.append(column)
            MScorr_compd.append(column2)

In [222]:
# Evaluate the list of matched NMR features; the trailing semicolon hides the output.
NMRcorr_compd;

In [223]:
# Evaluate the list of matched MS features; the trailing semicolon hides the output.
MScorr_compd;

In [224]:
# One row per reported pair: the NMR feature and the MS feature it correlates with.
pair_columns = dict(NMR_Features=NMRcorr_compd, MS_Features=MScorr_compd)
corr_result = pd.DataFrame(pair_columns)
corr_result

Unnamed: 0,NMR_Features,MS_Features
0,unk1,3171
1,Tatrate,147
2,unk8,926
3,fumarate,926
4,4HPA,278
5,4HPA,6385
6,aminohippurate,474
7,indoxylsulfate,672
8,hypoxanthine_1,995
9,hypoxanthine_2,1771


In [225]:
# Import MS_labels
MS_labels = pd.read_excel('data/MS_labels.xlsx')
# Check for potential MS ID.
MSlabels_corr = MS_labels[MS_labels.ID.isin(MScorr_compd)]
MSlabels_corr

Unnamed: 0,ID,Mode,RT [min],Name,Formula
146,147,positive,2.58,147,C33 H41 N O6 P2
277,278,positive,3.057,278,
473,474,positive,1.917,474,
671,672,positive,2.66,Moxaverine,C20 H21 N O2
925,926,positive,2.25,926,C13 H29 N10 O5 P
994,995,positive,0.882,995,
1770,1771,positive,3.001,1771,C13 H24 N4 O4
1990,1991,positive,3.247,1991,
3170,3171,positive,1.131,3171,C12 H17 O2 P S
5462,5463,negative,3.758,N-Acetyl-L-methionine,C7 H13 N O3 S


### Manual Representation for 0.30 cut off

In [186]:
# Evaluate the merged MS+NMR table; the trailing semicolon hides the output.
combined;

In [189]:
# (rows, columns) of the MS feature table at this point in the notebook.
final_MS.shape

(256, 472)

In [207]:
final_MS.columns = final_MS.columns.astype(str) # cast the column labels to strings so features can be selected by string label (e.g. '1991')

In [None]:
# Absolute 2x2 correlation matrix for the last (NMR, MS) pair visited by the scan loop.
# The MS label is wrapped in str() because final_MS's column labels were cast to
# strings in the previous cell. abs() is applied to the array, since ndarrays have
# no .abs() method.
abs(np.corrcoef(final_NMR[column].astype(float),
                final_MS[str(column2)].astype(float)))

In [213]:
# Pearson correlation between features '1991' and '5463', both read from final_MS.
# NOTE(review): both inputs are MS features - confirm that an NMR-vs-MS comparison
# was not what this manual check was meant to show.
ms_1991 = final_MS['1991'].astype(float)
ms_5463 = final_MS['5463'].astype(float)
col_corr = np.corrcoef(ms_1991, ms_5463)
col_corr[1][0]

0.5256788562499721

## Function

_[extract column value based on another column pandas dataframe](https://stackoverflow.com/questions/36684013/extract-column-value-based-on-another-column-pandas-dataframe)_

In [None]:
# For every NMR feature in corr_result, collect the MS features paired with it.
# Keys follow first appearance in corr_result; values are lists of MS feature IDs.
ms_by_nmr = {
    feat: corr_result.loc[corr_result['NMR_Features'] == feat, 'MS_Features'].tolist()
    for feat in corr_result['NMR_Features'].unique()
}
ms_by_nmr
    
    
    

In [139]:
# Toy example of list.count: it returns how many times a value occurs in the list.
l = list("abb")
l.count("a")  # "a" occurs once (not displayed: only the last expression is shown)
l.count("b")  # "b" occurs twice; this is the value the cell displays

2

In [151]:
# For each row of corr_result, record the NMR feature name and the number of rows
# that carry the same name.
nmr_names = list(corr_result['NMR_Features'])
metab = list(nmr_names)
metab_counter = [nmr_names.count(name) for name in nmr_names]

In [152]:
# Show the NMR feature names, one entry per correlated pair.
metab

['unk1',
 'Tatrate',
 'unk8',
 'fumarate',
 '4HPA',
 '4HPA',
 'aminohippurate',
 'indoxylsulfate',
 'hypoxanthine_1',
 'hypoxanthine_2',
 'formate',
 'formate']

In [153]:
# Show, for each entry of metab, how many times that name occurs.
metab_counter

[1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 2]

In [147]:
# Two-column summary: each NMR feature name next to its number of occurrences.
occurrence_columns = dict(Name=metab, Occurence=metab_counter)
table = pd.DataFrame(occurrence_columns)
table

Unnamed: 0,Name,Occurence
0,unk1,1
1,Tatrate,1
2,unk8,1
3,fumarate,1
4,4HPA,2
5,4HPA,2
6,aminohippurate,1
7,indoxylsulfate,1
8,hypoxanthine_1,1
9,hypoxanthine_2,1


In [115]:
# Import MS_labels
MS_labels = pd.read_excel('data/MS_labels.xlsx')
# Check for potential MS ID.
MSlabels_corr = MS_labels[MS_labels.ID.isin(MScorr_compd)]
MSlabels_corr

Unnamed: 0,ID,Mode,RT [min],Name,Formula
146,147,positive,2.58,147,C33 H41 N O6 P2
277,278,positive,3.057,278,
473,474,positive,1.917,474,
671,672,positive,2.66,Moxaverine,C20 H21 N O2
925,926,positive,2.25,926,C13 H29 N10 O5 P
994,995,positive,0.882,995,
1770,1771,positive,3.001,1771,C13 H24 N4 O4
1990,1991,positive,3.247,1991,
3170,3171,positive,1.131,3171,C12 H17 O2 P S
5462,5463,negative,3.758,N-Acetyl-L-methionine,C7 H13 N O3 S
