# NMR Assisted MS Metabolite Identification

Author: Olatomiwa Bifarin<br>
Department of Biochemistry and Molecular Biology<br>
University of Georgia<br>
Edison Lab<br>

Last edited: 15MAR2020

_This is a static version of a Jupyter notebook, and work (documentation) is still in progress_ 

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import style
import scipy
#For Seaborn plots
import seaborn as sns; sns.set(style='white')
#To ignore warning
import warnings
warnings.filterwarnings('ignore')

# More sharp and legible graphics
%config InlineBackend.figure_format = 'retina'

# Set seaborn figure labels to 'talk', to be more visible. 
sns.set_context('talk', font_scale=0.8)

## 1. Import and Set up all (MS) features with < 0.05 _q_-values. 

_Import all 472 features_

In [2]:
# Load the table of selected MS features; its 'ID' column is used further down
# to pick the matching feature columns out of the full MS matrix.
# NOTE(review): unlike the other inputs, this file is read from the working
# directory rather than from 'data/' — confirm the path is intended.
qfeatures = pd.read_excel('RCC_472features.xlsx')
#qfeatures.head()

_Import all MS features_

In [3]:
# Read the full MS table and flip it so that every original row becomes a column.
dfMS = pd.read_excel('data/RCC_S280_combinedData_7097.xlsx')
dfMS_trans = dfMS.T # Transpose dataframe
# One integer per column of the transposed table, counting from 1 up to the number of columns (inclusive).
IDs = np.arange(1, dfMS_trans.shape[1] + 1) # Sequential numbers used to replace nan values in the name row
IDs = pd.Series(IDs) # Wrap in a Series (0-based index) so it can be handed to fillna
# NOTE(review): position 4 is presumed to be the 'Name' row — confirm against the
# spreadsheet layout; selecting it with .loc['Name'] would not depend on row order.
# NOTE(review): fillna with a Series matches on labels, so this presumably relies on
# the transposed columns being labelled 0..n-1 — verify.
dfMS_trans.iloc[4] = dfMS_trans.iloc[4].fillna(IDs) # replace nan with the IDs, and replace the name row

In [4]:
# Remove the feature-metadata rows, then move the remaining row labels into
# an ordinary column called 'MS_ID'.
MS = (
    dfMS_trans
    .drop(['Mode', 'Molecular Weight', 'Name', 'RT [min]', 'Formula'])
    .rename_axis('MS_ID')
    .reset_index()
)

# The first remaining row carries the labels that should become the column names.
header = MS.iloc[0]

MS = (
    MS.iloc[1:]                              # every row below the header row
    .rename(columns=header)                  # promote the header row to column names
    .rename(columns={'ID': 'Sample ID'})     # the label column is the sample identifier
    .reset_index(drop=True)                  # reset the row index so it starts from 0
)
#MS.head()

_Import dataframe containing clinical ID_

In [5]:
# Sample sheet for the MS runs; only the two key columns are kept.
path = "data/Sample_List_MS.xlsx"
fields = ['Sample ID', 'Patient ID']

MSpatid = pd.read_excel(path).loc[:, fields]
#MSpatid.head()

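`MSData` is the dataframe of MS peak intensities with the <mark>Patient ID</mark> attached. Note that only `Sample ID` and `Patient ID` are read from the sample list above, so a <mark>Groups</mark> column is not added at this step.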

In [6]:
# Attach the patient identifier to every MS sample row, matching on 'Sample ID'.
MSData = MS.merge(MSpatid, on='Sample ID')
#MSData.head()

In [7]:
# Column labels to keep: every selected feature ID, followed by the patient key.
lst = [qfeatures['ID'].tolist(), ['Patient ID']]
flat_list = lst[0] + lst[1]  # join the two sub-lists into a single flat list
len(flat_list)

473

In [8]:
# Keep only the selected feature columns plus 'Patient ID'.
# (filter() silently skips labels that are not present in MSData.)
MS_qfeatures = MSData.filter(items=flat_list)
#MS_qfeatures.head()

In [9]:
MS_qfeatures.shape

(280, 473)

## 2. Import and set up all NMR features.

In [10]:
# Load the binned NMR table so its columns can be inspected in the next cell.
# The same file is read again further down before the unwanted columns are dropped.
dfNMR = pd.read_excel('data/binned_NMRfeatures_22JAN2020.xlsx')
#dfNMR.head()

In [11]:
dfNMR.columns

Index(['Run_ID', 'Patient ID', 'Collection', 'Gender', 'Race', 'BMI', 'Smoker',
       'Age', 'Sample_description', 'Groups', 'Yvec', 'Run_IDOrig', 'unk1',
       'unk2', 'bile_acid1', 'bile_acid2', 'HIVA', 'lactate', 'unk3',
       'acetate', 'acetone', 'unk4', 'acetoacetate', 'unk5', 'pyruvate',
       'citrate', 'DMA', 'unk6', 'methylguanidine', 'unk7', 'choline',
       'scylloinositol', 'taurine', 'acetoacetate_4HPA', 'glycine',
       'mannitol_1', 'mannitol_2', 'creatine', 'glycolate',
       'hippurate_hydroxyhippurate', 'Tatrate', 'unk8', 'unk9', 'fumarate',
       '4HPA', 'hippurate_2', 'hippurate_3', 'aminohippurate',
       'indoxylsulfate', 'hippurate_4', 'hypoxanthine_1', 'hypoxanthine_2',
       'formate', 'unk10', 'Trigonelline_1', 'Trigonellinamide_1',
       'Trigonellinamide_2', 'Trigonelline_2', 'Trigonellinamide_3', 'unk11'],
      dtype='object')

In [12]:
# Reload the NMR table and discard the columns that are not needed downstream;
# 'Patient ID' is deliberately kept as the join key.
nmr_columns_to_drop = [
    'Run_ID', 'Collection', 'Gender', 'Race', 'BMI', 'Smoker', 'Age',
    'Sample_description', 'Groups', 'Yvec', 'Run_IDOrig',
]
dfNMR = (
    pd.read_excel('data/binned_NMRfeatures_22JAN2020.xlsx')
    .drop(columns=nmr_columns_to_drop)
)
#dfNMR.head()

In [13]:
# Make sure the patient key column is called 'Patient ID', the name used for the merge below.
# NOTE(review): the column listing printed earlier already contains 'Patient ID'
# and no 'Sample_ID', so this rename appears to change nothing — confirm it is still needed.
dfNMR.rename(columns={'Sample_ID':'Patient ID'}, inplace = True) # Rename column name
#dfNMR.head()

## 3. Select Samples Common to both MS and NMR

In [14]:
# Row/column counts of the NMR table and the MS table, one per line.
print(dfNMR.shape, MS_qfeatures.shape, sep='\n')

(284, 49)
(280, 473)


_Merge NMR and MS dataframes  on Patient ID_

In [15]:
# Join the MS and NMR tables on the shared 'Patient ID' column. pd.merge defaults
# to an inner join, so only patients present in both tables are kept.
combined = pd.merge(MS_qfeatures, dfNMR, on='Patient ID') # Merge the two dataframes on Patient ID
combined.shape

(256, 521)

_Select Samples Common to both MS and NMR_

In [16]:
# Split the merged table back into its NMR half and its MS half.
#
# Both halves are sliced out of `combined`, so row i of final_NMR and row i of
# final_MS always belong to the same patient. Filtering dfNMR and MS_qfeatures
# separately with isin() keeps each table's own row order, which is not
# guaranteed to match; the correlations computed later pair values purely by
# row position and would then compare different patients.
final_NMR = combined[dfNMR.columns.tolist()].copy()
final_MS = combined[MS_qfeatures.columns.tolist()].copy()
print(final_NMR.shape)
print(final_MS.shape)

(256, 49)
(256, 473)


## 4. Correlations

In [17]:
# Remove the patient key so that only feature columns remain for the correlations.
# Reassigning (rather than inplace=True) avoids mutating a frame that was derived
# by filtering another one, and errors='ignore' lets this cell be re-run without
# raising a KeyError once the column is already gone.
final_NMR = final_NMR.drop(columns=['Patient ID'], errors='ignore')
final_MS = final_MS.drop(columns=['Patient ID'], errors='ignore')

In [18]:
# Screen every NMR feature against every MS feature and record the pairs whose
# absolute Pearson correlation exceeds the threshold.
# Values are paired by row position, so both tables must list the same
# samples in the same order.
CORR_THRESHOLD = 0.50  # minimum |r| for a pair to be reported

# Convert each table to float once, up front, instead of once per pair inside the loops.
nmr_values = final_NMR.astype(float)
ms_values = final_MS.astype(float)

NMRcorr_compd, MScorr_compd = [], []
for column, nmr_series in nmr_values.items():
    for column2, ms_series in ms_values.items():
        # np.corrcoef returns a 2x2 matrix; the off-diagonal entry is r(column, column2).
        r = np.corrcoef(nmr_series.values, ms_series.values)[0, 1]
        # A NaN correlation (e.g. from a constant column) compares False and is skipped.
        if abs(r) > CORR_THRESHOLD:
            # Parallel lists: entry k of each list together forms one correlated pair.
            NMRcorr_compd.append(column)
            MScorr_compd.append(column2)

In [19]:
NMRcorr_compd;

In [20]:
MScorr_compd;

In [21]:
# Tabulate the correlated pairs: one row per (NMR feature, MS feature) hit.
correlated_pairs = {
    'NMR Features': NMRcorr_compd,
    'MS Features': MScorr_compd,
}
corr_result = pd.DataFrame(correlated_pairs)
corr_result

Unnamed: 0,NMR Features,MS Features
0,4HPA,6385


In [22]:
# Import MS_labels
MS_labels = pd.read_excel('data/MS_labels.xlsx')
# Check for potential MS ID.
MSlabels_corr = MS_labels[MS_labels.ID.isin(MScorr_compd)]
MSlabels_corr

Unnamed: 0,ID,Mode,RT [min],Name,Formula
6384,6385,negative,2.655,6385,C8 H7 N2 O P
