# Summary

This notebook takes a raw csv file of metabolomics peak intensity values data for mice samples with varying degrees of acute kidney injury. Missing values for the data are imputed using KNN and the data is scaled. A final csv is outputted with relevant imputed and scaled data for further analysis.

In [14]:
import pandas as pd
import os
import numpy as np
import datetime as dt
import re

In [15]:
data_dir = './data/'
today = dt.date.today()

In [16]:
metabolomics_data_raw = pd.read_csv(os.path.join(data_dir, 'lung_data_raw.csv'), header = [0,1])

In [17]:
cols = list(metabolomics_data_raw.columns)
cols = [(i,re.sub("Unnamed.*", '', y)) for i,y in cols]
metabolomics_data_raw.columns = pd.MultiIndex.from_tuples(cols)

In [18]:
# update compound names
all_compounds_names = pd.read_csv(os.path.join(data_dir, 'all_compounds.csv'))
compound_mapper = dict(zip(all_compounds_names.CmpdID, all_compounds_names.compound))

In [19]:
metabolomics_data_raw['compound'] = metabolomics_data_raw.CmpdID.map(compound_mapper)

In [20]:
sample_types = ['Normal', '4 Hr Sham', '4 Hr AKI', '24 Hr Sham', '24 Hr AKI', '7 Day Sham', '7 Day AKI']

In [21]:
metabolomics_data_raw[sample_types].isna().sum().sum() # number of missing values

0

In [22]:
(metabolomics_data_raw[sample_types] == 0).sum().sum() # number of values = 0

10

In [23]:
# replace 0 values with nan for KNN imputation later
metabolomics_data_raw[sample_types] = metabolomics_data_raw[sample_types].replace(0, np.nan).values

In [24]:
from fancyimpute import KNN
metabolomics_data_raw[sample_types] = KNN(k=5, orientation='columns').fit_transform(metabolomics_data_raw[sample_types])

ModuleNotFoundError: No module named 'fancyimpute'

In [117]:
def autoscale(cmpd_row):
    return (cmpd_row - cmpd_row.mean()) / cmpd_row.std()

In [118]:
def rangescale(cmpd_row):
    return ((cmpd_row - cmpd_row.mean())) / (cmpd_row.max() - cmpd_row.min())

In [119]:
def quantileNormalize(df_input): # row is sample, column is feature
    df = df_input.copy()
    #compute rank
    dic = {}
    for col in df:
        dic.update({col : sorted(df[col])})
    sorted_df = pd.DataFrame(dic)
    rank = sorted_df.mean(axis = 1).tolist()
    #sort
    for col in df:
        t = np.searchsorted(np.sort(df[col]), df[col])
        df[col] = [rank[i] for i in t]
    return df

In [120]:
#plasma_raw[sample_types] = quantileNormalize(plasma_raw[sample_types].T).T.values
metabolomics_data_raw[sample_types] = metabolomics_data_raw[sample_types].apply(autoscale, axis = 1).values

In [121]:
metabolomics_data_raw[sample_types].isna().sum().sum()

0

In [122]:
metabolomics_data_raw[['compound','Pathway'] + sample_types].to_csv(os.path.join(data_dir, 'autoscaled_lung_{}.csv'.format(today)), index = False)

In [30]:
metabolomics_data_scaled_pca = metabolomics_data_raw[['compound'] + sample_types]

In [32]:
metabolomics_data_scaled_pca.columns = metabolomics_data_scaled_pca.columns.get_level_values(0)

In [38]:
metabolomics_data_scaled_pca = metabolomics_data_scaled_pca.set_index('compound').T.reset_index(drop = False).rename(columns = {'index':'Label'})

In [40]:
metabolomics_data_scaled_pca.to_csv(os.path.join(data_dir, 'autoscaled_lung_for_pca_2020-06-23.csv'), index = False)