In [None]:
from scipy import stats
import pandas as pd
import numpy as np
import math
import os
import multiprocessing
import progressbar
from time import time

# Relative locations of the raw data, the preprocessed tables and the
# wavelet-transform outputs; each path ends with a trailing slash so file
# names can be appended directly.
data_path = 'gdrive/My Drive/Summer Research/hmp2-data-stanford/'
preprocessed_data_path = data_path + 'Preprocessed/'
wt_data_path = preprocessed_data_path + 'Wavelet Transform/'
paths = [preprocessed_data_path] + [wt_data_path + sub
                                    for sub in ('Denoised/', 'WT Domain/')]

# File stems (without '.csv') of the data sets handled by this notebook.
hmp_datas = [
  'cytokine_abundance',
  'gut_16s_abundance',
  'Lipidomics',
  'metabolome_abundance',
  'Metabolomics',
  'nares_16s_abundance',
  'proteome_abundance',
  'Proteomics',
  'RNAseq_abundance',
  'Targ.proteomics',
  'Transcriptomics_VST_excl_3participants',
]

# Mount Google Drive inside the Colab runtime at /content/gdrive.
# NOTE(review): the data paths above are relative ('gdrive/...'), so they
# presumably rely on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
def get_subjects():
  """Load Subjects.csv and index every subject's attributes by SubjectID.

  Returns:
    dict mapping each SubjectID to a dict of the remaining columns of that
    row (column name -> value). If a SubjectID occurs more than once, the
    last row wins.
  """
  subject_table = pd.read_csv(data_path+'Subjects.csv', index_col=False)
  by_id = {}
  for row_number in range(subject_table.shape[0]):
    attributes = dict(subject_table.iloc[row_number, :])
    # Remove the key column from the record and use it as the outer key.
    by_id[attributes.pop('SubjectID')] = attributes
  return by_id

In [None]:
def get_class_from_visit_id():
  """Map every VisitID in Visit.csv to the attributes of its subject.

  Returns:
    dict mapping VisitID -> the attribute dict produced by get_subjects()
    for the SubjectID listed on the same row of Visit.csv.

  Raises:
    KeyError: if a visit references a SubjectID that is not in Subjects.csv.
  """
  df = pd.read_csv(data_path+'Visit.csv', index_col=False,
                   usecols=['VisitID', 'SubjectID'])
  subjects = get_subjects()
  # Columns are addressed by name: `usecols` only filters columns, it does not
  # reorder them, so positional access would depend on the file's layout.
  return {visit_id: subjects[subject_id]
          for visit_id, subject_id in zip(df['VisitID'], df['SubjectID'])}

In [None]:
def get_data(f):
  """Load one raw data set, drop unusable rows and attach subject attributes.

  A row is dropped when
    * it contains any missing value,
    * fewer than 75% of its entries are truthy (i.e. non-zero), or
    * the value in its first column has no entry in the visit -> subject
      lookup, or that subject record lacks one of the class attributes.

  Args:
    f: file stem (without '.csv') of the table inside `data_path`.

  Returns:
    DataFrame with a fresh 0..n-1 index whose columns are: the original first
    column, then Race, Sex, Age, BMI, SSPG, IR_IS_classification, then the
    remaining original columns.
  """
  class_columns = ['Race', 'Sex', 'Age', 'BMI', 'SSPG', 'IR_IS_classification']

  df = pd.read_csv(data_path+f+'.csv', index_col=False)
  gc = get_class_from_visit_id()

  # Row masks are computed once for the whole frame instead of once per row.
  has_nan = df.isnull().any(axis=1).values
  mostly_zero = (df.astype('bool').mean(axis=1) < 0.75).values
  unusable = has_nan | mostly_zero

  kept_records = list()
  indices_to_drop = list()
  for i in range(len(df)):
    if unusable[i]:
      indices_to_drop.append(i)
      continue
    try:
      c = gc[df.iloc[i,0]]
      # Collect all attributes before storing any of them, so a missing key
      # can never leave the per-column values misaligned.
      record = [c[name] for name in class_columns]
    except KeyError:
      indices_to_drop.append(i)
      continue
    kept_records.append(record)

  df = df.drop(indices_to_drop)
  # Inserting at position 1 in reverse order yields Race, Sex, ..., in order.
  for position in reversed(range(len(class_columns))):
    df.insert(1, class_columns[position],
              [record[position] for record in kept_records], False)
  return df.reset_index(drop=True)

In [None]:
def save_all_data():
  """Preprocess every data set in `hmp_datas` and write it as a CSV file
  (without the index) into `preprocessed_data_path`."""
  for dataset in hmp_datas:
    cleaned = get_data(dataset)
    target = preprocessed_data_path+dataset+'.csv'
    cleaned.to_csv(target, index=False)

In [None]:
#save_all_data()

In [None]:
def check_for_nan():
  """Print, for every data set, how many missing values remain after
  get_data() has cleaned it."""
  for dataset in hmp_datas:
    missing_total = get_data(dataset).isnull().values.sum()
    print(f'{dataset}: {missing_total}')

In [None]:
def matrix_size():
  """Print the (rows, columns) shape of every preprocessed CSV file."""
  for dataset in hmp_datas:
    table = pd.read_csv(preprocessed_data_path+dataset+'.csv', index_col=False)
    print(f'{dataset}: {table.shape}')

Normalization

In [None]:
def normalization():
  """Write a log2-normalized copy of each preprocessed data set.

  For every data set except 'metabolome_abundance' and 'proteome_abundance'
  the measurement columns are replaced by log2(value + 0.1); the seven
  identifier/class columns are copied unchanged. Results are written to
  `preprocessed_data_path + 'Normalized/'`, which is created if missing.
  """
  class_cols = ['SampleID', 'Race', 'Sex', 'Age', 'BMI', 'SSPG', 'IR_IS_classification']
  targets = [j for j in hmp_datas
             if j not in ['metabolome_abundance', 'proteome_abundance']]

  widgets = [' [',
        progressbar.Timer(format= 'elapsed time: %(elapsed)s'),
        '] ',
          progressbar.Bar('#'),' (',
          progressbar.ETA(), ') ',
          progressbar.Counter(format='%(value)d/%(max_value)d')
          ]
  bar = progressbar.ProgressBar(max_value=len(targets), widgets=widgets).start()

  out_dir = preprocessed_data_path+'Normalized/'
  os.makedirs(out_dir, exist_ok=True)

  # Offset added before the logarithm so that zero entries stay finite.
  add = 0.1

  for count, j in enumerate(targets, start=1):
    # Read the file once and split it by column name; this also keeps the
    # class columns correctly labelled whatever their order in the file.
    raw = pd.read_csv(preprocessed_data_path+j+'.csv', index_col=False)
    sid_and_class = raw[class_cols]
    features = raw.drop(class_cols, axis=1)

    normalized = np.log2(features + add)
    df = pd.concat([sid_and_class, normalized], axis=1, ignore_index=True)
    df.columns = pd.Index(class_cols + list(features.columns))

    df.to_csv(out_dir+j+'.csv', index=False)
    # Advance the bar only after the data set has actually been written.
    bar.update(count)

  bar.finish()

Wavelet Transform

In [None]:
def fourWTM(n):
  """Build the n-by-n transform matrix of the four-filter wavelet bank.

  With k = n // 4, rows j*k .. (j+1)*k - 1 belong to filter h_j (j = 0..3).
  Row i of a band holds the 8 filter taps starting at column 4*i; taps that
  would run past column 4*k - 1 wrap around to the first columns.

  Taps that wrap onto an already-used column are summed. This only happens
  for n == 4, where each row becomes h_j[:4] + h_j[4:] instead of just the
  second half of the filter; for n >= 8 every tap lands on its own column.

  Args:
    n: size of the matrix. Callers in this notebook pass a multiple of 4;
      for other values the rows and columns beyond 4*k stay zero.

  Returns:
    numpy array of shape (n, n).
  """
  #Filter banks
  h0 = np.array([0.2697890,0.3947890,0.5197890,0.6447890,0.2302110,0.1052110,-0.0197890,-0.1447890])
  h1 = np.array([-0.2825435,0.5553379,0.2385187,-0.0783004, -0.5834819,-0.2666627,0.0501564,0.3669755])
  h2 = np.array([0.4125840,-0.6279376,0.3727824,0.1487574, -0.4125840,-0.1885590,0.0354659,0.2594909])
  h3 = np.array([0.2382055,0.1088646,-0.7275830,0.5572896, -0.2382055,-0.1088646,0.0204763,0.1498171])
  #Matrix of filter banks created for convenience
  h = np.array([h0,h1,h2,h3])

  k = n // 4
  T = np.zeros((n,n))
  taps = np.arange(8)
  for j in range(4):
    for i in range(k):
      # Column of every tap, wrapped periodically into 0 .. 4*k - 1.
      columns = (4*i + taps) % (4*k)
      # np.add.at accumulates repeated indices, which is what the wrap needs
      # when the filter is longer than the signal (n == 4).
      np.add.at(T, (k*j+i, columns), h[j])
  return T

In [None]:
def four_Wavelet_Transform(data_path, f):
  """Apply the four-band wavelet transform to one data set and save both the
  transformed and the denoised table.

  The measurement columns are zero-padded on the right, multiplied by the
  transposed matrix from fourWTM(), and written to
  `data_path + 'WT Domain/' + f + '.csv'`. The three detail bands are then
  thresholded per sample (coefficients with magnitude below
  std * sqrt(2 * ln(dim)) are set to zero), mapped back with the rows of the
  transform matrix, and written to `data_path + 'Denoised/' + f + '.csv'`.
  Both output folders are created if they do not exist.

  Args:
    data_path: folder (with trailing slash) that contains `f + '.csv'`.
    f: file stem of the data set.
  """
  class_cols = ['SampleID', 'Race', 'Sex', 'Age', 'BMI', 'SSPG', 'IR_IS_classification']

  #Import HMP data (read once, then split by column name)
  raw = pd.read_csv(data_path+f+'.csv', index_col=False)
  sid_and_class = raw[class_cols]
  hmp = raw.drop(class_cols, axis=1)

  # Pad the width up to a multiple of 4. Note that this always adds 1-4
  # columns (a full group of 4 when the width already is a multiple of 4);
  # left unchanged so the output width stays the same as before.
  n_features = hmp.shape[1]
  n = n_features + 4 - (n_features % 4)
  pad = n - n_features
  t = fourWTM(n)

  # Padding columns get an empty header.
  cols = class_cols + list(hmp.columns) + ['']*pad

  hmp_array = np.concatenate((np.array(hmp), np.zeros((hmp.shape[0], pad))), axis=1)
  ts = np.matmul(hmp_array, t.T)

  #Create WT Domain DataFrame
  wt_type = 'WT Domain'
  df = pd.concat([sid_and_class, pd.DataFrame(data=ts)], axis=1, ignore_index=True)
  df.columns = pd.Index(cols)

  os.makedirs(data_path+wt_type, exist_ok=True)
  df.to_csv(data_path+wt_type+'/'+f+'.csv', index=False)

  #Create Denoised DataFrame
  wt_type = 'Denoised'
  dim = int(n/4)
  # Contribution of the first band, mapped back without thresholding.
  A1 = np.matmul(ts[:, 0:dim], t[0:dim, :])
  threshold_factor = math.sqrt(2*math.log(dim))
  D = np.zeros((hmp.shape[0],n))
  for j in range(3):
    band = ts[:, (j+1)*dim:(j+2)*dim].copy()
    #Denoise details: one threshold per sample (row) and band
    lbda = np.std(band, axis=1, keepdims=True)*threshold_factor
    band[np.abs(band) < lbda] = 0
    D += np.matmul(band, t[(j+1)*dim:(j+2)*dim, :])

  s = A1+D
  df = pd.concat([sid_and_class, pd.DataFrame(data=s)], axis=1, ignore_index=True)
  df.columns = pd.Index(cols)

  os.makedirs(data_path+wt_type, exist_ok=True)
  df.to_csv(data_path+wt_type+'/'+f+'.csv', index=False)

In [None]:
def fourWT_just_wt_domain(data_path, f):
  """Apply the four-band wavelet transform to one data set and save only the
  transformed (WT Domain) table.

  The measurement columns are zero-padded on the right, multiplied by the
  transposed matrix from fourWTM(), and written together with the seven
  identifier/class columns to `data_path + 'WT Domain/' + f + '.csv'`.
  The output folder is created if it does not exist.

  Args:
    data_path: folder (with trailing slash) that contains `f + '.csv'`.
    f: file stem of the data set.
  """
  class_cols = ['SampleID', 'Race', 'Sex', 'Age', 'BMI', 'SSPG', 'IR_IS_classification']

  #Import HMP data (read once, then split by column name)
  raw = pd.read_csv(data_path+f+'.csv', index_col=False)
  sid_and_class = raw[class_cols]
  hmp = raw.drop(class_cols, axis=1)

  # Pad the width up to a multiple of 4. Note that this always adds 1-4
  # columns (a full group of 4 when the width already is a multiple of 4);
  # left unchanged so the output width stays the same as before.
  n_features = hmp.shape[1]
  n = n_features + 4 - (n_features % 4)
  pad = n - n_features
  t = fourWTM(n)

  # Padding columns get an empty header.
  cols = class_cols + list(hmp.columns) + ['']*pad

  hmp_array = np.concatenate((np.array(hmp), np.zeros((hmp.shape[0], pad))), axis=1)
  ts = np.matmul(hmp_array, t.T)

  #Create WT Domain DataFrame
  wt_type = 'WT Domain'
  df = pd.concat([sid_and_class, pd.DataFrame(data=ts)], axis=1, ignore_index=True)
  df.columns = pd.Index(cols)

  os.makedirs(data_path+wt_type, exist_ok=True)
  df.to_csv(data_path+wt_type+'/'+f+'.csv', index=False)

In [None]:
def rna_just_wt_domain():
  """Run fourWT_just_wt_domain() for the two RNA data sets, on both the plain
  and the normalized folder, with one child process per (folder, data set).

  All processes are started first; progress is printed as 'done/total' each
  time a process has been joined, followed by 'Done'.
  """
  folders = [preprocessed_data_path, preprocessed_data_path+'Normalized/']
  datasets = ['Transcriptomics_VST_excl_3participants','RNAseq_abundance']

  workers = [multiprocessing.Process(target=fourWT_just_wt_domain, args=(folder, dataset))
             for folder in folders
             for dataset in datasets]

  for worker in workers:
    worker.start()

  total = len(workers)
  for finished, worker in enumerate(workers, start=1):
    worker.join()
    print(str(finished) + '/' + str(total))

  print('Done')

In [None]:
def transform_small_data():
  """Run four_Wavelet_Transform() in parallel on every small data set.

  One child process is started per (folder, data set) pair, for the plain and
  the normalized folder, skipping the two excluded abundance tables and the
  two RNA tables. The progress bar advances each time a process has finished.
  """
  excluded = ['metabolome_abundance', 'proteome_abundance',
              'Transcriptomics_VST_excl_3participants','RNAseq_abundance']
  jobs = [(j, i)
          for j in [preprocessed_data_path, preprocessed_data_path+'Normalized/']
          for i in hmp_datas
          if i not in excluded]

  widgets = [' [',
        progressbar.Timer(format= 'elapsed time: %(elapsed)s'),
        '] ',
          progressbar.Bar('#'),' (',
          progressbar.ETA(), ') ',
          progressbar.Counter(format='%(value)d/%(max_value)d')
          ]
  # The bar length is taken from the actual number of jobs.
  bar = progressbar.ProgressBar(max_value=len(jobs), widgets=widgets).start()

  processes = list()
  for job in jobs:
    p = multiprocessing.Process(target=four_Wavelet_Transform, args=job)
    processes.append(p)
    p.start()

  for count, p in enumerate(processes, start=1):
    # Wait first, then report: the counter shows finished jobs only.
    p.join()
    bar.update(count)

  bar.finish()

In [None]:
def four_Wavelet_Transform_big_denoised(data_path, f):
  """Apply the four-band wavelet transform to one data set and save only the
  denoised table.

  The measurement columns are zero-padded on the right and multiplied by the
  transposed matrix from fourWTM(). In each of the three detail bands,
  coefficients whose magnitude is below std * sqrt(2 * ln(dim)) (computed per
  sample and band) are set to zero. The result is mapped back with the rows
  of the transform matrix and written, together with the seven
  identifier/class columns, to `data_path + 'Denoised/' + f + '.csv'`.
  The output folder is created if it does not exist.

  Args:
    data_path: folder (with trailing slash) that contains `f + '.csv'`.
    f: file stem of the data set.
  """
  class_cols = ['SampleID', 'Race', 'Sex', 'Age', 'BMI', 'SSPG', 'IR_IS_classification']

  #Import HMP data (read once, then split by column name)
  raw = pd.read_csv(data_path+f+'.csv', index_col=False)
  sid_and_class = raw[class_cols]
  hmp = raw.drop(class_cols, axis=1)

  # Pad the width up to a multiple of 4. Note that this always adds 1-4
  # columns (a full group of 4 when the width already is a multiple of 4);
  # left unchanged so the output width stays the same as before.
  n_features = hmp.shape[1]
  n = n_features + 4 - (n_features % 4)
  pad = n - n_features
  t = fourWTM(n)

  # Padding columns get an empty header.
  cols = class_cols + list(hmp.columns) + ['']*pad

  hmp_array = np.concatenate((np.array(hmp), np.zeros((hmp.shape[0], pad))), axis=1)
  ts = np.matmul(hmp_array, t.T)

  widgets = [' [',
        progressbar.Timer(format= 'elapsed time: %(elapsed)s'),
        '] ',
          progressbar.Bar('#'),' (',
          progressbar.ETA(), ') ',
          progressbar.Counter(format='%(value)d/%(max_value)d')
          ]
  bar = progressbar.ProgressBar(max_value=3, widgets=widgets).start()

  #Create Denoised DataFrame
  wt_type = 'Denoised'
  dim = int(n/4)
  # Contribution of the first band, mapped back without thresholding.
  A1 = np.matmul(ts[:, 0:dim], t[0:dim, :])
  # Working copy of the three detail bands, laid out side by side.
  d = ts[:, dim:].copy()
  threshold_factor = math.sqrt(2*math.log(dim))

  for j in range(3):
    # `band` is a view, so the masked assignment below modifies `d` in place.
    band = d[:, j*dim:(j+1)*dim]
    #Denoise details: one threshold per sample (row) and band
    lbda = np.std(band, axis=1, keepdims=True)*threshold_factor
    band[np.abs(band) < lbda] = 0
    bar.update(j+1)
  bar.finish()

  D = np.matmul(d, t[dim:, :])

  s = A1+D
  df = pd.concat([sid_and_class, pd.DataFrame(data=s)], axis=1, ignore_index=True)
  df.columns = pd.Index(cols)

  os.makedirs(data_path+wt_type, exist_ok=True)
  df.to_csv(data_path+wt_type+'/'+f+'.csv', index=False)

In [None]:
def transform_big_data():
  """Run four_Wavelet_Transform_big_denoised() for the two RNA data sets, on
  both the plain and the normalized folder, one child process per
  (folder, data set), and wait for all of them to finish."""
  folders = [preprocessed_data_path, preprocessed_data_path+'Normalized/']
  datasets = ['Transcriptomics_VST_excl_3participants','RNAseq_abundance']
  job_args = [(folder, dataset) for folder in folders for dataset in datasets]

  workers = []
  for args in job_args:
    worker = multiprocessing.Process(target=four_Wavelet_Transform_big_denoised, args=args)
    workers.append(worker)
    worker.start()

  for worker in workers:
    worker.join()

In [None]:
# Denoise the two RNA data sets (plain and normalized folders) in parallel.
transform_big_data()

 [elapsed time: 0:00:13] |###############################| (ETA:  00:00:00) 3/3

MAD Dimension Reduction

In [None]:
def reduce_dims_mad(data_path, rna_data, threshold):
  """Keep the measurement columns with the highest relative MAD.

  Every measurement column is scored with
  median(|x - median(x)|) / median(x). Columns are ranked by descending
  score, and the shortest leading run whose cumulative score reaches
  `threshold` times the total score is kept.

  Columns whose score is NaN or infinite (for example a zero median) are not
  eligible. If the target is never reached, all eligible columns are kept.
  NOTE(review): a negative median gives a negative score, which ranks the
  column last - confirm this is the intended treatment for log-scaled data.

  Args:
    data_path: folder (with trailing slash) that contains the CSV file.
    rna_data: file stem (without '.csv') of the table.
    threshold: fraction of the total score the kept columns must reach.

  Returns:
    DataFrame holding the seven identifier/class columns followed by the
    kept measurement columns in descending score order.
  """
  class_cols = ['SampleID', 'Race', 'Sex', 'Age', 'BMI', 'SSPG', 'IR_IS_classification']

  raw = pd.read_csv(data_path+rna_data+'.csv', index_col=False)
  sid_and_class = raw[class_cols]
  df = raw.drop(class_cols, axis=1)

  # Column-wise MAD computed directly with numpy (unscaled, NaN propagates),
  # which does not depend on which SciPy MAD function is available.
  values = np.asarray(df, dtype=float)
  med = np.median(values, axis=0)
  mad = np.median(np.abs(values - med), axis=0)
  with np.errstate(divide='ignore', invalid='ignore'):
    ratio = mad / med
  ratio[~np.isfinite(ratio)] = np.nan

  # sort_values returns a new object, so the result has to be kept; a stable
  # sort keeps tied columns in file order.
  scores = (pd.Series(ratio, index=df.columns)
            .dropna()
            .sort_values(ascending=False, kind='mergesort'))

  # First position at which the running total reaches the target.
  cumulative = scores.cumsum().values
  reached = np.nonzero(cumulative >= threshold*scores.sum())[0]
  count = int(reached[0]) + 1 if reached.size else len(scores)
  selected = list(scores.index[:count])

  # Select the ranked columns by name, not the first `count` file columns.
  out = pd.concat([sid_and_class, df[selected]], axis=1, ignore_index=True)
  out.columns = pd.Index(class_cols + selected)
  return out

In [None]:
def only_genus():
  """Write copies of the gut and nares 16S tables that keep only the columns
  whose name starts with 'genus'.

  This is done for the plain, wavelet-domain, denoised and normalized
  variants. Each result holds the seven identifier/class columns followed by
  the genus columns and is written below
  `preprocessed_data_path + 'Genus only/'`, mirroring the sub-folder of the
  source file. Output folders are created if they do not exist.
  """
  class_cols = ['SampleID', 'Race', 'Sex', 'Age', 'BMI', 'SSPG', 'IR_IS_classification']
  gut_data = 'gut_16s_abundance'
  nares_data = 'nares_16s_abundance'

  for subfolder in ['', 'WT Domain/', 'Denoised/', 'Normalized/',
                    'Normalized/WT Domain/', 'Normalized/Denoised/']:
    # makedirs also creates the parent 'Genus only/' folder when needed.
    out_dir = preprocessed_data_path+'Genus only/'+subfolder
    os.makedirs(out_dir, exist_ok=True)

    for j in [gut_data, nares_data]:
      # Read the file once and split it by column name.
      raw = pd.read_csv(preprocessed_data_path+subfolder+j+'.csv', index_col=False)
      sid_and_class = raw[class_cols]
      features = raw.drop(class_cols, axis=1)
      df_cols_to_keep = [c for c in features.columns if c.startswith('genus')]

      df = pd.concat([sid_and_class, features.loc[:, df_cols_to_keep]],
                     axis=1, ignore_index=True)
      df.columns = pd.Index(class_cols + df_cols_to_keep)

      df.to_csv(out_dir+j+'.csv', index=False)