# 01 - Data Preprocessing

## A - Libraries

In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from metaboDGD.util import data

## B - Retrieving Separate Datasets (NORMAL)

In [2]:
# Combine the CAMP cohort datasets for the 'Normal' group into one dataframe.
# The helper returns two values: the combined dataframe (`df`) and `cohorts`,
# which is pickled unchanged further down in this notebook.
# NOTE(review): later cells transpose `df` to get (# samples, # metabolites),
# so `df` itself is presumably (metabolites x samples) - confirm in data.py.
df, cohorts = data.combine_cohort_datasets('Normal')

In [3]:
# Work on the transposed frame once: rows become samples, columns metabolites.
df_t = df.T
df_rows = list(df_t.index)
df_cols = list(df_t.columns)

# Numeric matrix of dim (# samples, # metabolites). The last column of the
# transposed frame is left out before casting to float64.
# NOTE(review): presumably that last column is not a metabolite value
# (e.g. a label) - confirm against data.combine_cohort_datasets.
np_df = df_t.to_numpy()[:, :-1].astype(np.float64)

# Mask of cells that are exactly 0.0 in the original data
np_df_zm = (np_df == 0)

# Compute 2 ** x element-wise, but keep the original 0.0 cells at 0.0
# instead of letting them become 2 ** 0 == 1.
np_exp = np.where(np_df_zm, 0.0, np.exp2(np_df))

# Rebuild a dataframe with the same sample index and the retained columns
df_exp = pd.DataFrame(np_exp, index=df_rows, columns=df_cols[:-1])


In [4]:
# Persist the combined Normal dataframe (CSV) and its cohorts object (pickle)
# under outputs/.
# NOTE(review): `dir` shadows the builtin of the same name; the name is kept
# here so that any other cell reading it keeps working.
dir = 'outputs/'
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(dir, exist_ok=True)

df_fname = 'CombinedDataset_CAMP_Normal.csv'
df.to_csv(dir + df_fname)

# Saving the exponentiated dataframe is currently disabled.
# df_exp_fname = 'Exponent_CombinedDataset_CAMP_Normal.csv'
# df_exp.to_csv(dir + df_exp_fname)

ch_fname = 'cohorts_Normal.pkl'
# The context manager closes the file even if pickle.dump raises.
with open(dir + ch_fname, 'wb') as f:
    pickle.dump(cohorts, f)

## C - Retrieving Separate Datasets (TUMOR)

In [5]:
# Combine the cohort datasets for the 'Tumor' group, using the same helper as
# for the 'Normal' group: returns the combined dataframe and a cohorts object.
df_tumor, cohorts_tumor = data.combine_cohort_datasets('Tumor')

In [6]:
# Align the row labels (index) of df_tumor with those of the Normal df.

# 1) Remove rows whose label does not occur in df's index.
tumor_only = df_tumor.index.difference(df.index, sort=False)
df_tumor = df_tumor.drop(index=tumor_only)

# 2) Collect the labels that df has but df_tumor lacks.
in_df_not_in_tumor = list(df.index.difference(df_tumor.index, sort=False))

# 3) Add those labels as new rows and fill them with 0.0 in every column.
# NOTE(review): Index.union may sort the combined labels, so the resulting
# row order is not guaranteed to match df's row order - confirm that
# downstream code selects rows by label rather than by position.
full_index = df_tumor.index.union(in_df_not_in_tumor)
df_tumor = df_tumor.reindex(full_index)
df_tumor.loc[in_df_not_in_tumor] = 0.0

In [7]:
# Persist the aligned Tumor dataframe (CSV) and its cohorts object (pickle)
# under outputs/.
# NOTE(review): `dir` shadows the builtin of the same name; the name is kept
# here so that any other cell reading it keeps working.
dir = 'outputs/'
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(dir, exist_ok=True)

df_fname = 'CombinedDataset_CAMP_Tumor.csv'
df_tumor.to_csv(dir + df_fname)

# Saving the exponentiated dataframe is currently disabled.
# df_exp_fname = 'Exponent_CombinedDataset_CAMP.csv'
# df_exp.to_csv(dir + df_exp_fname)

ch_fname = 'cohorts_Tumor.pkl'
# The context manager closes the file even if pickle.dump raises.
with open(dir + ch_fname, 'wb') as f:
    pickle.dump(cohorts_tumor, f)