# 01 - Data Preprocessing

## I. Preliminaries

Import the libraries and functions for data preprocessing.

In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from metaboDGD.util import data

Create an `outputs` directory for storing the output files.

In [None]:
# Directory that receives every output file written by this notebook.
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# later cells build their output paths from it.
dir = 'outputs/'

# exist_ok=True makes this a no-op when the directory is already present,
# so no separate existence check (and no gap between check and creation)
# is needed.
os.makedirs(dir, exist_ok=True)

## II. Retrieving metabolite abundance data

### A. Get abundance data from adjacent normal tissue samples

In [None]:
# Merge the per-cohort metabolite abundance datasets into a single dataframe;
# the call also returns the accompanying `cohorts` object
df, cohorts = data.combine_cohort_datasets('Normal')

# Transpose into a numpy array of shape (no. of samples, no. of metabolites),
# then discard the trailing column and cast what remains to float64
# NOTE(review): the dropped trailing column is presumably a non-abundance
# entry - confirm against `combine_cohort_datasets`
np_df = df.T.to_numpy()
np_df = np_df[:, :-1].astype(np.float64)

In [None]:
# Save dataframe as a CSV file inside the output directory
df_fname = 'CombinedDataset_CAMP_Normal.csv'
df.to_csv(os.path.join(dir, df_fname))

# Save cohorts dictionary as a pickle file. The context manager closes the
# file handle even when pickling raises, so no handle is leaked.
ch_fname = 'cohorts_Normal.pkl'
with open(os.path.join(dir, ch_fname), 'wb') as f:
    pickle.dump(cohorts, f)

### B. Get abundance data from tumor tissue samples

In [None]:
# Merge the per-cohort tumor abundance datasets into a single dataframe
df_tumor, cohorts_tumor = data.combine_cohort_datasets('Tumor')

# Row labels of the normal dataframe and of the freshly loaded tumor dataframe
normal_rows = set(df.index.to_list())
tumor_rows = set(df_tumor.index.to_list())

# Remove tumor rows whose label does not occur in the normal dataframe
tumor_only = list(tumor_rows - normal_rows)
df_tumor = df_tumor.drop(index=tumor_only)

# Row labels that the normal dataframe has but the tumor dataframe lacks
in_df_not_in_tumor = list(normal_rows - tumor_rows)

# Extend the tumor dataframe with those missing rows and fill them with zeros
expanded_index = df_tumor.index.union(in_df_not_in_tumor)
df_tumor = df_tumor.reindex(expanded_index)
df_tumor.loc[in_df_not_in_tumor] = 0.0

In [None]:
# Save dataframe as a CSV file inside the output directory
df_fname = 'CombinedDataset_CAMP_Tumor.csv'
df_tumor.to_csv(os.path.join(dir, df_fname))

# Save cohorts dictionary as a pickle file. The context manager closes the
# file handle even when pickling raises, so no handle is leaked.
ch_fname = 'cohorts_Tumor.pkl'
with open(os.path.join(dir, ch_fname), 'wb') as f:
    pickle.dump(cohorts_tumor, f)

## III. MarkerDB

In [None]:
# Load the tab-separated chemicals table (latin-1 encoded)
# Alternative input file: 'all_diagnostic_chemicals.tsv', read with the same options
diag_df = pd.read_csv('all_chemicals.tsv', delimiter='\t', encoding='latin-1')

# Keep the rows whose `conditions` text mentions "cancer" in any letter case;
# rows with a missing `conditions` value count as non-matches
is_cancer = diag_df['conditions'].str.contains('cancer', case=False, na=False)
cancer_df = diag_df.loc[is_cancer]

In [None]:
# Tally how many cancer-related rows carry each distinct `conditions` value
condition_counts = cancer_df['conditions'].value_counts()
condition_counts

In [None]:
# Show only the rows whose `conditions` value is exactly 'Prostate Cancer'
is_prostate = cancer_df['conditions'] == 'Prostate Cancer'
cancer_df.loc[is_prostate]