# 01 - Data Preprocessing

## I. Preliminaries

Import the libraries and functions for data preprocessing.

In [1]:
import os
import pickle
import pandas as pd
import numpy as np
from metaboDGD.util import data

Create the `sept_outputs/` directory for storing the output files.

In [2]:
# Directory that receives the CSV / pickle outputs written by later cells.
# NOTE: the name `dir` shadows the Python builtin; it is kept because the
# later cells of this notebook reference it.
dir = 'sept_outputs/'

# exist_ok=True makes the cell safely re-runnable without a separate
# existence check (no check-then-create race), and still raises if the path
# exists but is not a directory instead of failing later on the first write.
os.makedirs(dir, exist_ok=True)

## II. Retrieving metabolite abundance data

### A. Get abundance data from adjacent normal tissue samples

In [3]:
# Merge the per-cohort metabolite abundance datasets ('Normal' samples) into
# a single dataframe; the helper also returns the accompanying cohorts object.
df, cohorts = data.combine_cohort_datasets('Normal')

# Transpose the dataframe and turn it into a numpy array, so that rows of the
# array correspond to the dataframe's columns.
samples_by_metabolite = df.T.to_numpy()

# Drop the last column of the transposed array and cast the rest to float64.
# NOTE(review): intended shape is (no. of samples, no. of metabolites);
# the dropped trailing column is presumably a non-abundance row of `df` -- confirm.
np_df = samples_by_metabolite[:, :-1].astype(np.float64)

In [4]:
# Save the combined normal-sample dataframe as a CSV file inside `dir`
df_fname = 'CombinedDataset_CAMP_Normal.csv'
df.to_csv(os.path.join(dir, df_fname))

# Save the cohorts dictionary as a pickle file inside `dir`.
# The `with` block guarantees the file is closed even if pickling fails.
ch_fname = 'cohorts_Normal.pkl'
with open(os.path.join(dir, ch_fname), 'wb') as f:
    pickle.dump(cohorts, f)

In [5]:
# Row labels of the combined normal dataframe, one per metabolite
metabolite_list = df.index.to_list()

# Write one label per line. The file is created relative to the current
# working directory (not under `dir`). The `with` block closes the file even
# if a write fails part-way through.
with open('metabolite_list.txt', 'w') as f:
    f.writelines(f'{m}\n' for m in metabolite_list)

In [6]:
# Column labels of the combined normal dataframe, one per sample
normal_list = df.columns.to_list()

# Write one label per line. The file is created relative to the current
# working directory (not under `dir`). The `with` block closes the file even
# if a write fails part-way through.
with open('normal_sample_list.txt', 'w') as f:
    f.writelines(f'{m}\n' for m in normal_list)

### B. Get abundance data from tumor tissue samples

In [7]:
# Merge the per-cohort metabolite abundance datasets ('Disease' and 'Tumor'
# samples) into a single dataframe
df_tumor, cohorts_tumor = data.combine_cohort_datasets(['Disease', 'Tumor'])

# Row labels present in the normal and in the tumor dataframe
normal_rows = set(df.index.to_list())
tumor_rows = set(df_tumor.index.to_list())

# Remove the rows of df_tumor whose labels do not occur in df
df_tumor = df_tumor.drop(index=list(tumor_rows - normal_rows))

# Row labels of df that df_tumor does not have
in_df_not_in_tumor = list(normal_rows - tumor_rows)

# Add the missing rows to df_tumor and fill them with zeros.
# NOTE(review): the resulting row order is whatever Index.union returns, which
# may differ from the row order of `df` -- confirm downstream code does not
# assume the two dataframes are row-aligned.
combined_rows = df_tumor.index.union(in_df_not_in_tumor)
df_tumor = df_tumor.reindex(combined_rows)
df_tumor.loc[in_df_not_in_tumor] = 0.0

In [8]:
# Save the combined tumor-sample dataframe as a CSV file inside `dir`
df_fname = 'CombinedDataset_CAMP_Tumor.csv'
df_tumor.to_csv(os.path.join(dir, df_fname))

# Save the tumor cohorts dictionary as a pickle file inside `dir`.
# The `with` block guarantees the file is closed even if pickling fails.
ch_fname = 'cohorts_Tumor.pkl'
with open(os.path.join(dir, ch_fname), 'wb') as f:
    pickle.dump(cohorts_tumor, f)

In [9]:
# Column labels of the combined tumor dataframe, one per sample
tumor_list = df_tumor.columns.to_list()

# Write one label per line. The file is created relative to the current
# working directory (not under `dir`). The `with` block closes the file even
# if a write fails part-way through.
with open('disease_sample_list.txt', 'w') as f:
    f.writelines(f'{m}\n' for m in tumor_list)

## III. MarkerDB

In [None]:
# Load the tab-separated chemicals table (latin-1 encoded).
# An alternative input used previously: 'all_diagnostic_chemicals.tsv'
diag_df = pd.read_csv('all_chemicals.tsv', sep='\t', encoding='latin-1')

# Keep the rows whose 'conditions' text mentions "cancer" (case-insensitive);
# rows with a missing 'conditions' value are treated as non-matches.
mentions_cancer = diag_df['conditions'].str.contains('cancer', case=False, na=False)
cancer_df = diag_df[mentions_cancer]

In [None]:
# Count how many rows of cancer_df carry each distinct 'conditions' value
condition_counts = cancer_df['conditions'].value_counts()
condition_counts

In [None]:
# Show the rows whose 'conditions' value is exactly 'Prostate Cancer'
is_prostate = cancer_df['conditions'].eq('Prostate Cancer')
cancer_df[is_prostate]

## IV. Exploratory plots of zero and log-abundance values

In [None]:
# NOTE(review): `np_normal_log` and `plt` are not defined in the visible cells
# of this notebook -- confirm they are created/imported before running this.

# Fraction of zero entries along axis 0 of np_normal_log (despite its name,
# `nz_percentage` holds the share of zeros, as a fraction between 0 and 1).
n_rows = np_normal_log.shape[0]
nonzero_counts = np.count_nonzero(np_normal_log, axis=0)
nz_percentage = 1 - (nonzero_counts / n_rows)

# Report the smallest and largest zero fraction
print(nz_percentage.min(), nz_percentage.max())

# Histogram of the zero fractions
plt.hist(nz_percentage, bins=100)
plt.title('Frequency of Zero Abundance Values')
plt.xlabel('Fraction of Zero values')
plt.show()

In [None]:
# NOTE(review): `np_normal_log` and `plt` are not defined in the visible cells
# of this notebook -- confirm they are created/imported before running this.
# Row subset explored earlier, kept for reference:
#   idxs = np.r_[257:654, 843:1153]
#   np_normal_log[idxs].T.shape

# Heatmap of the transposed matrix: x axis = samples, y axis = metabolites
fig, ax = plt.subplots(figsize=(4, 12))
heatmap = ax.imshow(np_normal_log.T, aspect='auto')

# Colour scale for the plotted values
cbar = fig.colorbar(heatmap, ax=ax)
cbar.set_label("Log10 Abundance")

ax.set(xlabel='Samples', ylabel='Metabolites')
plt.show()