In [17]:
import os
import numpy as np 
import glob
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from utils import project, calculate_Q_mae

# Suppress every warning for the rest of the session: the 'ignore' action is
# installed with no category restriction, so it matches all Warning subclasses.
# NOTE(review): this also hides deprecation/runtime warnings raised by the
# numerical code below; narrow the filter if those become relevant.
from warnings import simplefilter
simplefilter('ignore')

%matplotlib inline

# Load and preprocess data

In [5]:
# Raw tables keyed by '<dataset_name>_<tax>' (e.g. 'AGP_o').
data_orig = {}

for dataset_name in tqdm(['AGP', 'HMP']):
    for tax in ['o', 'f', 'g']:
        label = f'{dataset_name}_{tax}'
        dataframe = pd.read_csv(f'./data/{dataset_name}/pivot_{tax}_normalized.csv', sep=';')
        # 'Unnamed: 0' is the name pandas assigns to a header-less first column
        # (presumably the index written when the CSV was saved — TODO confirm).
        # It is removed so it is not treated as a feature downstream.
        data_orig[label] = dataframe.drop(columns='Unnamed: 0')

# Output locations read by the preprocessing and PCA cells.
processed_root = 'data_processed'
pca_root = './results/pca/'

# Create the directories from the variables rather than from repeated string
# literals, so that editing a root above can never leave its directory missing.
os.makedirs(processed_root, exist_ok=True)
os.makedirs(pca_root, exist_ok=True)

In [18]:
# Uncomment the block below to replace `data_orig` (and the output folders) with the synthetic datasets

# data_orig = {}
# for path in glob.glob('data/synthetic/*'):
#     dataframe = pd.read_csv(path, index_col=0)
#     label = path.split('/')[-1]
#     data_orig[label] = dataframe.drop('target', axis=1)

# processed_root = 'data_processed/synthetic'
# pca_root = './results/pca/synthetic'
# os.makedirs(processed_root, exist_ok=True)
# os.makedirs(pca_root, exist_ok=True)

# Preprocessing

In [19]:
RARITY_THRESHOLD = 0.01
STD_THRESHOLD = 1e-3

# Filtered tables, keyed by the same labels as data_orig.
preprocessed_data = {}

for label, df in data_orig.items():

    # drop_duplicates() returns a new frame, so the table held in data_orig
    # is left untouched.
    unique_rows = df.drop_duplicates()
    n_rows = len(unique_rows)

    # Share of rows in which each column is strictly positive.
    prevalence = (unique_rows > 0).sum(axis=0) / n_rows
    is_rare = prevalence < RARITY_THRESHOLD
    # Columns whose standard deviation falls below the threshold.
    is_flat = unique_rows.std(axis=0) < STD_THRESHOLD

    # A column is removed only when it is BOTH rare and low-variance
    # (logical AND of the two masks).
    # NOTE(review): if either criterion alone was meant to drop a column,
    # this should be `|` instead — confirm the intended filter.
    drop_column = (is_rare & is_flat).values
    df_proc = unique_rows.iloc[:, ~drop_column]
    preprocessed_data[label] = df_proc

    # Persist the filtered table next to the other processed datasets.
    path = os.path.join(processed_root, f'{label}.csv')
    df_proc.to_csv(path)

    print(f'{label} Orig shape: {df.shape}, processed shape: {df_proc.shape}')

clust2_dim53 Orig shape: (3000, 53), processed shape: (3000, 53)
clust3_dim96 Orig shape: (3000, 96), processed shape: (3000, 96)
clust2_dim96 Orig shape: (3000, 96), processed shape: (3000, 96)
clust3_dim53 Orig shape: (3000, 53), processed shape: (3000, 53)
clust4_dim180 Orig shape: (3000, 180), processed shape: (3000, 180)
clust2_dim180 Orig shape: (3000, 180), processed shape: (3000, 180)
clust4_dim53 Orig shape: (3000, 53), processed shape: (3000, 53)
clust4_dim96 Orig shape: (3000, 96), processed shape: (3000, 96)
clust3_dim180 Orig shape: (3000, 180), processed shape: (3000, 180)


# PCA

In [25]:
# When True, every projected dataset is written to pca_root.
SAVE = True

# One shared figure: a cumulative explained-variance curve per dataset.
fig, ax = plt.subplots()
ax.set_title("Cumulative explained variance", fontsize=12)
ax.set_xlabel("Number of PCs", fontsize=12)
# Dashed horizontal reference line at y = 0.99, spanning x from 0 to 300.
ax.hlines(0.99, 0, 300, linestyle='--')
ax.grid(linestyle="dotted")

for label, df in preprocessed_data.items():

    # `project` comes from utils; its four return values are unpacked here.
    # NOTE(review): `pca` is assumed to be a fitted object exposing
    # `explained_variance_ratio_`; `pca_proj` is not used in this cell.
    data_projected, pca, pca_proj, mae = project(df)

    n_dims_orig = df.shape[1]
    n_dims_pca = data_projected.shape[1]

    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
    ax.plot(cumulative_variance, linewidth=1, label=label, alpha=0.6)

    # Third return value of calculate_Q_mae is intentionally discarded.
    Q_loc, Q_glob, _ = calculate_Q_mae(df.values, data_projected)

    print(f'For {label}, dim orig: {n_dims_orig}, dim PCA: {n_dims_pca}, mae: {np.round(mae,3)}, Q_loc: {np.round(Q_loc,3)}, Q_glob: {np.round(Q_glob,3)}')

    if SAVE:
        # Stored as ';'-delimited text under the dataset label (no extension).
        path = os.path.join(pca_root, f'{label}')
        np.savetxt(path, data_projected, delimiter=';')

# Log-scaled x axis for the number of components.
ax.set_xscale('log')
ax.legend(fontsize=12)
plt.show()