In [1]:
import os
import warnings
from pathlib import Path

import pandas as pd
import patsy
import numpy as np
import seaborn as sns
import time

import matplotlib.pyplot as plt
import sklearn.manifold as skmf
import sklearn.decomposition as skdc
import sklearn.metrics as skmr

import condo

from combat import combat

In [2]:
# Locate the data directory relative to where the notebook runs instead of
# through a machine-specific absolute path, so the notebook works on any
# checkout. `__file__` is not defined in a notebook kernel, so the literal
# string '__file__' is resolved against the current working directory; its
# grandparent is therefore the parent of the working directory.
this_file = os.path.realpath('__file__')
data_path = os.path.join(Path(this_file).parent.parent, 'data')

# Phenotype table and expression matrix; the first CSV column becomes the index.
pheno = pd.read_csv(os.path.join(data_path, 'pheno25.csv'), index_col=0)
expr = pd.read_csv(os.path.join(data_path, 'expr25.csv'), index_col=0)
# Transpose of the expression matrix, which is the form handed to combat().
exprTranspose = expr.T

# Value passed as `mmd_size` to the MMD-based adapters.
mmd_size = 20

In [3]:
# Combat
# Design matrix built from the age and cancer columns of `pheno`; "age" is
# additionally passed to combat() as its fourth argument.
start_time = time.time()
mod = patsy.dmatrix("~ age + cancer", pheno, return_type="dataframe")
exprTranspose_combat = combat(exprTranspose, pheno.batch, mod, "age")
expr_combat = exprTranspose_combat.T
duration_combat = time.time() - start_time
print(f"Combat time: {duration_combat}")

# Gaussian OT
# fit() receives the batch-2 rows followed by the batch-5 rows; the fitted
# adapter is then applied to every row of `expr`.
start_time = time.time()
lder = condo.AdapterGaussianOT(transform_type="location-scale")
batch2_expr = expr[pheno.batch == 2].values
batch5_expr = expr[pheno.batch == 5].values
lder.fit(batch2_expr, batch5_expr)
expr_linear = lder.transform(expr.values)
duration_linear = time.time() - start_time
print(f"Gaussian OT time: {duration_linear}")
# Put the original values back for the batch-5 rows, so only the remaining
# rows keep their adapted values. This step is outside the timed region.
batch5_rows = np.where(pheno.batch == 5)[0]
expr_linear[batch5_rows, :] = expr.values[batch5_rows, :]

found 2 batches
found 1 numerical covariates...
	age
found 2 categorical variables:	cancer[T.Cancer], cancer[T.Normal]
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


Adjusting data
Combat time: 0.20929408073425293
Gaussian OT time: 0.020589113235473633


In [4]:
# MMD
# Fit the MMD adapter with the batch-2 rows followed by the batch-5 rows,
# then apply it to every row of `expr`.
start_time = time.time()
mmder = condo.AdapterMMD(
    transform_type="location-scale",
    n_epochs=100,
    learning_rate=1e-2,
    mmd_size=mmd_size,
    verbose=False,
)
batch2_expr = expr[pheno.batch == 2].values
batch5_expr = expr[pheno.batch == 5].values
mmder.fit(batch2_expr, batch5_expr)
expr_mmd = mmder.transform(expr.values)
duration_mmd = time.time() - start_time
print(f"MMD time: {duration_mmd}")
# Put the original values back for the batch-5 rows (outside the timed region).
batch5_rows = np.where(pheno.batch == 5)[0]
expr_mmd[batch5_rows, :] = expr.values[batch5_rows, :]

MMD time: 17.95270586013794


In [5]:
# Condo Gaussian KLD
# In addition to the two expression matrices, fit() is given the `result`
# column of each batch as a single-column array of strings.
start_time = time.time()
cder_clinear = condo.ConDoAdapterKLD(transform_type="location-scale", verbose=0)
batch2_expr = expr[pheno.batch == 2].values
batch5_expr = expr[pheno.batch == 5].values
batch2_result = pheno[pheno.batch == 2].result.values.reshape(-1, 1).astype(str)
batch5_result = pheno[pheno.batch == 5].result.values.reshape(-1, 1).astype(str)
cder_clinear.fit(batch2_expr, batch5_expr, batch2_result, batch5_result)
expr_clinear = cder_clinear.transform(expr.values)
duration_clinear = time.time() - start_time
print(f"ConDo Gaussian KLD time: {duration_clinear}")
# Put the original values back for the batch-5 rows (outside the timed region).
batch5_rows = np.where(pheno.batch == 5)[0]
expr_clinear[batch5_rows, :] = expr.values[batch5_rows, :]

ConDo Gaussian KLD time: 2.770442008972168


In [6]:
# ConDo MMD
# fit() is given the batch-2 and batch-5 expression rows plus the `result`
# column of each batch as a single-column array of strings.
start_time = time.time()
cder_mmd = condo.ConDoAdapterMMD(
    transform_type="location-scale",
    n_epochs=100,
    learning_rate=1e-2,
    mmd_size=mmd_size,
    verbose=False,
)
batch2_expr = expr[pheno.batch == 2].values
batch5_expr = expr[pheno.batch == 5].values
batch2_result = pheno[pheno.batch == 2].result.values.reshape(-1, 1).astype(str)
batch5_result = pheno[pheno.batch == 5].result.values.reshape(-1, 1).astype(str)
cder_mmd.fit(batch2_expr, batch5_expr, batch2_result, batch5_result)
expr_cmmd = cder_mmd.transform(expr.values)
duration_cmmd = time.time() - start_time
print(f"ConDo MMD: {duration_cmmd}")
# Put the original values back for the batch-5 rows (outside the timed region).
batch5_rows = np.where(pheno.batch == 5)[0]
expr_cmmd[batch5_rows, :] = expr.values[batch5_rows, :]

ConDo MMD: 17.710924863815308


In [11]:
# Each entry is (position, display name, expression matrix) for one method.
dinfos = [
    (0, "Original", expr),
    (1, "Combat", expr_combat),
    (2, "Gaussian OT", expr_linear),
    (3, "MMD", expr_mmd),
    (4, "ConDo Gaussian KLD", expr_clinear),
    (5, "ConDo MMD", expr_cmmd),
]

# Silhouette score (Euclidean distance) of every matrix, once with the
# `result` column as cluster labels and once with the `batch` column.
for dix, dname, dset in dinfos:
    sil_result = skmr.silhouette_score(dset, pheno.result, metric='euclidean')
    sil_batch = skmr.silhouette_score(dset, pheno.batch, metric='euclidean')
    # Not used in this cell; kept so the name stays defined for later cells.
    dtitle = f"{dname}\n{sil_batch:.2f} (batch), {sil_result:.2f} (result)"
    print(dname, sil_batch, sil_result)
print("\n")

# Calinski-Harabasz score of every matrix under the same two labelings.
for dix, dname, dset in dinfos:
    ch_result = skmr.calinski_harabasz_score(dset, pheno.result)
    ch_batch = skmr.calinski_harabasz_score(dset, pheno.batch)
    # Built from the Calinski-Harabasz values of this iteration; it previously
    # reused the stale silhouette values left over from the loop above.
    dtitle = f"{dname}\n{ch_batch:.2f} (batch), {ch_result:.2f} (result)"
    print(dname, ch_batch, ch_result)

Original 0.08843292720229044 0.27981748504543436
Combat -0.0177958844661008 0.3267963793654045
Gaussian OT -0.027900412797362262 0.30077944884881963
MMD -0.021541523591874844 0.3123660135882727
ConDo Gaussian KLD -0.019478039144033576 0.3120783679442728
ConDo MMD -0.01876002782916628 0.3111986239531965


Original 4.85355038775955 13.889936702601817
Combat 0.34951560406815835 17.39402961266026
Gaussian OT 1.787392267802455e-28 15.614208154753587
MMD 0.08831952187482882 16.402574782099332
ConDo Gaussian KLD 0.13319091386745463 16.376403524532925
ConDo MMD 0.12251326673970811 16.416689472565682
