In [1]:
from collections import defaultdict

import os
import warnings
from pathlib import Path

import pandas as pd
import patsy
import numpy as np
import seaborn as sns
import time

import matplotlib.pyplot as plt
import sklearn.manifold as skmf
import sklearn.decomposition as skdc
import sklearn.metrics as skmr

import condo

from combat import combat

In [2]:
# --- Data location -----------------------------------------------------------
# '__file__' is deliberately a string literal: notebooks do not define
# __file__, so realpath() resolves it against the current working directory.
# Two .parent hops therefore give the parent of the working directory, and the
# default data directory is "<parent of cwd>/data".
this_file = os.path.realpath('__file__')
default_data_path = os.path.join(Path(this_file).parent.parent, 'data')

# Set CONDO_DATA_PATH to point at a different data directory instead of
# editing a machine-specific absolute path into the notebook.
data_path = os.environ.get('CONDO_DATA_PATH', default_data_path)
if not os.path.isdir(data_path):
    raise FileNotFoundError(
        f"Data directory not found: {data_path!r}. "
        "Run the notebook from a directory whose parent contains 'data/', "
        "or set the CONDO_DATA_PATH environment variable."
    )

# Both tables use their first CSV column as the row index; the loop below
# selects rows from them with the same positional indices.
all_pheno = pd.read_csv(os.path.join(data_path, 'pheno25.csv'), index_col=0)
all_expr = pd.read_csv(os.path.join(data_path, 'expr25.csv'), index_col=0)

# --- Experiment settings -----------------------------------------------------
num_random = 10       # number of random repetitions (also used as the seeds)
mmd_size = 20         # forwarded as `mmd_size` to the MMD-based adapters
num_removecancer = 7  # batch-2 "Cancer" samples dropped in each repetition

# --- Score accumulators ------------------------------------------------------
# Each maps a method name to a list with one score per repetition.
# "*_results" scores are computed against pheno.result, "*_batches" against
# pheno.batch.
sil_results = defaultdict(list)
sil_batches = defaultdict(list)
ch_results = defaultdict(list)
ch_batches = defaultdict(list)
db_results = defaultdict(list)
db_batches = defaultdict(list)

In [3]:
# Positional row indices of the batch-2 "Cancer" samples. They depend only on
# all_pheno, so they are computed once instead of once per repetition.
cancer2_ixs = np.where((all_pheno.batch == 2) & (all_pheno.result == "Cancer"))[0]

# Start from empty accumulators so that re-running this cell does not append a
# second set of scores to the lists created in the configuration cell.
for scores in (sil_results, sil_batches, ch_results, ch_batches, db_results, db_batches):
    scores.clear()

for rix in range(num_random):
    print(f"rix:{rix} {num_removecancer}")

    # Drop `num_removecancer` randomly chosen batch-2 cancer samples; the
    # repetition index doubles as the seed, so each subset is reproducible.
    rng = np.random.RandomState(rix)
    victim_ixs = list(rng.choice(cancer2_ixs, size=num_removecancer, replace=False))
    victim_set = {int(ix) for ix in victim_ixs}  # O(1) membership tests
    chosen_ixs = [ix for ix in range(all_pheno.shape[0]) if ix not in victim_set]
    pheno = all_pheno.iloc[chosen_ixs, :]
    expr = all_expr.iloc[chosen_ixs, :]
    exprTranspose = expr.T

    # Every adapter below is fitted with the batch-2 rows as its first
    # argument and the batch-5 rows as its second, then applied to all rows.
    # The batch-5 rows are afterwards reset to their original values, so only
    # the remaining rows end up modified.
    is_source = pheno.batch == 2
    is_target = pheno.batch == 5
    target_rows = np.where(is_target)[0]
    target_values = expr.values[target_rows, :]

    # Combat
    start_time = time.time()
    mod = patsy.dmatrix("~ age + cancer", pheno, return_type="dataframe")
    exprTranspose_combat = combat(exprTranspose, pheno['batch'], mod, "age")
    expr_combat = exprTranspose_combat.T
    duration_combat = time.time() - start_time

    # Gaussian OT
    start_time = time.time()
    lder = condo.AdapterGaussianOT(
        transform_type="location-scale",
    )
    lder.fit(
        expr[is_source].values,
        expr[is_target].values,
    )
    expr_linear = lder.transform(expr.values)
    duration_linear = time.time() - start_time
    expr_linear[target_rows, :] = target_values

    # MMD
    start_time = time.time()
    mmder = condo.AdapterMMD(
        transform_type="location-scale",
        n_epochs=100,
        learning_rate=1e-2,
        mmd_size=mmd_size,
        verbose=False,
    )
    mmder.fit(
        expr[is_source].values,
        expr[is_target].values,
    )
    expr_mmd = mmder.transform(expr.values)
    duration_mmd = time.time() - start_time
    expr_mmd[target_rows, :] = target_values

    # Condo Linear ReverseKL: additionally receives pheno.result for each
    # batch as a single-column array of strings.
    start_time = time.time()
    cder_clinear = condo.ConDoAdapterKLD(
        transform_type="location-scale",
        verbose=0,
    )
    cder_clinear.fit(
        expr[is_source].values,
        expr[is_target].values,
        pheno[is_source].result.values.reshape(-1, 1).astype(str),
        pheno[is_target].result.values.reshape(-1, 1).astype(str),
    )
    expr_clinear = cder_clinear.transform(expr.values)
    duration_clinear = time.time() - start_time
    expr_clinear[target_rows, :] = target_values

    # ConDo MMD: same inputs as the ReverseKL variant above.
    start_time = time.time()
    cder_mmd = condo.ConDoAdapterMMD(
        transform_type="location-scale",
        n_epochs=100,
        learning_rate=1e-2,
        mmd_size=mmd_size,
        verbose=False,
    )
    cder_mmd.fit(
        expr[is_source].values,
        expr[is_target].values,
        pheno[is_source].result.values.reshape(-1, 1).astype(str),
        pheno[is_target].result.values.reshape(-1, 1).astype(str),
    )
    expr_cmmd = cder_mmd.transform(expr.values)
    duration_cmmd = time.time() - start_time
    expr_cmmd[target_rows, :] = target_values

    # (index, method name, samples-by-features data) for every variant scored.
    # NOTE(review): the first field is not used in this cell and the value 1
    # appears twice (Combat and Gaussian OT). Left unchanged in case a later
    # cell reads it -- confirm and renumber.
    dinfos = [
        (0, "Original", expr),
        (1, "Combat", expr_combat),
        (1, "Gaussian OT", expr_linear),
        (2, "MMD", expr_mmd),
        (3, "ConDo Gaussian KLD", expr_clinear),
        (4, "ConDo MMD", expr_cmmd),
    ]

    # Score each variant twice per metric: once with pheno.result as the
    # labels and once with pheno.batch as the labels.
    for _, dname, dset in dinfos:
        sil_results[dname].append(
            skmr.silhouette_score(dset, pheno.result, metric='euclidean'))
        sil_batches[dname].append(
            skmr.silhouette_score(dset, pheno.batch, metric='euclidean'))
        ch_results[dname].append(skmr.calinski_harabasz_score(dset, pheno.result))
        ch_batches[dname].append(skmr.calinski_harabasz_score(dset, pheno.batch))
        db_results[dname].append(skmr.davies_bouldin_score(dset, pheno.result))
        db_batches[dname].append(skmr.davies_bouldin_score(dset, pheno.batch))

rix:0 7
Adjusting data


found 2 batches
found 1 numerical covariates...
	age
found 2 categorical variables:	cancer[T.Cancer], cancer[T.Normal]
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


rix:1 7
Adjusting data


found 2 batches
found 1 numerical covariates...
	age
found 2 categorical variables:	cancer[T.Cancer], cancer[T.Normal]
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


rix:2 7
Adjusting data


found 2 batches
found 1 numerical covariates...
	age
found 2 categorical variables:	cancer[T.Cancer], cancer[T.Normal]
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


rix:3 7
Adjusting data


found 2 batches
found 1 numerical covariates...
	age
found 2 categorical variables:	cancer[T.Cancer], cancer[T.Normal]
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


rix:4 7
Adjusting data


found 2 batches
found 1 numerical covariates...
	age
found 2 categorical variables:	cancer[T.Cancer], cancer[T.Normal]
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


rix:5 7
Adjusting data


found 2 batches
found 1 numerical covariates...
	age
found 2 categorical variables:	cancer[T.Cancer], cancer[T.Normal]
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


rix:6 7
Adjusting data


found 2 batches
found 1 numerical covariates...
	age
found 2 categorical variables:	cancer[T.Cancer], cancer[T.Normal]
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


rix:7 7
Adjusting data


found 2 batches
found 1 numerical covariates...
	age
found 2 categorical variables:	cancer[T.Cancer], cancer[T.Normal]
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


rix:8 7
Adjusting data


found 2 batches
found 1 numerical covariates...
	age
found 2 categorical variables:	cancer[T.Cancer], cancer[T.Normal]
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


rix:9 7
Adjusting data


found 2 batches
found 1 numerical covariates...
	age
found 2 categorical variables:	cancer[T.Cancer], cancer[T.Normal]
Standardizing Data across genes.
Fitting L/S model and finding priors
Finding parametric adjustments


In [4]:
# Calinski-Harabasz summary: for each method, print the mean and standard
# deviation of the scores collected over the random repetitions, first for
# the batch labels and then for the result labels.
for heading, scores_by_method in (("batches", ch_batches), ("results", ch_results)):
    print(heading)
    for method, scores in scores_by_method.items():
        print(method, np.mean(scores), np.std(scores))

batches
Original 4.765380130693893 0.3855893756573822
Combat 0.7650249597113311 0.08020651095779471
Gaussian OT 1.3075438883305155e-28 9.483918223207937e-31
MMD 0.14170251750011237 0.02840759941655643
ConDo Gaussian KLD 0.40372109198038036 0.013264431466850225
ConDo MMD 0.3628903001377174 0.053638252876084265
results
Original 14.066077188366407 0.5033701478082329
Combat 16.552256306492048 0.5611561602631688
Gaussian OT 13.832653571690628 0.35669772434848757
MMD 14.085305475963503 0.578474881871385
ConDo Gaussian KLD 15.62164678895152 0.4774149878751159
ConDo MMD 15.525582134058165 0.5167555321052653


In [5]:
# Silhouette summary: for each method, print the mean and standard deviation
# of the scores collected over the random repetitions, first for the batch
# labels and then for the result labels.
for heading, scores_by_method in (("batches", sil_batches), ("results", sil_results)):
    print(heading)
    for method, scores in scores_by_method.items():
        print(method, np.mean(scores), np.std(scores))

batches
Original 0.09817462504532037 0.009996373365398032
Combat -0.00915689602412837 0.0034784384893987805
Gaussian OT -0.03285691170350864 0.0005174882734027204
MMD -0.037215044585352915 0.0009206412553440588
ConDo Gaussian KLD -0.027573444689279192 0.0015333145211920865
ConDo MMD -0.028128536364997063 0.001889300537962599
results
Original 0.2984110112033573 0.008767660649062647
Combat 0.3342038234636949 0.0076011308366087165
Gaussian OT 0.2934164448292405 0.005757407923421007
MMD 0.2965584848147838 0.00968056296525825
ConDo Gaussian KLD 0.32116720796038767 0.007203069152339294
ConDo MMD 0.31955900545904786 0.007960047943809466
