# Example of Phylomix augmentation

### Import required packages

In [1]:
import sys
sys.path.insert(1, '../src/')


from data import PhylogenyTree, PhylogenyDataset
from ete4 import Tree
from mixup import Mixup
dataset = "alzbiom"
target = "ad"

data_fp = f'../data/{dataset}/data.tsv.xz'
meta_fp = f'../data/{dataset}/meta.tsv'
target_fp = f'../data/{dataset}/{target}.py'

phylogeny_tree_fp = '../data/WoL2/phylogeny.nwk'
tree = PhylogenyTree.init_from_nwk(phylogeny_tree_fp)
data = PhylogenyDataset.init_from_files(data_fp, meta_fp, target_fp)
print("number of leaves in the phylogeny tree: ", len(list(tree.ete_tree.leaves())))

number of leaves in the phylogeny tree:  15953


### Pruning the Phylogeny tree to make sure number of leaves is the same as number of features

In [2]:
data = PhylogenyDataset.init_from_files(data_fp, meta_fp, target_fp)
tree.prune(data.features)
print("number of leaves in the phylogeny tree after pruning: ", len(list(tree.ete_tree.leaves())))

number of leaves in the phylogeny tree after pruning:  8350


### Fit a logistic regression on the alzbiom dataset

In [8]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.preprocessing import label_binarize
import warnings
from sklearn.exceptions import ConvergenceWarning
data = PhylogenyDataset.init_from_files(data_fp, meta_fp, target_fp)
# Assuming X and y are your features and labels respectively
# Ignore convergence warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)
data.clr_transform()
X, y = data.X, data.y

# Initialize the logistic regression model
model = LogisticRegression()

# Initialize the KFold class with 5 splits
kf = KFold(n_splits=5, shuffle=True, random_state=1)

# To store the AUROC and AUPRC for each fold
auroc_scores = []
auprc_scores = []

# Perform 5-fold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    

    model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    auroc = roc_auc_score(y_test, y_prob)
    auroc_scores.append(auroc)
    
    auprc = average_precision_score(y_test, y_prob)
    auprc_scores.append(auprc)

average_auroc = np.mean(auroc_scores)
average_auprc = np.mean(auprc_scores)

print(f'Average AUROC across 5 folds: {average_auroc}')
print(f'Average AUPRC across 5 folds: {average_auprc}')


Average AUROC across 5 folds: 0.6679730327218607
Average AUPRC across 5 folds: 0.6388010371968438


### Augment the dataset using Phylomix

In [13]:
data = PhylogenyDataset.init_from_files(data_fp, meta_fp, target_fp)
data.one_hot_encode()
mixup = Mixup(data, taxonomy_tree=None, phylogeny_tree=tree)
augmented_data = mixup.mixup(num_samples = 3 * len(data.X), method='phylomix', alpha=2.0, tree='phylogeny')
augmented_data.clr_transform()

In [14]:
from sklearn.linear_model import LinearRegression   
 # Initialize the KFold class
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=1)
X, y = augmented_data.X, augmented_data.y
# Initialize lists to store AUROC and AUPRC scores
auroc_scores = []
auprc_scores = []

# Perform K-fold cross-validation
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit a linear regression model to predict the first label (y[:, 0])
    model = LinearRegression()
    model.fit(X_train, y_train[:, 0])

    # Predict the first label
    y_pred_first =np.array(model.predict(X_test)).reshape(-1, 1)

    # Recover the second label as 1 - predicted first label
    y_pred_second = 1 - y_pred_first
    pred_logits = np.concatenate((y_pred_first, y_pred_second), axis=1)

    # Since we're dealing with binary labels, we only need to evaluate one label (y[:, 0])
    # Calculate AUROC and AUPRC for the first label
    auroc = roc_auc_score(np.argmax(y_test, axis=1), np.argmax(pred_logits, axis=1))
    auprc = average_precision_score(np.argmax(y_test, axis=1), np.argmax(pred_logits, axis=1))

    auroc_scores.append(auroc)
    auprc_scores.append(auprc)

# Calculate the average AUROC and AUPRC across all folds
average_auroc = np.mean(auroc_scores)
average_auprc = np.mean(auprc_scores)

print(f'Average AUROC across {n_splits} folds: {average_auroc}')
print(f'Average AUPRC across {n_splits} folds: {average_auprc}')


Average AUROC across 5 folds: 0.930143569373171
Average AUPRC across 5 folds: 0.8781954884367587
