In [3]:
import subprocess
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ParameterGrid
import os

# Define the paths to the table, metadata, and output directories.
#
# Every artefact lives in one benchmark directory. It defaults to the original
# location but can be redirected by setting the DAA_BENCHMARK_DIR environment
# variable, so the notebook can run on another machine without editing paths.
#
# Leftover notes kept from an earlier (R-syntax) version of this cell; these
# files are not read here -- presumably upstream inputs, TODO confirm:
#   original data : ~/from_pendrive/PhD/DAA_benchmark_study/qiime_ancombc/data/PD_10483/otu_pd_qiime.tsv
#   simulated data: <benchmark dir>/otu_table1.txt
#   biom file     : <benchmark dir>/otu_table1.biom
BENCHMARK_DIR = os.environ.get(
    'DAA_BENCHMARK_DIR',
    '/home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark',
)

output_qza = os.path.join(BENCHMARK_DIR, 'otu_table1.qza')     # feature table passed to ANCOM-BC
metadata = os.path.join(BENCHMARK_DIR, 'meta_table1.tsv')      # sample metadata file
ancomoutput = os.path.join(BENCHMARK_DIR, 'ancombc.qza')       # ANCOM-BC result (overwritten every run)
exported = os.path.join(BENCHMARK_DIR, 'exported_ancompd')     # directory `qiime tools export` writes into
truedaa = os.path.join(BENCHMARK_DIR, 'truedaa.tsv')           # ground-truth table

# Ground truth indexed by its first column; the tuning loop reads its 'truth' column.
tr = pd.read_csv(truedaa, sep="\t", index_col=0)
# Define a function to run QIIME 2's ANCOM-BC with system calls
def run_ancombc(table_path, metadata_path, formula, max_iter, tol, alpha, lib_cut, output_path):
    """Invoke ``qiime composition ancombc`` as a child process.

    Args:
        table_path: Value passed to ``--i-table``.
        metadata_path: Value passed to ``--m-metadata-file``.
        formula: Value passed to ``--p-formula``.
        max_iter: Value passed (as text) to ``--p-max-iter``.
        tol: Value passed (as text) to ``--p-tol``.
        alpha: Value passed (as text) to ``--p-alpha``.
        lib_cut: Value passed (as text) to ``--p-lib-cut``.
        output_path: Value passed to ``--o-differentials``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    # (flag, value) pairs in the order they are placed on the command line.
    # The p-adj-method option is intentionally not passed.
    options = (
        ("--i-table", table_path),
        ("--m-metadata-file", metadata_path),
        ("--p-formula", formula),
        ("--p-max-iter", str(max_iter)),
        ("--p-tol", str(tol)),
        ("--p-alpha", str(alpha)),
        ("--p-lib-cut", str(lib_cut)),
        ("--o-differentials", output_path),
    )

    argv = ["qiime", "composition", "ancombc"]
    for flag, value in options:
        argv += [flag, value]

    subprocess.run(argv, check=True)

# Define the parameter grid for tuning.
#
# Values are plain Python ints/floats built without float-step accumulation:
# np.arange with a 0.01 step produces entries such as 0.060000000000000005,
# which would be stringified verbatim onto the QIIME command line and into the
# results table.
param_grid = {
    # 'p_adj_method': ['holm', 'hochberg', 'hommel', 'bonferroni', 'BH', 'BY', 'fdr'],
    'tol': [1e-03, 1e-04, 1e-05, 1e-06],
    'max_iter': [50, 75, 100, 125, 150, 175, 200],
    # 'zero_cut': 0.1 .. 1.0 in steps of 0.1 (disabled)
    'alpha': [step / 100 for step in range(1, 10)],   # 0.01, 0.02, ..., 0.09
    'lib_cut': list(range(0, 100, 10)),               # 0, 10, ..., 90
}

# Convert parameter grid to list of dictionaries (one dict per combination)
param_list = list(ParameterGrid(param_grid))

# Placeholder for results: (params, performance_score) tuples appended by the tuning loop
results = []
# Number of grid points attempted so far; the tuning loop uses it to stop early
k = 0
# Run tuning
#
# MAX_RUNS caps how many grid points are evaluated (None = the whole grid).
# NOTE(review): ParameterGrid is expected to iterate its sorted keys with the
# last one varying fastest, so with a cap of 10 only 'max_iter' and 'tol' vary
# while 'alpha' and 'lib_cut' stay at their first value -- confirm this is
# intended before interpreting the results.
MAX_RUNS = 10
Q_VALUE_COLUMN = 'covariateunhealthy'   # column of q_val_slice.csv that is scored
# NOTE(review): the call threshold is fixed, so the tuned 'alpha' only matters
# if it changes the exported q-values -- confirm.
Q_VALUE_THRESHOLD = 0.05
# One dict per scored run: the hyperparameters plus every metric computed below.
metrics_log = []


def _ratio_or_none(numerator, denominator):
    """Return numerator / denominator, or None when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else None


def _score_ancombc_q_values(q_val_csv, truth_df):
    """Score the q-values in ``q_val_csv`` against ``truth_df['truth']``.

    A feature is predicted DA when its q-value is below Q_VALUE_THRESHOLD.
    Features present in only one of the two tables are kept (outer join):
    a missing q-value counts as "not DA" and a missing truth label counts as
    "not DA", matching how NaN was treated before.

    Returns:
        dict with TP/FP/FN/TN, precision, recall, fpr, f1_score and
        performance_score (accuracy), or None when there is nothing to score.
        Ratio metrics are None when their denominator is zero.
    """
    q_values = pd.read_csv(q_val_csv, index_col=0)[Q_VALUE_COLUMN].rename('q_value')
    scored = pd.concat([q_values, truth_df], axis=1)
    if scored.empty:
        return None

    # NaN q-values compare False, NaN truth labels compare False.
    predicted_da = scored['q_value'] < Q_VALUE_THRESHOLD
    truly_da = scored['truth'] == 'DA'

    tp = int((truly_da & predicted_da).sum())
    fp = int((~truly_da & predicted_da).sum())
    fn = int((truly_da & ~predicted_da).sum())
    tn = int((~truly_da & ~predicted_da).sum())

    precision = _ratio_or_none(tp, tp + fp)
    recall = _ratio_or_none(tp, tp + fn)
    # Guard against None operands: the unguarded formula raised TypeError
    # whenever no feature was predicted DA or no feature was truly DA.
    if precision is None or recall is None or precision + recall == 0:
        f1_score = None
    else:
        f1_score = 2 * (recall * precision) / (precision + recall)

    return {
        'TP': tp,
        'FP': fp,
        'FN': fn,
        'TN': tn,
        'precision': precision,
        'recall': recall,
        'fpr': _ratio_or_none(fp, fp + tn),
        'f1_score': f1_score,
        'performance_score': (tp + tn) / (tp + tn + fp + fn),
    }


for params in param_list:
    try:
        run_ancombc(
            table_path=output_qza,
            metadata_path=metadata,
            formula='covariate',
            max_iter=params['max_iter'],
            tol=params['tol'],
            alpha=params['alpha'],
            lib_cut=params['lib_cut'],
            output_path=ancomoutput
        )

        # Export the results from QIIME2
        subprocess.run([
            'qiime', 'tools', 'export',
            '--input-path', ancomoutput,
            '--output-path', exported
        ], check=True)

        # Construct file path for q_val_slice.csv
        q_val_csv = os.path.join(exported, 'q_val_slice.csv')

        # Check if the file exists before reading
        if not os.path.exists(q_val_csv):
            print(f"Expected q_val_slice.csv does not exist at path: {q_val_csv}")
        else:
            metrics = _score_ancombc_q_values(q_val_csv, tr)
            if metrics is None:
                print(f"No features to score for parameters: {params}")
            else:
                # Downstream analysis expects (params, performance_score) tuples.
                results.append((params, metrics['performance_score']))
                metrics_log.append({**params, **metrics})

        # Count every run whose QIIME commands succeeded, scored or not.
        k = k + 1
        if MAX_RUNS is not None and k >= MAX_RUNS:
            break
    except subprocess.CalledProcessError as error:
        # A failed QIIME call skips this grid point without counting it.
        print(f"Error running ANCOM-BC: {error}")

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=['params', 'performance_score'])

# Extract the hyperparameters and scores.
# Building the frame from the list of dicts keeps one dtype per column, so
# integer hyperparameters stay integers instead of being upcast to float.
X = pd.DataFrame(results_df['params'].tolist(), index=results_df.index)
y = results_df['performance_score']

if results_df.empty:
    # Fitting on zero rows raises; report the situation instead.
    print("No successful runs were recorded; skipping the hyperparameter analysis.")
else:
    # Fit the Decision Tree Regressor (seeded so the importances are repeatable)
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X, y)

    # Now you can analyze the regressor to find out which hyperparameters are important
    print(regressor.feature_importances_)

    # Pick the best evaluated combination directly from the observed scores;
    # predicting on the training rows only reproduces those same scores.
    best_params_index = int(np.argmax(y.to_numpy()))
    best_params = X.iloc[best_params_index]
    # Report native Python values with their original types (e.g. max_iter as int).
    best_params_native = {
        name: getattr(X[name].iloc[best_params_index], 'item', lambda value=X[name].iloc[best_params_index]: value)()
        for name in X.columns
    }
    print(f"Best hyperparameters: {best_params_native}")


Saved FeatureData[DifferentialAbundance] to: /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/ancombc.qza
Exported /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/ancombc.qza as DataLoafPackageDirFmt to directory /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/exported_ancompd
Saved FeatureData[DifferentialAbundance] to: /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/ancombc.qza
Exported /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/ancombc.qza as DataLoafPackageDirFmt to directory /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/exported_ancompd
Saved FeatureData[DifferentialAbundance] to: /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/ancombc.qza
Exported /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/ancombc.qza as DataLoafPackageDirFmt to directory /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/exported_ancomp

In [17]:
#Best hyperparameters: {'alpha': 0.01, 'lib_cut': 0.0, 'max_iter': 50.0, 'tol': 0.001}
#Best hyperparameters: {'alpha': 0.01, 'lib_cut': 0.0, 'max_iter': 50.0, 'tol': 0.001}

In [8]:
import subprocess
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import ParameterGrid
import os

# Define the paths to the table, metadata, and output directories.
#
# Every artefact lives in one benchmark directory. It defaults to the original
# location but can be redirected by setting the DAA_BENCHMARK_DIR environment
# variable, so the notebook can run on another machine without editing paths.
#
# Leftover notes kept from an earlier (R-syntax) version of this cell; these
# files are not read here -- presumably upstream inputs, TODO confirm:
#   original data : ~/from_pendrive/PhD/DAA_benchmark_study/qiime_ancombc/data/PD_10483/otu_pd_qiime.tsv
#   simulated data: <benchmark dir>/otu_table1.txt
#   biom file     : <benchmark dir>/otu_table1.biom
BENCHMARK_DIR = os.environ.get(
    'DAA_BENCHMARK_DIR',
    '/home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark',
)

output_qza = os.path.join(BENCHMARK_DIR, 'otu_table1.qza')     # feature table passed to FlashWeave
metadata = os.path.join(BENCHMARK_DIR, 'meta_table1.tsv')      # sample metadata file
output_flash = os.path.join(BENCHMARK_DIR, 'flashnet.qza')     # network artifact (overwritten every run)
exported = os.path.join(BENCHMARK_DIR, 'exported_flash')       # directory `qiime tools export` writes into
neighbour = os.path.join(BENCHMARK_DIR, 'neighbours1.qza')     # list-neighbours artifact (overwritten every run)
truedaa = os.path.join(BENCHMARK_DIR, 'truedaa.tsv')           # ground-truth table
simulated = os.path.join(BENCHMARK_DIR, 'otu_table1.tsv')      # simulated table, read below

# Ground truth indexed by its first column; the tuning loop reads its 'truth' column.
tr = pd.read_csv(truedaa, sep="\t", index_col=0)
# Simulated table indexed by its first column; the tuning loop uses only its index.
otu_tab_sim = pd.read_csv(simulated, sep="\t", index_col=0)
# Define a function to run FlashWeave (QIIME 2 makarsa plugin) with system calls
def run_flashweave(table_path, metadata_path, max_k, alpha, output_path):
    """Invoke ``qiime makarsa flashweave`` as a child process.

    Args:
        table_path: Value passed to ``--i-table``.
        metadata_path: Value passed to ``--m-metadata-file``.
        max_k: Value passed (as text) to ``--p-max-k``.
        alpha: Value passed (as text) to ``--p-alpha``.
        output_path: Value passed to ``--o-network``.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status (``check=True``).
    """
    # (flag, value) pairs in the order they are placed on the command line.
    options = (
        ("--i-table", table_path),
        ("--m-metadata-file", metadata_path),
        ("--p-max-k", str(max_k)),
        ("--p-alpha", str(alpha)),
        ("--o-network", output_path),
    )

    argv = ["qiime", "makarsa", "flashweave"]
    for flag, value in options:
        argv += [flag, value]

    subprocess.run(argv, check=True)

# Hyperparameter search space: every max_k is paired with every alpha.
param_grid = dict(
    max_k=list(range(5)),                            # 0, 1, 2, 3, 4
    alpha=[step / 100 for step in range(1, 6)],      # 0.01, 0.02, 0.03, 0.04, 0.05
)

# Expand the grid into one dict per parameter combination.
param_list = [combination for combination in ParameterGrid(param_grid)]

# Filled by the tuning loop with (params, performance_score) tuples.
results = list()
# Counts the grid points attempted so far; the tuning loop stops on it.
k = 0
# Run tuning
#
# MAX_RUNS caps how many grid points are evaluated (None = the whole grid).
MAX_RUNS = 10
# Rows whose id starts with this prefix are treated as metadata directives,
# not features. NOTE(review): assumes the exported metadata.tsv carries a
# '#q2:types' row under the header -- confirm against an actual export.
QIIME_DIRECTIVE_PREFIX = '#q2:'


def _score_neighbours_export(metadata_tsv, feature_index, truth_df):
    """Score the exported neighbour list against ``truth_df['truth']``.

    Every id in ``feature_index`` that appears in the neighbour table is
    predicted 'DA'; all others are 'NOT_DA'. Features present only in
    ``truth_df`` are kept (outer join) and count as not predicted; features
    without a truth label count as not truly DA.

    Returns:
        dict with TP/FP/FN/TN, precision, recall, fpr and performance_score
        (accuracy), or None when there is nothing to score. Ratio metrics are
        None when their denominator is zero.
    """
    neighbours = pd.read_csv(metadata_tsv, sep='\t', index_col=0)

    # Drop directive rows by content rather than by position, so a file
    # without such a row does not lose its first real neighbour.
    is_directive = neighbours.index.astype(str).str.startswith(QIIME_DIRECTIVE_PREFIX)
    neighbours = neighbours[~is_directive]

    is_neighbour = feature_index.isin(neighbours.index)
    labels = pd.Series(
        np.where(is_neighbour, 'DA', 'NOT_DA'), index=feature_index, name='pred'
    )

    # Combine predictions and ground truth into one frame
    scored = pd.concat([labels, truth_df], axis=1)
    if scored.empty:
        return None

    # NaN on either side compares False, i.e. "not DA".
    predicted_da = scored['pred'] == 'DA'
    truly_da = scored['truth'] == 'DA'

    tp = int((truly_da & predicted_da).sum())
    fp = int((~truly_da & predicted_da).sum())
    fn = int((truly_da & ~predicted_da).sum())
    tn = int((~truly_da & ~predicted_da).sum())

    return {
        'TP': tp,
        'FP': fp,
        'FN': fn,
        'TN': tn,
        'precision': tp / (tp + fp) if tp + fp > 0 else None,
        'recall': tp / (tp + fn) if tp + fn > 0 else None,
        'fpr': fp / (fp + tn) if fp + tn > 0 else None,
        'performance_score': (tp + tn) / (tp + tn + fp + fn),
    }


for params in param_list:
    try:
        run_flashweave(
            table_path=output_qza,
            metadata_path=metadata,
            max_k=params['max_k'],
            alpha=params['alpha'],
            output_path=output_flash
        )

        # List the network neighbours of the 'covariate' node
        subprocess.run([
            "qiime", "makarsa", "list-neighbours",
            "--i-network", output_flash,
            "--p-feature-id", "covariate",
            "--o-neighbours", neighbour
        ], check=True)

        # Export the results from QIIME2
        subprocess.run([
            'qiime', 'tools', 'export',
            '--input-path', neighbour,
            '--output-path', exported
        ], check=True)

        # Construct file path for the exported neighbour table
        metadata_tsv = os.path.join(exported, 'metadata.tsv')

        # Check if the file exists before reading
        if not os.path.exists(metadata_tsv):
            print(f"Expected metadata_tsv does not exist at path: {metadata_tsv}")
        else:
            metrics = _score_neighbours_export(metadata_tsv, otu_tab_sim.index, tr)
            if metrics is None:
                print(f"No features to score for parameters: {params}")
            else:
                # Downstream analysis expects (params, performance_score) tuples.
                results.append((params, metrics['performance_score']))

        # Count every run whose QIIME commands succeeded, scored or not.
        k = k + 1
        if MAX_RUNS is not None and k >= MAX_RUNS:
            break
    except subprocess.CalledProcessError as error:
        # A failed QIIME call skips this grid point without counting it.
        print(f"Error running flashweave: {error}")

# Convert results to DataFrame
results_df = pd.DataFrame(results, columns=['params', 'performance_score'])

# Extract the hyperparameters and scores.
# Building the frame from the list of dicts keeps one dtype per column, so
# integer hyperparameters stay integers instead of being upcast to float.
X = pd.DataFrame(results_df['params'].tolist(), index=results_df.index)
y = results_df['performance_score']

if results_df.empty:
    # Fitting on zero rows raises; report the situation instead.
    print("No successful runs were recorded; skipping the hyperparameter analysis.")
else:
    # Fit the Decision Tree Regressor (seeded so the importances are repeatable)
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X, y)

    # Now you can analyze the regressor to find out which hyperparameters are important
    print(regressor.feature_importances_)

    # Pick the best evaluated combination directly from the observed scores;
    # predicting on the training rows only reproduces those same scores.
    best_params_index = int(np.argmax(y.to_numpy()))
    best_params = X.iloc[best_params_index]
    # Report native Python values with their original types (e.g. max_k as int).
    best_params_native = {
        name: getattr(X[name].iloc[best_params_index], 'item', lambda value=X[name].iloc[best_params_index]: value)()
        for name in X.columns
    }
    print(f"Best hyperparameters: {best_params_native}")

Saved Network to: /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/flashnet.qza
Saved ImmutableMetadata to: /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/neighbours1.qza
Exported /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/neighbours1.qza as ImmutableMetadataDirectoryFormat to directory /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/exported_flash
Saved Network to: /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/flashnet.qza
Saved ImmutableMetadata to: /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/neighbours1.qza
Exported /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/neighbours1.qza as ImmutableMetadataDirectoryFormat to directory /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/exported_flash
Saved Network to: /home/zakir/from_pendrive/PhD/DAA_benchmark_study/qiime2_benchmark/flashnet.qza
Saved ImmutableMetadata to: /home/zaki

In [13]:
#Best hyperparameters: {'alpha': 0.01, 'max_k': 0.0}