# Project setup

In [None]:
import os
import sys
import yaml
import tools.objects as objs
import tools.helpers as hlp

In [None]:
# Optional: list all persistent configurations that are available to load.
# NOTE(review): this note originally pointed at a 'config_path' argument "below", but the
# objs.Project(...) call that follows does not pass one — confirm where a listed
# configuration is meant to be supplied.
hlp.list_persistent_configs()

In [None]:
# Initialize the project. To resume a previous project, replace the None values
# with that project's data processing and analysis hashes.
project = objs.Project(
    data_processing_hash=None,
    analysis_hash=None,
    overwrite=False,
)

# Dataset initiation

In [None]:
# Create dataset objects and gather raw metadata and data (already pre-obtained from JGI sources).
# Both datasets are attached to the same project object.
# NOTE(review): TX / MX presumably stand for transcriptomics / metabolomics — confirm in tools.objects.
# NOTE(review): last=True is passed only to the final dataset created here; its exact effect is
# defined in tools.objects — confirm before adding or reordering datasets.
tx_dataset = objs.TX(project)
mx_dataset = objs.MX(project, last=True)

# Analysis initiation

In [None]:
# Build the analysis object: a collection of datasets plus the methods used to
# integrate them. The datasets are passed in the order TX, then MX.
analysis = objs.Analysis(
    project,
    datasets=[tx_dataset, mx_dataset],
)

# Save configuration

In [None]:
# Save the persistent configuration and notebook files for this project
project.save_persistent_config_and_notebook()

# Data processing

In [None]:
# Link the analysis datasets by finding corresponding sample metadata fields.
# Run this before link_data(), which is described as relying on the linked metadata.
analysis.link_metadata()

In [None]:
# Link the analysis datasets' data matrices using the linked metadata
analysis.link_data()

In [None]:
# Filter out rare features from the analysis datasets, based on a minimum observed value
# or on the proportion of missing values across samples.
# NOTE(review): no arguments are passed here, so the thresholds presumably come from the
# project configuration or from method defaults — confirm.
analysis.filter_all_datasets()

In [None]:
# Filter out low-variance features from the analysis datasets, i.e. features whose values
# vary little across samples and are therefore treated as not impacted by experimentation
analysis.devariance_all_datasets()

In [None]:
# Scale the features of every analysis dataset to a shared, normalized distribution
analysis.scale_all_datasets()

In [None]:
# Filter out features from the analysis datasets that show low within-replicate reproducibility
analysis.replicability_test_all_datasets()

In [None]:
# Plot the dataset distributions to check them after all normalization steps above
analysis.plot_dataset_distributions()

In [None]:
# Plot PCA for all datasets to check the dimension reduction after the data normalization steps
analysis.plot_pca_all_datasets()

# Integration analysis

In [None]:
# Integrate the datasets' metadata tables by their overlapping samples
analysis.integrate_metadata()

In [None]:
# Integrate the datasets' data matrices by their overlapping samples
analysis.integrate_data()

In [None]:
# Annotate the integrated features using pre-generated feature annotation tables
analysis.annotate_integrated_features()

In [None]:
# Subset the features using statistical tests.
# NOTE(review): no arguments are passed, so the test and its settings are presumably taken
# from the project configuration or from method defaults — confirm.
analysis.perform_feature_selection()

In [None]:
# Calculate correlations between features.
# NOTE(review): this step was labelled "with custom parameters", but the call passes no
# arguments — any custom values presumably come from the project configuration; confirm.
analysis.calculate_correlated_features()

In [None]:
# Plot the correlation network
analysis.plot_correlation_network()

In [None]:
# Assess enrichment of an annotation layer in the extracted submodules.
# NOTE(review): the submodules are presumably those extracted from the correlation network — confirm.
analysis.perform_functional_enrichment()

In [None]:
# Sync all results tables to the database so that they can be queried
analysis.register_all_existing_data()

In [None]:
# Examples of natural language queries for data exploration.
# Each result is kept under its own name and previewed with .head().

# Network nodes in submodule 1 whose transcript annotation is not 'Unassigned'
submod_request_ex = analysis.query(
    "Show nodes in the network that are in submodule 1 and have any transcript annotation that is not 'Unassigned'"
)
display(submod_request_ex.head())

# Functions enriched in any submodule at a corrected p value below 0.1
func_enr_request_ex = analysis.query(
    "Find functions enriched in any submodule where corrected p value is less than 0.1"
)
display(func_enr_request_ex.head())

# Features whose metabolite classification mentions 'polyamines'
annot_request = analysis.query(
    "Find all features that are annotated to a metabolite np classifier superclass, class, or subclass that includes the word 'polyamines'"
)
display(annot_request.head())

In [None]:
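# Check abundance patterns of individual features.
# To use: uncomment the line below and replace the placeholders with a feature name and a metadata category.
#analysis.plot_individual_feature('<feature_name>', '<metadata_category>')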