In [1]:
# Importing packages
import os
import yaml
import logging
import pandas as pd
from qiime2 import Artifact
from qiime2 import Visualization
from qiime2 import Metadata
import qiime2.plugins.dada2.actions as dada2_actions
import qiime2.plugins.metadata.actions as metadata_actions
from qiime2.plugins.feature_table.visualizers import tabulate_seqs
from qiime2.plugins.feature_table.visualizers import summarize
from qiime2.plugins.feature_table.visualizers import core_features

from qiime2.plugins.feature_table.methods import merge
from qiime2.plugins.feature_table.methods import merge_seqs
from qiime2.plugins.feature_table.methods import merge_taxa

from qiime2.plugins.feature_table.methods import filter_samples
from qiime2.plugins.feature_table.methods import filter_seqs

import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
# Parameters
# Name of this (merged) experiment; used by the path-definition cell to build
# the experiment folder under `<base_dir>/experiments/`.
experiment_name = "thayane-PM-joined"
# Project root; the `data/` and `experiments/` folders are resolved from it.
base_dir = "/mnt/nupeb/rede-micro/redemicro-thayane"
# NOTE(review): every setting from here down to `threads` is not referenced by
# any cell visible in this notebook. Presumably they are kept so the parameter
# set matches sibling notebooks of the pipeline — confirm before pruning.
manifest_file = f"{base_dir}/data/manifest.csv"
metadata_file = f"{base_dir}/data/metadata.tsv"
class_col = "sample-id"
classifier_file = "/mnt/nupeb/rede-micro/datasets/16S_classifiers_qiime2/silva-138-99-nb-classifier.qza"
replace_files = False
# Presumably quality / truncation / read-overlap settings for a denoising step
# that is not part of this notebook — TODO confirm.
phred = 20
trunc_f = 0
trunc_r = 0
overlap = 12
threads = 6

In [None]:
# Define paths
# Output locations for this (merged) experiment.
experiment_folder = os.path.abspath(os.path.join(base_dir, 'experiments', experiment_name))
data_folder = f'{base_dir}/data/'
img_folder = os.path.abspath(os.path.join(experiment_folder, 'imgs'))
qiime_folder = os.path.join(experiment_folder, 'qiime-artifacts')

# The visualizations produced at the end of the notebook are saved under
# `qiime_folder`, so make sure it exists before anything tries to write there.
os.makedirs(qiime_folder, exist_ok=True)

# Input locations: for each sequencing layout, the metadata file plus the DADA2
# feature table ('tab') and representative sequences ('seq') of its experiment.
input_path = {'single-end': {}, 'paired-end': {}}

for k in input_path.keys():
    input_path[k]['metadata'] = f"{base_dir}/data/{k}-metadata.tsv"
    # Keep the per-layout source folder in its own name. Reusing `qiime_folder`
    # here would leave it pointing at the paired-end experiment after the loop,
    # and the merged outputs would be written into that experiment instead of
    # into `experiment_folder`.
    source_qiime_folder = os.path.join(base_dir, 'experiments', f'thayane-PM-{k}', 'qiime-artifacts')
    input_path[k]['tab'] = os.path.join(source_qiime_folder, 'dada2-tabs.qza')
    input_path[k]['seq'] = os.path.join(source_qiime_folder, 'dada2-reps.qza')

print(input_path)

# # Original Single-End files
# se_metadata_path = "/home/lauro/nupeb/rede-micro/redemicro-thayane/data/single-end-metadata.tsv"
# input_path['single-end']['tab'] = os.path.join(qiime_folder, 'dada2-tabs.qza')
# input_path['single-end']['seq'] = os.path.join(qiime_folder, 'dada2-reps.qza')
#
# # Original Paired-End files
# pe_metadata_path = "/home/lauro/nupeb/rede-micro/redemicro-thayane/data/paired-end-metadata.tsv"
# pe_path = '/home/lauro/nupeb/rede-micro/redemicro-thayane/experiments/thayane-PM-single-end/'
# input_path['paired-end']['tab'] = os.path.join(pe_path, 'fezes_demux_paired_dada2_tabs.qza')
# input_path['paired-end']['seq'] = os.path.join(pe_path, 'fezes_demux_paired_dada2_reps.qza')

In [None]:
# Load artifacts
# Sample metadata of each sequencing layout. The prints show the full
# DataFrame shape, i.e. a (rows, columns) tuple rather than a single count.
se_metadata_artifact = Metadata.load(input_path['single-end']['metadata'])
pe_metadata_artifact = Metadata.load(input_path['paired-end']['metadata'])
print('number of samples in single-end metadata:', se_metadata_artifact.to_dataframe().shape)
print('number of samples in paired-end metadata:', pe_metadata_artifact.to_dataframe().shape)

# Original single-end and paired-end files: feature table ('tab') and
# representative sequences ('seq'), keyed the same way as `input_path`.
input_artifact = {
    layout: {
        kind: Artifact.load(input_path[layout][kind])
        for kind in ('tab', 'seq')
    }
    for layout in ('single-end', 'paired-end')
}

In [None]:
# Union of the row labels of both feature tables when viewed as DataFrames
# (presumably the sample IDs — confirm against the table orientation).
se_table_ids = input_artifact['single-end']['tab'].view(pd.DataFrame).index
pe_table_ids = input_artifact['paired-end']['tab'].view(pd.DataFrame).index
all_ids = set(se_table_ids).union(pe_table_ids)
print(f'lenght of all_ids: {len(all_ids)}')

## Filter exclusively Single-End samples

In [None]:
# Select metadata rows
se_to_keep = ['M01', 'M03', 'M06', 'M09', 'M12', 'M19']
se_metadata = se_metadata_artifact.filter_ids(ids_to_keep=se_to_keep)
se_metadata.to_dataframe().index

In [None]:
# Create filtered table
se_filtered_table = filter_samples(
    table = input_artifact['single-end']['tab'],
    metadata = se_metadata,
).filtered_table
se_filtered_table.view(pd.DataFrame).shape

In [None]:
# Filter sequences for selected samples
se_filtered_seqs = filter_seqs(
    data = input_artifact['single-end']['seq'],
    table = se_filtered_table,
).filtered_data
print(f'Number of sequences: {se_filtered_seqs.view(pd.Series).shape}')
print(f'{se_filtered_seqs.view(pd.Series).shape[0] / input_artifact["single-end"]["seq"].view(pd.Series).shape[0] * 100:.2f}% of sequences were kept')

## Filter post-menopause Paired-End samples

In [None]:

# Show the index (row IDs) of the paired-end metadata before selecting from it
print(pe_metadata_artifact.to_dataframe().index)

In [None]:
# Select metadata rows
pe_to_keep = pe_metadata_artifact.to_dataframe().index
print(f'Number of samples: {len(pe_to_keep)}')
print(f'pe_to_keep: {pe_to_keep}')
pe_metadata = pe_metadata_artifact.filter_ids(ids_to_keep=pe_to_keep)
pe_metadata.to_dataframe().shape

In [None]:
# Filter feature table for selected samples
pe_filtered_table = filter_samples(
    table = input_artifact['paired-end']['tab'],
    metadata = pe_metadata,
).filtered_table
pe_filtered_table.view(pd.DataFrame).shape
# pe_filtered_table.view(pd.DataFrame).columns

In [None]:
# Filter sequences for selected samples
pe_filtered_seqs = filter_seqs(
    data = input_artifact['paired-end']['seq'],
    table = pe_filtered_table,
).filtered_data
pe_filtered_seqs.view(pd.Series).shape

## Merging Single-End and Paired-End tables

In [None]:
# Merge feature tables
merged_feature_table = merge(
    tables = [se_filtered_table, pe_filtered_table],
).merged_table
merged_feature_table.view(pd.DataFrame).shape

In [None]:
# Merge sequences
merged_sequences = merge_seqs(
    data = [se_filtered_seqs, pe_filtered_seqs]
).merged_data
merged_sequences.view(pd.Series).shape

In [None]:
# Shorten the row labels of the merged table: a label that starts with 'I' is
# replaced by its second '-'-separated field; every other label is kept.
df = merged_feature_table.view(pd.DataFrame)


def _short_sample_id(sample_id):
    """Return the second '-'-separated field of an 'I…' ID, else the ID unchanged.

    IDs that start with 'I' but contain no '-' are returned as-is instead of
    raising IndexError on the missing second field.
    """
    if sample_id.startswith('I') and '-' in sample_id:
        return sample_id.split('-')[1]
    return sample_id


new_idx = [_short_sample_id(i) for i in df.index]

# Shortening can make two different IDs collide (e.g. with an ID that was
# already short). Fail here with the offending IDs rather than later, inside
# the import, with a less specific error.
_renamed_index = pd.Index(new_idx)
if _renamed_index.has_duplicates:
    _clashes = sorted(set(_renamed_index[_renamed_index.duplicated()]))
    raise ValueError(f'Renaming sample IDs produced duplicates: {_clashes}')

df.index = new_idx
merged_feature_table = Artifact.import_data("FeatureTable[Frequency]", df)

## Merging Single-End and Paired-End metadata

In [None]:
# Get metadata rows
se_metadata = se_metadata_artifact.to_dataframe()
pe_metadata = pe_metadata_artifact.to_dataframe()
# Merge two dataframes by rows
metadata_df = pd.concat([se_metadata, pe_metadata])
# Convert metadtata to qiime2 artifact
metadata_qa = Metadata(metadata_df)
metadata_path = os.path.join(data_folder, 'merged-metadata.tsv')
metadata_qa.save(metadata_path)

## Persist merged data

In [None]:
# TODO: save merged data into a new location

# # Replace table and sequences
# merged_feature_table.save(input_path['single-end']['tab'])
# merged_sequences.save(input_path['single-end']['seq'])

In [None]:
# Create view and save frequency table
tabs_view = summarize(table=merged_feature_table, sample_metadata=metadata_qa).visualization
tabs_view.save(os.path.join(qiime_folder, 'abs-freq-tabs.qzv'))

In [None]:
# Create view and save representative sequences
reps_view = tabulate_seqs(data=merged_sequences).visualization
reps_view.save(os.path.join(qiime_folder, 'repr-seqs.qzv'))