In [1]:
# Importing packages
import os
import yaml
import logging
import pandas as pd
from qiime2 import Artifact
from qiime2 import Visualization
from qiime2 import Metadata
import qiime2.plugins.dada2.actions as dada2_actions
import qiime2.plugins.metadata.actions as metadata_actions
from qiime2.plugins.feature_table.visualizers import tabulate_seqs
from qiime2.plugins.feature_table.visualizers import summarize
from qiime2.plugins.feature_table.visualizers import core_features

from qiime2.plugins.feature_table.methods import merge
from qiime2.plugins.feature_table.methods import merge_seqs
from qiime2.plugins.feature_table.methods import merge_taxa

from qiime2.plugins.feature_table.methods import filter_samples
from qiime2.plugins.feature_table.methods import filter_seqs

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Parameters
# NOTE(review): this looks like a parameters cell whose values may be injected
# by a notebook runner (e.g. papermill) — confirm before relying on overrides.

# `experiment_name` and `base_dir` are combined in the "Define paths" cell to
# build <base_dir>/experiments/<experiment_name>.
experiment_name = "thayane-PM"
base_dir = "/home/lauro/nupeb/rede-micro/redemicro-thayane"

# Input file locations. `metadata_file` holds the same path that the
# "Define paths" cell assigns to `se_metadata_path`; `manifest_file` is not
# read by any cell visible in this notebook.
manifest_file = "/home/lauro/nupeb/rede-micro/redemicro-thayane/data/manifest.csv"
metadata_file = "/home/lauro/nupeb/rede-micro/redemicro-thayane/data/metadata.tsv"

# The remaining parameters are not referenced by the cells visible here;
# presumably they are shared with other notebooks of the pipeline — TODO confirm.
class_col = "sample-id"
classifier_file = "/home/lauro/nupeb/dados_brutos_rede_genoma/16S_classifiers_qiime2/silva-138-99-nb-classifier.qza"
# NOTE(review): the final cell overwrites the single-end artifacts regardless
# of this flag — confirm whether it is meant to guard that step.
replace_files = False
phred = 20
trunc_f = 0
trunc_r = 0
overlap = 12
threads = 6

In [3]:
# Define paths
# Every location used below is derived here, from the parameters cell where a
# parameter exists, so the rest of the notebook never hard-codes a path.

experiment_folder = os.path.abspath(os.path.join(base_dir, 'experiments', experiment_name))
img_folder = os.path.abspath(os.path.join(experiment_folder, 'imgs'))
qiime_folder = os.path.join(experiment_folder, 'qiime-artifacts')

# Original Single-End files
# Reuse the `metadata_file` parameter instead of a second hard-coded copy of
# the same path, so overriding the parameter actually takes effect.
se_metadata_path = metadata_file

# Original Paired-End files
pe_metadata_path = "/home/lauro/nupeb/dados_brutos_rede_genoma/metadata/metadata-feces.tsv"
pe_path = '/home/lauro/nupeb/dados_brutos_rede_genoma/qiime_analisys'

# DADA2 outputs per sequencing layout: 'tab' is the feature table artifact,
# 'seq' the representative sequences artifact.
input_path = {
    'single-end': {
        'tab': os.path.join(qiime_folder, 'dada2-tabs.qza'),
        'seq': os.path.join(qiime_folder, 'dada2-reps.qza'),
    },
    'paired-end': {
        'tab': os.path.join(pe_path, 'fezes_demux_paired_dada2_tabs.qza'),
        'seq': os.path.join(pe_path, 'fezes_demux_paired_dada2_reps.qza'),
    },
}

In [4]:
# Load artifacts
# Sample metadata for each sequencing layout
se_metadata_artifact = Metadata.load(se_metadata_path)
pe_metadata_artifact = Metadata.load(pe_metadata_path)

# Mirror the nested layout of `input_path`, replacing every .qza path with the
# loaded artifact (single-end tab/seq first, then paired-end tab/seq).
input_artifact = {
    layout: {kind: Artifact.load(qza_path) for kind, qza_path in paths.items()}
    for layout, paths in input_path.items()
}


In [5]:
# Collect every sample ID found in the index of the single-end feature table
se_table_df = input_artifact['single-end']['tab'].view(pd.DataFrame)
all_ids = set(se_table_df.index)

## Filter exclusively Single-End samples

In [6]:
# Select metadata rows
# IDs of the samples taken from the single-end data
se_to_keep = [
    'M01',
    'M03',
    'M06',
    'M09',
    'M12',
    'M19',
]
se_metadata = se_metadata_artifact.filter_ids(ids_to_keep=se_to_keep)

# Show the retained IDs as a quick sanity check
se_metadata_df = se_metadata.to_dataframe()
se_metadata_df.index

Index(['M01', 'M03', 'M06', 'M09', 'M12', 'M19'], dtype='object', name='sample-id')

In [7]:
# Create filtered table
# Restrict the single-end feature table to the samples present in `se_metadata`
se_filter_results = filter_samples(
    table=input_artifact['single-end']['tab'],
    metadata=se_metadata,
)
se_filtered_table = se_filter_results.filtered_table

# Display the shape of the filtered table
se_filtered_table.view(pd.DataFrame).shape

(6, 231)

In [8]:
# Filter sequences for selected samples
# Keep only the representative sequences whose features remain in
# `se_filtered_table`.
se_filtered_seqs = filter_seqs(
    data = input_artifact['single-end']['seq'],
    table = se_filtered_table,
# The result field is `filtered_data` (as in the paired-end cell); the
# previous `filtered_bdata` was a typo that raises AttributeError.
).filtered_data
se_filtered_seqs.view(pd.Series).shape

(231,)

## Filter post-menopause Paired-End samples

In [8]:
# Select metadata rows
# Target IDs: every ID of the single-end table that was NOT kept as single-end.
# Kept under its own name so `pe_to_keep` is only ever a list of paired-end IDs.
pe_target_ids = all_ids - set(se_to_keep)

# The second '-'-separated field of a paired-end sample ID is matched against
# the target IDs. IDs without a '-' cannot match and are skipped instead of
# raising IndexError.
ind_pe = input_artifact['paired-end']['tab'].view(pd.DataFrame).index
pe_to_keep = [
    sample_id
    for sample_id in ind_pe
    if '-' in sample_id and sample_id.split('-')[1] in pe_target_ids
]
pe_metadata = pe_metadata_artifact.filter_ids(ids_to_keep=pe_to_keep)
pe_metadata.to_dataframe().shape

(38, 8)

In [16]:
# Filter feature table for selected samples
# Restrict the paired-end feature table to the samples present in `pe_metadata`
pe_filter_results = filter_samples(
    table=input_artifact['paired-end']['tab'],
    metadata=pe_metadata,
)
pe_filtered_table = pe_filter_results.filtered_table

# Display the shape of the filtered table
pe_filtered_table.view(pd.DataFrame).shape

(38, 1420)

In [41]:
# Filter sequences for selected samples
# Keep only the representative sequences whose features remain in
# `pe_filtered_table`.
pe_seq_results = filter_seqs(
    data=input_artifact['paired-end']['seq'],
    table=pe_filtered_table,
)
pe_filtered_seqs = pe_seq_results.filtered_data

# Display the shape of the filtered sequence series
pe_filtered_seqs.view(pd.Series).shape

(1420,)

## Merging Single-End and Paired-End tables

In [15]:
# Merge feature tables
# Combine the filtered single-end and paired-end tables into one table
tables_to_merge = [se_filtered_table, pe_filtered_table]
merged_feature_table = merge(tables=tables_to_merge).merged_table

# Display the shape of the merged table
merged_feature_table.view(pd.DataFrame).shape

(44, 1651)

In [45]:
# Merge sequences
# Combine the filtered single-end and paired-end representative sequences
seqs_to_merge = [se_filtered_seqs, pe_filtered_seqs]
merged_sequences = merge_seqs(data=seqs_to_merge).merged_data

# Display the shape of the merged sequence series
merged_sequences.view(pd.Series).shape

(1651,)

In [57]:
# Rename sample IDs of the merged table.
# IDs starting with 'I' are replaced by their second '-'-separated field; all
# other IDs are left untouched. An 'I...' ID without a '-' is also left
# untouched rather than raising IndexError.
df = merged_feature_table.view(pd.DataFrame)
new_idx = [
    i.split('-')[1] if i.startswith('I') and '-' in i else i
    for i in df.index
]

# Shortening IDs can make two samples collide; fail here with a clear message
# instead of handing an ambiguous table to the importer.
renamed_index = pd.Index(new_idx)
duplicated_ids = renamed_index[renamed_index.duplicated()].unique().tolist()
if duplicated_ids:
    raise ValueError(f"Renaming produced duplicated sample IDs: {duplicated_ids}")

df.index = new_idx
# Re-import the renamed frame, replacing `merged_feature_table` with the
# renamed version used by the persist step.
merged_feature_table = Artifact.import_data("FeatureTable[Frequency]", df)

## Persist merged data

In [58]:
# Replace table and sequences
# The merged artifacts are written to the single-end input paths, i.e. the
# files loaded at the top of the notebook are overwritten.
# NOTE(review): this makes a second full run start from the merged data, and
# the `replace_files` parameter is not consulted here — confirm this is intended.
se_output_paths = input_path['single-end']
merged_feature_table.save(se_output_paths['tab'])
merged_sequences.save(se_output_paths['seq'])

'/home/lauro/nupeb/rede-micro/redemicro-thayane/experiments/thayane-PM/qiime-artifacts/dada2-reps.qza'