# Prepare data for Pipeline

## Setup and settings

In [1]:
# Importing packages
import os
import yaml
import logging
import pandas as pd
from Bio.Seq import Seq
from qiime2 import Artifact, Visualization, Metadata
from qiime2.plugins import demux
from qiime2.plugins.cutadapt.methods import trim_paired, trim_single
from qiime2.plugins.demux.methods import filter_samples

### Receiving the parameters

The following cell can receive parameters using the [papermill](https://papermill.readthedocs.io/en/latest/) tool.

In [2]:
params_path = os.path.join('..', 'params', 'parameter-set-01.yaml')
experiment_name = 'exp-01'
base_dir = os.path.join('/', 'home', 'username', 'pipeline-dir')
manifest_file = os.path.join(base_dir, 'data', 'raw', 'manifest', 'manifest.csv')
replace_files = False
trim = None
metadata_file = None
threads = 1
demux_file = None

In [3]:
# Parameters
experiment_name = "thayane-PM-single-end-trim"
base_dir = "/mnt/nupeb/rede-micro/redemicro-thayane"
manifest_file = "/mnt/nupeb/rede-micro/redemicro-thayane/data/manifest-single.csv"
metadata_file = "/mnt/nupeb/rede-micro/redemicro-thayane/data/single-end-metadata.tsv"
class_col = "above_10"
classifier_file = "/mnt/nupeb/rede-micro/datasets/16S_classifiers_qiime2/silva-138-99-nb-classifier.qza"
replace_files = False
phred = 20
trunc_f = 0
trunc_r = 0
overlap = 12
threads = 6
trim = {
    "overlap": 8,
    "forward_primer": "CCTACGGGRSGCAGCAG",
    "reverse_primer": "GGACTACHVGGGTWTCTAAT",
}


In [4]:
out_dir = os.path.join(base_dir, 'experiments', experiment_name, 'qiime-artifacts')
img_folder = os.path.abspath(os.path.join(base_dir, 'experiments', experiment_name, 'imgs'))
data_dir =  os.path.abspath(os.path.join(base_dir, 'data'))
raw_data_dir =  os.path.abspath(os.path.join(data_dir, 'raw'))
interim_data_dir =  os.path.abspath(os.path.join(data_dir, 'interim'))

### Defining names and paths

In [5]:
# Create path if it not exists
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
    print(f'New qiime-artifacts folder path created: {out_dir}')

# Create path if it not exists
if not os.path.isdir(img_folder):
    os.makedirs(img_folder)
    print(f'New img folder path created: {img_folder}')

In [6]:
manifest_df = pd.read_csv(manifest_file)
n_directions = len(manifest_df['direction'].unique())
if n_directions == 1:
    d_type = 'SampleData[SequencesWithQuality]'
    v_type = 'SingleEndFastqManifestPhred33'
    direction = 'single'
elif n_directions == 2:
    d_type = 'SampleData[PairedEndSequencesWithQuality]'
    v_type = 'PairedEndFastqManifestPhred33'
    direction = 'paired'
else:
    print(f'ERROR: invalid number of directions {n_directions}')

In [7]:
sufix = str(direction)
if trim and len(trim.keys()) == 3:
    sufix += f'-{trim}'
    
# Define the output artifact full path
if demux_file is None:
    demux_file = os.path.join(out_dir, f'demux-{direction}.qza')
demux_view = os.path.join(out_dir, f'demux-{direction}.qzv')
demux_file_trim = os.path.join(out_dir, f'demux-{direction}-trim.qza')
demux_view_trim = os.path.join(out_dir, f'demux-{direction}-trim.qzv')

## Step execution

This Step import all `fastq` files in a **QIIME2 Artifact** object and save it to a new `qza` file.

In [8]:
# If the metadata file is not defined, use the default metadata file
metadata_qa = Metadata.load(metadata_file)

In [9]:
# If the demux file does not exist or if the old file will be replaced
if not os.path.isfile(demux_file) or replace_files:
    
    interim_demux_path = os.path.join(interim_data_dir, f'demux-{direction}.qza')
    
    # Create new file path for interim dir
    if not os.path.isdir(interim_data_dir):
        !mkdir -p {interim_data_dir}
        
    # Load Artifact from interim folder
    if os.path.isfile(interim_demux_path):
        artifact = Artifact.load(interim_demux_path)
        
    # Create new Artifact using Manifest
    else:
        artifact = Artifact.import_data(d_type, manifest_file, view_type=v_type)
        if not os.path.isfile(interim_demux_path):
            artifact.save(interim_demux_path)
            
    # Filter and Save Artifact
    artifact = filter_samples(demux=artifact, metadata=metadata_qa).filtered_demux
    artifact.save(demux_file)
    
    # Visualization
    if os.path.isfile(demux_view):
        demux_view_obj = Visualization.load(demux_view)
    else:
        demux_view_obj = demux.visualizers.summarize(artifact).visualization
        Visualization.save(demux_view_obj, filepath=demux_view)
        
elif os.path.isfile(demux_file):
    
    # Load Artifact
    artifact = Artifact.load(demux_file)
    artifact = filter_samples(demux=artifact, metadata=metadata_qa).filtered_demux
    
    # Visualization
    if os.path.isfile(demux_view):
        demux_view_obj = Visualization.load(demux_view)
    else:
        demux_view_obj = demux.visualizers.summarize(artifact).visualization
        Visualization.save(demux_view_obj, filepath=demux_view)

## Step report


In [10]:
print(demux_view_obj)

# Render Visualization
demux_view_obj

<visualization: Visualization uuid: e6a49c54-3c12-43b7-9056-0db2c08bb18f>


[cutadapt](https://docs.qiime2.org/2022.2/plugins/available/cutadapt/trim-paired/)

We trim the forward primer and the reverse complement of the reverse primer from the forward reads. We trim the reverse primer and reverse complement of the forward primer from the reverse reads.

In [11]:
demux_file_trim = os.path.join(out_dir, f'demux-{direction}-trim.qza')
demux_view_trim = os.path.join(out_dir, f'demux-{direction}-trim.qzv')

if trim and (not os.path.isfile(demux_file_trim) or replace_files):
    forward_primer = [trim['forward_primer']] # ['CCTACGGGRSGCAGCAG']
    reverse_primer = [trim['reverse_primer']] # ['GGACTACHVGGGTWTCTAAT']
    forward_reverse_complement = [str(Seq(forward_primer[0]).reverse_complement())]
    reverse_reverse_complement = [str(Seq(reverse_primer[0]).reverse_complement())]

    res = None
    
    if n_directions == 1:
        res = trim_single(
            demultiplexed_sequences=artifact,
            front=forward_primer,
            adapter=reverse_reverse_complement,
            cores=threads,
            indels=False,
            match_read_wildcards=True,
            match_adapter_wildcards=True,
            error_rate=0.01,
            discard_untrimmed=True,
        ).trimmed_sequences
    elif n_directions == 2:
        res = trim_paired(
            demultiplexed_sequences=artifact,
            front_f=forward_primer,
            front_r=reverse_primer,
            # adapter_f=reverse_reverse_complement,
            # adapter_r=forward_reverse_complement,
            cores=threads,
            overlap=trim['overlap'],
            indels=False,
            match_read_wildcards=True,
            match_adapter_wildcards=True,
            error_rate=0.01,
            discard_untrimmed=True,
        ).trimmed_sequences
    else:
        print(f'ERROR: invalid number of directions {n_directions}')

    if res is not None:
        res.save(demux_file_trim)
        Visualization.save(demux.visualizers.summarize(res).visualization, filepath=demux_view_trim)

Running external command line application. This may print messages to stdout and/or stderr.
The commands to be run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: cutadapt --cores 6 --error-rate 0.01 --times 1 --overlap 3 --minimum-length 1 -q 0,0 --quality-base 33 -o /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-ydo0t00k/M01_0_L001_R1_001.fastq.gz --adapter ATTAGAWACCCBDGTAGTCC --front CCTACGGGRSGCAGCAG --no-indels --match-read-wildcards --discard-untrimmed /tmp/qiime2/lauro/data/eb996efa-66bc-4362-9227-c8b55e8fda78/data/M01_0_L001_R1_001.fastq.gz

This is cutadapt 4.4 with Python 3.8.16
Command line parameters: --cores 6 --error-rate 0.01 --times 1 --overlap 3 --minimum-length 1 -q 0,0 --quality-base 33 -o /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-ydo0t00k/M01_0_L001_R1_001.fastq.gz --adapter ATTAGAWACCCBDGTAGTCC --front CCTACGGGRSGCAGCAG --no-indels --match-read-wildcards --discard-untrimmed /tmp/qiim

Finished in 0.193 s (2.337 µs/read; 25.68 M reads/minute).

=== Summary ===

Total reads processed:                  82,485
Reads with adapters:                         1 (0.0%)

== Read fate breakdown ==
Reads that were too short:                   0 (0.0%)
Reads discarded as untrimmed:           82,484 (100.0%)
Reads written (passing filters):             1 (0.0%)

Total basepairs processed:    33,734,965 bp
Quality-trimmed:                       0 bp (0.0%)
Total written (filtered):            396 bp (0.0%)

=== Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 0 times

=== Adapter 2 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 1 times

Minimum overlap: 3
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
3	1	1288.8	0	1

Command: cutadapt --cores 6 --error-rate 0.01 --times 1 --overlap 3 --minimum-length 1 -q 0,0 --quality-base 33 -o /tmp/q2-CasavaOneEightSingleLane

Finished in 0.158 s (2.407 µs/read; 24.93 M reads/minute).

=== Summary ===

Total reads processed:                  65,820
Reads with adapters:                         0 (0.0%)

== Read fate breakdown ==
Reads that were too short:                   0 (0.0%)
Reads discarded as untrimmed:           65,820 (100.0%)
Reads written (passing filters):             0 (0.0%)

Total basepairs processed:    27,269,491 bp
Quality-trimmed:                       0 bp (0.0%)
Total written (filtered):              0 bp (0.0%)

=== Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 0 times

=== Adapter 2 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 0 times

Command: cutadapt --cores 6 --error-rate 0.01 --times 1 --overlap 3 --minimum-length 1 -q 0,0 --quality-base 33 -o /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-ydo0t00k/M06_2_L001_R1_001.fastq.gz --adapter ATTAGAWACCCBDGTAGTCC --front CCTACGGGRSGCAGCAG --no-indels --match-read-wildcar

Finished in 0.180 s (2.005 µs/read; 29.93 M reads/minute).

=== Summary ===

Total reads processed:                  89,788
Reads with adapters:                         2 (0.0%)

== Read fate breakdown ==
Reads that were too short:                   0 (0.0%)
Reads discarded as untrimmed:           89,786 (100.0%)
Reads written (passing filters):             2 (0.0%)

Total basepairs processed:    36,861,392 bp
Quality-trimmed:                       0 bp (0.0%)
Total written (filtered):            742 bp (0.0%)

=== Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 0 times

=== Adapter 2 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 2 times

Minimum overlap: 3
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
3	2	1402.9	0	2

Command: cutadapt --cores 6 --error-rate 0.01 --times 1 --overlap 3 --minimum-length 1 -q 0,0 --quality-base 33 -o /tmp/q2-CasavaOneEightSingleLane

Finished in 0.191 s (2.288 µs/read; 26.23 M reads/minute).

=== Summary ===

Total reads processed:                  83,685
Reads with adapters:                         2 (0.0%)

== Read fate breakdown ==
Reads that were too short:                   0 (0.0%)
Reads discarded as untrimmed:           83,683 (100.0%)
Reads written (passing filters):             2 (0.0%)

Total basepairs processed:    34,329,999 bp
Quality-trimmed:                       0 bp (0.0%)
Total written (filtered):            807 bp (0.0%)

=== Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 1 times

Minimum overlap: 3
No. of allowed errors:
1-20 bp: 0

Bases preceding removed adapters:
  A: 0.0%
  C: 0.0%
  G: 100.0%
  T: 0.0%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
3	1	1307.6	0	1


=== Adapter 2 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 1 times

Minimum overlap: 3
No. of allowed errors:
1-17 bp:

Finished in 0.156 s (2.320 µs/read; 25.86 M reads/minute).

=== Summary ===

Total reads processed:                  67,446
Reads with adapters:                         1 (0.0%)

== Read fate breakdown ==
Reads that were too short:                   0 (0.0%)
Reads discarded as untrimmed:           67,445 (100.0%)
Reads written (passing filters):             1 (0.0%)

Total basepairs processed:    27,525,749 bp
Quality-trimmed:                       0 bp (0.0%)
Total written (filtered):            404 bp (0.0%)

=== Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 0 times

=== Adapter 2 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 1 times

Minimum overlap: 3
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
3	1	1053.8	0	1

Command: cutadapt --cores 6 --error-rate 0.01 --times 1 --overlap 3 --minimum-length 1 -q 0,0 --quality-base 33 -o /tmp/q2-CasavaOneEightSingleLane

Finished in 0.196 s (2.331 µs/read; 25.74 M reads/minute).

=== Summary ===

Total reads processed:                  83,879
Reads with adapters:                         4 (0.0%)

== Read fate breakdown ==
Reads that were too short:                   0 (0.0%)
Reads discarded as untrimmed:           83,875 (100.0%)
Reads written (passing filters):             4 (0.0%)

Total basepairs processed:    34,967,580 bp
Quality-trimmed:                       0 bp (0.0%)
Total written (filtered):          1,352 bp (0.0%)

=== Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 0 times

=== Adapter 2 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 4 times

Minimum overlap: 3
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
3	4	1310.6	0	4


  context['result_data'] = context['result_data'].append(df)
