# Prepare data for Pipeline

## Setup and settings

In [1]:
# Importing packages
import os
import yaml
import logging
import pandas as pd
from Bio.Seq import Seq
from qiime2 import Artifact, Visualization, Metadata
from qiime2.plugins import demux
from qiime2.plugins.cutadapt.methods import trim_paired, trim_single
from qiime2.plugins.demux.methods import filter_samples

### Receiving the parameters

The following cell can receive parameters using the [papermill](https://papermill.readthedocs.io/en/latest/) tool.

In [2]:
# Default parameter values for this notebook; papermill can override them
# by injecting a new parameters cell right after this one.
params_path = os.path.join('..', 'params', 'parameter-set-01.yaml')
experiment_name = 'exp-01'  # names the experiment folder under <base_dir>/experiments
base_dir = os.path.join('/', 'home', 'username', 'pipeline-dir')  # placeholder project root
manifest_file = os.path.join(base_dir, 'data', 'raw', 'manifest', 'manifest.csv')
replace_files = False  # when True, existing artifacts are rebuilt instead of reused
trim = None  # falsy skips primer trimming; otherwise a dict with 'forward_primer', 'reverse_primer' and 'overlap'
metadata_file = None  # path given to Metadata.load; no usable default is defined here
threads = 1  # forwarded as `cores` to the cutadapt trimming calls
demux_file = None  # when None, falls back to <out_dir>/demux-paired.qza

In [3]:
# Parameters
# Values for this run; names assigned here override the defaults set in the previous cell.
# NOTE(review): class_col, classifier_file, top_n, phred, trunc_f, trunc_r and the
# top-level overlap are not read by any cell visible in this section — presumably
# they are consumed by other pipeline steps; confirm before removing.
experiment_name = "ana-flavia-NCxSTD-NC-trim"
base_dir = "/home/lauro/nupeb/rede-micro/redemicro-ana-flavia-nutri"
manifest_file = "/home/lauro/nupeb/rede-micro/redemicro-ana-flavia-nutri/data/raw/manifest/manifest-ana-flavia-NCxSTD-NC.csv"
metadata_file = "/home/lauro/nupeb/rede-micro/redemicro-ana-flavia-nutri/data/raw/metadata/metadata-ana-flavia-NCxSTD-NC.tsv"
class_col = "group-id"
classifier_file = "/home/lauro/nupeb/rede-micro/models/silva-138-99-nb-classifier.qza"
top_n = 20
replace_files = False
phred = 20
trunc_f = 0
trunc_r = 0
overlap = 12
threads = 6
# Primer-trimming settings: the minimum overlap and the two primer sequences
trim = {
    "overlap": 8,
    "forward_primer": "CCTACGGGRSGCAGCAG",
    "reverse_primer": "GGACTACHVGGGTWTCTAAT",
}


In [4]:
# Per-experiment output locations: QIIME2 artifacts and images
out_dir = os.path.join(
    base_dir, 'experiments', experiment_name, 'qiime-artifacts'
)
img_folder = os.path.abspath(
    os.path.join(base_dir, 'experiments', experiment_name, 'imgs')
)

# Data locations; the raw and interim folders live under the data folder
data_dir = os.path.abspath(os.path.join(base_dir, 'data'))
raw_data_dir = os.path.abspath(os.path.join(data_dir, 'raw'))
interim_data_dir = os.path.abspath(os.path.join(data_dir, 'interim'))

### Defining names and paths

In [5]:
# Create the output folders if they do not exist yet, reporting each new one
for _folder, _message in (
    (out_dir, f'New qiime-artifacts folder path created: {out_dir}'),
    (img_folder, f'New img folder path created: {img_folder}'),
):
    if not os.path.isdir(_folder):
        os.makedirs(_folder)
        print(_message)

# Full paths of the output artifacts and visualizations.
# A `demux_file` received as a parameter takes precedence over the default name.
if demux_file is None:
    demux_file = os.path.join(out_dir, 'demux-paired.qza')
demux_view, demux_file_trim, demux_view_trim = (
    os.path.join(out_dir, _name)
    for _name in (
        'demux-paired.qzv',
        'demux-paired-trim.qza',
        'demux-paired-trim.qzv',
    )
)

New qiime-artifacts folder path created: /home/lauro/nupeb/rede-micro/redemicro-ana-flavia-nutri/experiments/ana-flavia-NCxSTD-NC-trim/qiime-artifacts
New img folder path created: /home/lauro/nupeb/rede-micro/redemicro-ana-flavia-nutri/experiments/ana-flavia-NCxSTD-NC-trim/imgs


## Step execution

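This step imports all `fastq` files into a **QIIME2 Artifact** object and saves it to a new `qza` file of type `SampleData[PairedEndSequencesWithQuality]` (or `SampleData[SequencesWithQuality]` when the manifest contains a single read direction).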

In [6]:
# Read the manifest and pick the QIIME2 semantic type / import format
# according to how many distinct read directions it lists.
manifest_df = pd.read_csv(manifest_file)
n_directions = len(manifest_df['direction'].unique())
if n_directions == 1:
    # Single-end reads
    d_type = 'SampleData[SequencesWithQuality]'
    v_type = 'SingleEndFastqManifestPhred33'
elif n_directions == 2:
    # Paired-end reads
    d_type = 'SampleData[PairedEndSequencesWithQuality]'
    v_type = 'PairedEndFastqManifestPhred33'
else:
    # Stop here: continuing would leave d_type/v_type undefined and make the
    # import cell fail later with an unrelated NameError.
    raise ValueError(
        f'Invalid number of directions {n_directions} in manifest {manifest_file}; '
        'expected 1 (single-end) or 2 (paired-end)'
    )

In [7]:
# Load the sample metadata used to filter the demultiplexed samples.
# There is no fallback metadata file: `metadata_file` defaults to None and
# must be supplied as a notebook parameter.
if metadata_file is None:
    raise ValueError('The `metadata_file` parameter is required but was not provided')
metadata_qa = Metadata.load(metadata_file)

In [8]:
# Build (or reload) the demultiplexed artifact of this experiment and its
# summary visualization.
# The artifact is rebuilt when it does not exist yet or when `replace_files` is set.
rebuild_demux = replace_files or not os.path.isfile(demux_file)

if rebuild_demux:
    interim_demux_path = os.path.join(interim_data_dir, "demux-paired.qza")
    # NOTE(review): the interim cache uses a fixed file name and is reused even
    # when `replace_files` is True — confirm it cannot hold reads imported from
    # a different manifest.
    if os.path.isfile(interim_demux_path):
        # Reuse the unfiltered import cached in the interim folder
        artifact = Artifact.load(interim_demux_path)
    else:
        # Import the reads listed in the manifest and cache the unfiltered result
        artifact = Artifact.import_data(d_type, manifest_file, view_type=v_type)
        os.makedirs(interim_data_dir, exist_ok=True)
        artifact.save(interim_demux_path)
    # Keep only the samples present in the metadata and save the result
    artifact = filter_samples(demux=artifact, metadata=metadata_qa).filtered_demux
    artifact.save(demux_file)
else:
    # Reuse the saved artifact, filtered by the current metadata
    artifact = Artifact.load(demux_file)
    artifact = filter_samples(demux=artifact, metadata=metadata_qa).filtered_demux

# Visualization: a saved summary is reused only when the artifact itself was
# reused; after a rebuild the old summary would describe outdated data.
if os.path.isfile(demux_view) and not rebuild_demux:
    demux_view_obj = Visualization.load(demux_view)
else:
    demux_view_obj = demux.visualizers.summarize(artifact).visualization
    Visualization.save(demux_view_obj, filepath=demux_view)

## Step report


In [9]:
# Print the text representation of the visualization (shows its UUID)
print(demux_view_obj)

<visualization: Visualization uuid: 1d38aade-9ada-471b-8b18-18496c99c5ec>


In [10]:
# Render the demux summary visualization as the cell output
demux_view_obj

[cutadapt](https://docs.qiime2.org/2022.2/plugins/available/cutadapt/trim-paired/)

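For paired-end data, we trim the forward primer from the 5' end of the forward reads and the reverse primer from the 5' end of the reverse reads. For single-end data, we trim the forward primer from the 5' end and the reverse complement of the reverse primer from the 3' end of the reads. Untrimmed reads are discarded (`discard_untrimmed=True`). Trimming the reverse-complemented primers from the 3' ends of paired-end reads (`adapter_f` / `adapter_r`) is currently disabled in the code below.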

In [11]:
# Output paths for the primer-trimmed reads and their summary visualization
demux_file_trim = os.path.join(out_dir, 'demux-paired-trim.qza')
demux_view_trim = os.path.join(out_dir, 'demux-paired-trim.qzv')

# Trim only when a `trim` configuration was given and the trimmed artifact is
# missing or must be replaced.
if trim and (not os.path.isfile(demux_file_trim) or replace_files):
    # cutadapt receives lists of sequences, hence the one-element lists
    forward_primer = [trim['forward_primer']]  # e.g. ['CCTACGGGRSGCAGCAG']
    reverse_primer = [trim['reverse_primer']]  # e.g. ['GGACTACHVGGGTWTCTAAT']
    forward_reverse_complement = [str(Seq(forward_primer[0]).reverse_complement())]
    reverse_reverse_complement = [str(Seq(reverse_primer[0]).reverse_complement())]

    # Options shared by the single-end and paired-end calls. The configured
    # overlap now applies to both layouts; 3 is the cutadapt plugin default,
    # used when the `trim` configuration has no 'overlap' entry.
    cutadapt_options = dict(
        cores=threads,
        overlap=trim.get('overlap', 3),
        indels=False,
        match_read_wildcards=True,
        match_adapter_wildcards=True,
        error_rate=0.01,
        discard_untrimmed=True,
    )

    if n_directions == 1:
        # Single-end: forward primer at the 5' end, reverse-complemented
        # reverse primer at the 3' end
        res = trim_single(
            demultiplexed_sequences=artifact,
            front=forward_primer,
            adapter=reverse_reverse_complement,
            **cutadapt_options,
        ).trimmed_sequences
    elif n_directions == 2:
        # Paired-end: only the 5' primers are trimmed. 3' trimming is disabled;
        # to enable it, also pass adapter_f=reverse_reverse_complement and
        # adapter_r=forward_reverse_complement.
        res = trim_paired(
            demultiplexed_sequences=artifact,
            front_f=forward_primer,
            front_r=reverse_primer,
            **cutadapt_options,
        ).trimmed_sequences
    else:
        # Fail loudly instead of silently skipping the trimming step
        raise ValueError(f'Invalid number of directions {n_directions}; expected 1 or 2')

    # Save the trimmed reads together with their summary visualization
    res.save(demux_file_trim)
    Visualization.save(demux.visualizers.summarize(res).visualization, filepath=demux_view_trim)

Running external command line application. This may print messages to stdout and/or stderr.
The commands to be run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: cutadapt --cores 6 --error-rate 0.01 --times 1 --overlap 8 --minimum-length 1 -o /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-8_hk7xjb/S210421121673_0_L001_R1_001.fastq.gz -p /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-8_hk7xjb/S210421121673_1_L001_R2_001.fastq.gz --front CCTACGGGRSGCAGCAG -G GGACTACHVGGGTWTCTAAT --no-indels --match-read-wildcards --discard-untrimmed /tmp/qiime2-archive-a3dl5c10/9bc974da-6c1a-46b4-848f-a51513cc37e7/data/S210421121673_0_L001_R1_001.fastq.gz /tmp/qiime2-archive-a3dl5c10/9bc974da-6c1a-46b4-848f-a51513cc37e7/data/S210421121673_1_L001_R2_001.fastq.gz



This is cutadapt 3.5 with Python 3.8.12
Command line parameters: --cores 6 --error-rate 0.01 --times 1 --overlap 8 --minimum-length 1 -o /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-8_hk7xjb/S210421121673_0_L001_R1_001.fastq.gz -p /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-8_hk7xjb/S210421121673_1_L001_R2_001.fastq.gz --front CCTACGGGRSGCAGCAG -G GGACTACHVGGGTWTCTAAT --no-indels --match-read-wildcards --discard-untrimmed /tmp/qiime2-archive-a3dl5c10/9bc974da-6c1a-46b4-848f-a51513cc37e7/data/S210421121673_0_L001_R1_001.fastq.gz /tmp/qiime2-archive-a3dl5c10/9bc974da-6c1a-46b4-848f-a51513cc37e7/data/S210421121673_1_L001_R2_001.fastq.gz
Processing reads on 6 cores in paired-end mode ...


Finished in 3.77 s (10 µs/read; 5.89 M reads/minute).

=== Summary ===

Total read pairs processed:            370,042
  Read 1 with adapter:                 359,997 (97.3%)
  Read 2 with adapter:                 343,009 (92.7%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           36,262 (9.8%)
Pairs written (passing filters):       333,780 (90.2%)

Total basepairs processed:   188,721,420 bp
  Read 1:   112,862,810 bp
  Read 2:    75,858,610 bp
Total written (filtered):    155,609,062 bp (82.5%)
  Read 1:    94,978,565 bp
  Read 2:    60,630,497 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 359997 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	3	5.6	0	3
9	4	1.4	0	4
10	3	0.4	0	3
11	2	0.1	0	2
12	4	0.0	0	4
13	4	0.0	0	4
14	12	0.0	0	12
15	9	0.0	0	9
16	19	0.0	0	19
17	109	0.0	0	109
1

Finished in 5.08 s (9 µs/read; 6.61 M reads/minute).

=== Summary ===

Total read pairs processed:            559,807
  Read 1 with adapter:                 544,656 (97.3%)
  Read 2 with adapter:                 519,894 (92.9%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           53,686 (9.6%)
Pairs written (passing filters):       506,121 (90.4%)

Total basepairs processed:   285,501,570 bp
  Read 1:   170,741,135 bp
  Read 2:   114,760,435 bp
Total written (filtered):    235,957,214 bp (82.6%)
  Read 1:   144,019,667 bp
  Read 2:    91,937,547 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 544656 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	5	8.5	0	5
9	4	2.1	0	4
10	3	0.5	0	3
11	7	0.1	0	7
12	7	0.0	0	7
13	6	0.0	0	6
14	17	0.0	0	17
15	16	0.0	0	16
16	22	0.0	0	22
17	155	0.0	0	155


Finished in 3.35 s (10 µs/read; 6.19 M reads/minute).

=== Summary ===

Total read pairs processed:            345,762
  Read 1 with adapter:                 336,819 (97.4%)
  Read 2 with adapter:                 321,022 (92.8%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           32,941 (9.5%)
Pairs written (passing filters):       312,821 (90.5%)

Total basepairs processed:   176,338,620 bp
  Read 1:   105,457,410 bp
  Read 2:    70,881,210 bp
Total written (filtered):    145,841,282 bp (82.7%)
  Read 1:    89,015,404 bp
  Read 2:    56,825,878 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 336819 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	5	5.3	0	5
9	3	1.3	0	3
10	5	0.3	0	5
11	2	0.1	0	2
12	4	0.0	0	4
13	5	0.0	0	5
14	12	0.0	0	12
15	9	0.0	0	9
16	11	0.0	0	11
17	92	0.0	0	92
18	

Finished in 6.46 s (9 µs/read; 6.62 M reads/minute).

=== Summary ===

Total read pairs processed:            712,750
  Read 1 with adapter:                 694,205 (97.4%)
  Read 2 with adapter:                 664,033 (93.2%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           65,715 (9.2%)
Pairs written (passing filters):       647,035 (90.8%)

Total basepairs processed:   363,502,500 bp
  Read 1:   217,388,750 bp
  Read 2:   146,113,750 bp
Total written (filtered):    301,650,677 bp (83.0%)
  Read 1:   184,115,305 bp
  Read 2:   117,535,372 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 694205 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	4	10.9	0	4
9	9	2.7	0	9
10	5	0.7	0	5
11	14	0.2	0	14
12	8	0.0	0	8
13	6	0.0	0	6
14	30	0.0	0	30
15	18	0.0	0	18
16	33	0.0	0	33
17	183	0.0	0	1

Finished in 1.69 s (10 µs/read; 5.88 M reads/minute).

=== Summary ===

Total read pairs processed:            165,821
  Read 1 with adapter:                 161,411 (97.3%)
  Read 2 with adapter:                 154,014 (92.9%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           15,834 (9.5%)
Pairs written (passing filters):       149,987 (90.5%)

Total basepairs processed:    84,568,710 bp
  Read 1:    50,575,405 bp
  Read 2:    33,993,305 bp
Total written (filtered):     69,924,570 bp (82.7%)
  Read 1:    42,679,301 bp
  Read 2:    27,245,269 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 161411 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	3	2.5	0	3
9	1	0.6	0	1
10	3	0.2	0	3
11	4	0.0	0	4
12	1	0.0	0	1
13	3	0.0	0	3
14	8	0.0	0	8
15	1	0.0	0	1
16	4	0.0	0	4
17	46	0.0	0	46
18	2512

Finished in 1.56 s (10 µs/read; 5.74 M reads/minute).

=== Summary ===

Total read pairs processed:            149,412
  Read 1 with adapter:                 145,228 (97.2%)
  Read 2 with adapter:                 138,274 (92.5%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           14,953 (10.0%)
Pairs written (passing filters):       134,459 (90.0%)

Total basepairs processed:    76,200,120 bp
  Read 1:    45,570,660 bp
  Read 2:    30,629,460 bp
Total written (filtered):     62,685,460 bp (82.3%)
  Read 1:    38,260,442 bp
  Read 2:    24,425,018 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 145228 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	1	2.3	0	1
10	1	0.1	0	1
11	2	0.0	0	2
12	2	0.0	0	2
13	3	0.0	0	3
14	8	0.0	0	8
15	2	0.0	0	2
16	10	0.0	0	10
17	45	0.0	0	45
18	22753	0.0	0	2

Finished in 3.27 s (10 µs/read; 6.29 M reads/minute).

=== Summary ===

Total read pairs processed:            343,261
  Read 1 with adapter:                 334,370 (97.4%)
  Read 2 with adapter:                 318,364 (92.7%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           32,990 (9.6%)
Pairs written (passing filters):       310,271 (90.4%)

Total basepairs processed:   175,063,110 bp
  Read 1:   104,694,605 bp
  Read 2:    70,368,505 bp
Total written (filtered):    144,648,355 bp (82.6%)
  Read 1:    88,289,164 bp
  Read 2:    56,359,191 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 334370 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	1	5.2	0	1
9	5	1.3	0	5
10	2	0.3	0	2
11	5	0.1	0	5
12	2	0.0	0	2
13	1	0.0	0	1
14	11	0.0	0	11
15	9	0.0	0	9
16	13	0.0	0	13
17	90	0.0	0	90
18	

Finished in 0.72 s (12 µs/read; 4.90 M reads/minute).

=== Summary ===

Total read pairs processed:             58,992
  Read 1 with adapter:                  57,341 (97.2%)
  Read 2 with adapter:                  54,208 (91.9%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:            6,281 (10.6%)
Pairs written (passing filters):        52,711 (89.4%)

Total basepairs processed:    30,085,920 bp
  Read 1:    17,992,560 bp
  Read 2:    12,093,360 bp
Total written (filtered):     24,574,036 bp (81.7%)
  Read 1:    14,999,414 bp
  Read 2:     9,574,622 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 57341 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
9	1	0.2	0	1
11	1	0.0	0	1
12	1	0.0	0	1
14	1	0.0	0	1
15	6	0.0	0	6
16	3	0.0	0	3
17	10	0.0	0	10
18	8921	0.0	0	8921
19	10341	0.0	0	10341
20	10

Finished in 2.28 s (10 µs/read; 6.04 M reads/minute).

=== Summary ===

Total read pairs processed:            229,809
  Read 1 with adapter:                 223,555 (97.3%)
  Read 2 with adapter:                 212,081 (92.3%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           23,391 (10.2%)
Pairs written (passing filters):       206,418 (89.8%)

Total basepairs processed:   117,202,590 bp
  Read 1:    70,091,745 bp
  Read 2:    47,110,845 bp
Total written (filtered):     96,229,804 bp (82.1%)
  Read 1:    58,736,074 bp
  Read 2:    37,493,730 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 223555 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	2	3.5	0	2
9	4	0.9	0	4
10	3	0.2	0	3
11	6	0.1	0	6
12	6	0.0	0	6
13	2	0.0	0	2
14	14	0.0	0	14
15	8	0.0	0	8
16	6	0.0	0	6
17	64	0.0	0	64
18	3

Finished in 1.25 s (11 µs/read; 5.52 M reads/minute).

=== Summary ===

Total read pairs processed:            114,634
  Read 1 with adapter:                 111,413 (97.2%)
  Read 2 with adapter:                 104,800 (91.4%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           12,724 (11.1%)
Pairs written (passing filters):       101,910 (88.9%)

Total basepairs processed:    58,463,340 bp
  Read 1:    34,963,370 bp
  Read 2:    23,499,970 bp
Total written (filtered):     47,509,206 bp (81.3%)
  Read 1:    28,997,978 bp
  Read 2:    18,511,228 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 111413 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	1	1.7	0	1
10	4	0.1	0	4
11	5	0.0	0	5
12	1	0.0	0	1
13	1	0.0	0	1
14	5	0.0	0	5
15	5	0.0	0	5
16	3	0.0	0	3
17	33	0.0	0	33
18	17408	0.0	0	174

Finished in 0.67 s (14 µs/read; 4.41 M reads/minute).

=== Summary ===

Total read pairs processed:             49,335
  Read 1 with adapter:                  47,930 (97.2%)
  Read 2 with adapter:                  45,553 (92.3%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:            5,055 (10.2%)
Pairs written (passing filters):        44,280 (89.8%)

Total basepairs processed:    25,160,850 bp
  Read 1:    15,047,175 bp
  Read 2:    10,113,675 bp
Total written (filtered):     20,642,581 bp (82.0%)
  Read 1:    12,599,485 bp
  Read 2:     8,043,096 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 47930 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
9	1	0.2	0	1
11	2	0.0	0	2
12	1	0.0	0	1
14	1	0.0	0	1
15	2	0.0	0	2
16	3	0.0	0	3
17	16	0.0	0	16
18	7269	0.0	0	7269
19	8668	0.0	0	8668
20	8944