# Prepare data for Pipeline

## Setup and settings

In [1]:
# Importing packages
import os
import yaml
import logging
import pandas as pd
from Bio.Seq import Seq
from qiime2 import Artifact, Visualization
from qiime2.plugins import demux
from qiime2.plugins.cutadapt.methods import trim_paired

### Receiving the parameters

The following cell can receive parameters using the [papermill](https://papermill.readthedocs.io/en/latest/) tool.

In [2]:
# Default parameter values; papermill can override them with an injected cell.
params_path = os.path.join("..", "params", "ana-flavia-superlactacao.yaml")
experiment_name = "jenneffer-vs-01"

# Project root and the locations derived from it.
base_dir = os.path.join("/", "home", "lauro", "nupeb", "redemicro")
manifest_file = os.path.join(
    base_dir, "data", "raw", "manifest", "not-hist-vs-manifest.csv"
)
img_folder = os.path.abspath(
    os.path.join(base_dir, "imgs")
)

# Existing output files are reused unless replace_files is truthy;
# primer trimming is skipped while trim is falsy.
replace_files = False
trim = None

In [3]:
# Parameters
# Experiment identity and input file locations.
experiment_name = 'miliane-CxAC-trim'
base_dir = '/mnt/nupeb/rede-micro/redemicro-miliane-nutri'
manifest_file = '/mnt/nupeb/rede-micro/redemicro-miliane-nutri/data/raw/manifest/miliane-manifest-CxAC.csv'
metadata_file = '/mnt/nupeb/rede-micro/redemicro-miliane-nutri/data/raw/metadata/miliane-metadata-CxAC.tsv'
class_col = 'group-id'
classifier_file = '/mnt/nupeb/rede-micro/qiime2-classifiers/silva-138-99-nb-classifier.qza'

# Run options.
replace_files = False
phred = 20
trunc_f = 0
trunc_r = 0
overlap = 12
threads = 6

# Primer-trimming settings; a falsy value skips the trimming cell.
trim = dict(
    overlap=8,
    forward_primer='CCTACGGGRSGCAGCAG',
    reverse_primer='GGACTACHVGGGTWTCTAAT',
)


### Defining names and paths

In [4]:
# Output folders for this experiment: QIIME2 artifacts and images.
out_dir = os.path.join(base_dir, 'experiments', experiment_name, 'qiime-artifacts')
img_folder = os.path.abspath(os.path.join(base_dir, 'experiments', experiment_name, 'imgs'))

# Create each folder if it does not exist yet. `exist_ok=True` keeps the call
# from failing when the folder appears between the check and the creation
# (e.g. another run of the same experiment); the message is still printed only
# when the folder was not there beforehand.
for folder_label, folder_path in (('qiime-artifacts', out_dir), ('img', img_folder)):
    folder_is_new = not os.path.isdir(folder_path)
    os.makedirs(folder_path, exist_ok=True)
    if folder_is_new:
        print(f'New {folder_label} folder path created: {folder_path}')

# Full paths of the artifact (.qza) and visualization (.qzv) files,
# for the imported reads and for the primer-trimmed reads.
demux_file = os.path.join(out_dir, 'demux-paired.qza')
demux_view = os.path.join(out_dir, 'demux-paired.qzv')
demux_file_trim = os.path.join(out_dir, 'demux-paired-trim.qza')
demux_view_trim = os.path.join(out_dir, 'demux-paired-trim.qzv')

## Step execution

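This step imports all `fastq` files listed in the manifest into a **QIIME2 Artifact** object and saves it to a new `qza` file of type `SampleData[PairedEndSequencesWithQuality]` (or `SampleData[SequencesWithQuality]` when the manifest contains a single read direction).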

In [5]:
# Read the manifest and count the distinct values in its 'direction' column.
manifest_df = pd.read_csv(manifest_file)
n_directions = len(manifest_df['direction'].unique())

# Semantic type (d_type) and import format (v_type) for each supported
# number of read directions.
import_types_by_directions = {
    1: ('SampleData[SequencesWithQuality]', 'SingleEndFastqManifestPhred33'),
    2: ('SampleData[PairedEndSequencesWithQuality]', 'PairedEndFastqManifestPhred33'),
}

if n_directions not in import_types_by_directions:
    # Stop here: only printing would leave d_type/v_type undefined (or stale
    # from an earlier kernel run) and the import cell would fail later with a
    # less helpful error.
    raise ValueError(f'ERROR: invalid number of directions {n_directions}')

d_type, v_type = import_types_by_directions[n_directions]

In [6]:
# Reuse the saved artifact when it exists and no replacement was requested;
# otherwise import the reads listed in the manifest and save them.
if os.path.isfile(demux_file) and not replace_files:
    artifact = Artifact.load(demux_file)
else:
    # Build the artifact from the manifest using the detected type and format
    artifact = Artifact.import_data(d_type, manifest_file, view_type=v_type)

    # Write the artifact to the qza file
    artifact.save(demux_file)

In [7]:
# Reuse the saved visualization when it exists and no replacement was
# requested; otherwise summarize the artifact and save the result.
if os.path.isfile(demux_view) and not replace_files:
    demux_view_obj = Visualization.load(demux_view)
else:
    # Generate a visualization of the artifact
    demux_summary = demux.visualizers.summarize(artifact)
    demux_view_obj = demux_summary.visualization

    # Write the visualization to the qzv file
    demux_view_obj.save(filepath=demux_view)

## Step report


In [8]:
# Print the text representation of the visualization object.
print(demux_view_obj)

<visualization: Visualization uuid: f8382eb5-41f8-4188-a086-5ae23bb70390>


In [9]:
# Render Visualization: the object is left as the cell's last expression so
# that the notebook displays it.
demux_view_obj

[cutadapt](https://docs.qiime2.org/2022.2/plugins/available/cutadapt/trim-paired/)

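We trim the forward primer from the 5' end of the forward reads and the reverse primer from the 5' end of the reverse reads; read pairs in which a primer is not found are discarded. Additionally trimming the reverse complement of the reverse primer from the forward reads and the reverse complement of the forward primer from the reverse reads is supported by the code below, but those arguments (`adapter_f` / `adapter_r`) are currently commented out.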

In [10]:
# Output paths for the trimmed artifact and its visualization.
demux_file_trim = os.path.join(out_dir, 'demux-paired-trim.qza')
demux_view_trim = os.path.join(out_dir, 'demux-paired-trim.qzv')

# Run cutadapt only when trimming is configured and the trimmed artifact is
# missing or must be replaced.
if trim and (not os.path.isfile(demux_file_trim) or replace_files):
    forward_primer = [trim['forward_primer']] # ['CCTACGGGRSGCAGCAG']
    reverse_primer = [trim['reverse_primer']] # ['GGACTACHVGGGTWTCTAAT']

    # Reverse complements of both primers; only needed by the adapter_f /
    # adapter_r arguments, which are currently disabled below.
    forward_reverse_complement = [str(Seq(forward_primer[0]).reverse_complement())]
    reverse_reverse_complement = [str(Seq(reverse_primer[0]).reverse_complement())]

    res = trim_paired(
        demultiplexed_sequences=artifact,
        front_f=forward_primer,
        front_r=reverse_primer,
#         adapter_f=reverse_reverse_complement,
#         adapter_r=forward_reverse_complement,
        cores=threads,
        overlap=trim['overlap'],
        indels=False,
        match_read_wildcards=True,
        match_adapter_wildcards=True,
        error_rate=0.01,
        discard_untrimmed=True,
    ).trimmed_sequences

    # Save the trimmed reads, then summarize them once and save that summary
    # (the summary used to be computed twice, with the first result discarded).
    res.save(demux_file_trim)
    trim_summary_view = demux.visualizers.summarize(res).visualization
    trim_summary_view.save(demux_view_trim)

Running external command line application. This may print messages to stdout and/or stderr.
The commands to be run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: cutadapt --cores 6 --error-rate 0.01 --times 1 --overlap 8 --minimum-length 1 -q 0,0 --quality-base 33 -o /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-w6ibsml7/210421121711_0_L001_R1_001.fastq.gz -p /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-w6ibsml7/210421121711_1_L001_R2_001.fastq.gz --front CCTACGGGRSGCAGCAG -G GGACTACHVGGGTWTCTAAT --no-indels --match-read-wildcards --discard-untrimmed /tmp/qiime2/lauro/data/fa2ef6ea-dacc-44dd-8ed2-358980c9d77e/data/210421121711_0_L001_R1_001.fastq.gz /tmp/qiime2/lauro/data/fa2ef6ea-dacc-44dd-8ed2-358980c9d77e/data/210421121711_1_L001_R2_001.fastq.gz

This is cutadapt 4.4 with Python 3.8.16
Command line parameters: --cores 6 --error-rate 0.01 --times 1 --overlap 8 --minimum-length 1 -q 0,0 --quality-base 33 

Finished in 0.280 s (5.625 µs/read; 10.67 M reads/minute).

=== Summary ===

Total read pairs processed:             49,793
  Read 1 with adapter:                  48,368 (97.1%)
  Read 2 with adapter:                  45,264 (90.9%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:            5,807 (11.7%)
Pairs written (passing filters):        43,986 (88.3%)

Total basepairs processed:    25,394,430 bp
  Read 1:    15,186,865 bp
  Read 2:    10,207,565 bp
Quality-trimmed:                       0 bp (0.0%)
  Read 1:             0 bp
  Read 2:             0 bp
Total written (filtered):     20,506,528 bp (80.8%)
  Read 1:    12,516,130 bp
  Read 2:     7,990,398 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 48368 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	1	0.8	0	1
11	2	0.0	0	2
12

Finished in 0.815 s (4.134 µs/read; 14.51 M reads/minute).

=== Summary ===

Total read pairs processed:            197,224
  Read 1 with adapter:                 191,943 (97.3%)
  Read 2 with adapter:                 181,981 (92.3%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           19,971 (10.1%)
Pairs written (passing filters):       177,253 (89.9%)

Total basepairs processed:   100,584,240 bp
  Read 1:    60,153,320 bp
  Read 2:    40,430,920 bp
Quality-trimmed:                       0 bp (0.0%)
  Read 1:             0 bp
  Read 2:             0 bp
Total written (filtered):     82,634,027 bp (82.2%)
  Read 1:    50,437,083 bp
  Read 2:    32,196,944 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 191943 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
10	3	0.2	0	3
11	2	0.0	0	2


Finished in 0.721 s (4.219 µs/read; 14.22 M reads/minute).

=== Summary ===

Total read pairs processed:            170,758
  Read 1 with adapter:                 165,742 (97.1%)
  Read 2 with adapter:                 156,948 (91.9%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           18,349 (10.7%)
Pairs written (passing filters):       152,409 (89.3%)

Total basepairs processed:    87,086,580 bp
  Read 1:    52,081,190 bp
  Read 2:    35,005,390 bp
Quality-trimmed:                       0 bp (0.0%)
  Read 1:             0 bp
  Read 2:             0 bp
Total written (filtered):     71,052,054 bp (81.6%)
  Read 1:    43,367,422 bp
  Read 2:    27,684,632 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 165742 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	2	2.6	0	2
9	3	0.7	0	3
10

Finished in 1.175 s (3.793 µs/read; 15.82 M reads/minute).

=== Summary ===

Total read pairs processed:            309,657
  Read 1 with adapter:                 301,273 (97.3%)
  Read 2 with adapter:                 284,385 (91.8%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           32,799 (10.6%)
Pairs written (passing filters):       276,858 (89.4%)

Total basepairs processed:   157,925,070 bp
  Read 1:    94,445,385 bp
  Read 2:    63,479,685 bp
Quality-trimmed:                       0 bp (0.0%)
  Read 1:             0 bp
  Read 2:             0 bp
Total written (filtered):    129,071,295 bp (81.7%)
  Read 1:    78,780,305 bp
  Read 2:    50,290,990 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 301273 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	2	4.7	0	2
9	6	1.2	0	6
10

Finished in 1.565 s (3.758 µs/read; 15.97 M reads/minute).

=== Summary ===

Total read pairs processed:            416,343
  Read 1 with adapter:                 405,322 (97.4%)
  Read 2 with adapter:                 382,754 (91.9%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           43,507 (10.4%)
Pairs written (passing filters):       372,836 (89.6%)

Total basepairs processed:   212,334,930 bp
  Read 1:   126,984,615 bp
  Read 2:    85,350,315 bp
Quality-trimmed:                       0 bp (0.0%)
  Read 1:             0 bp
  Read 2:             0 bp
Total written (filtered):    173,816,084 bp (81.9%)
  Read 1:   106,092,368 bp
  Read 2:    67,723,716 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 405322 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	2	6.4	0	2
9	4	1.6	0	4
10

Finished in 0.918 s (4.158 µs/read; 14.43 M reads/minute).

=== Summary ===

Total read pairs processed:            220,699
  Read 1 with adapter:                 214,683 (97.3%)
  Read 2 with adapter:                 202,229 (91.6%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           23,902 (10.8%)
Pairs written (passing filters):       196,797 (89.2%)

Total basepairs processed:   112,556,490 bp
  Read 1:    67,313,195 bp
  Read 2:    45,243,295 bp
Quality-trimmed:                       0 bp (0.0%)
  Read 1:             0 bp
  Read 2:             0 bp
Total written (filtered):     91,746,847 bp (81.5%)
  Read 1:    55,999,931 bp
  Read 2:    35,746,916 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 214683 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
9	1	0.8	0	1
10	2	0.2	0	2
1

Finished in 0.575 s (5.113 µs/read; 11.73 M reads/minute).

=== Summary ===

Total read pairs processed:            112,481
  Read 1 with adapter:                 109,357 (97.2%)
  Read 2 with adapter:                 102,843 (91.4%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           12,454 (11.1%)
Pairs written (passing filters):       100,027 (88.9%)

Total basepairs processed:    57,365,310 bp
  Read 1:    34,306,705 bp
  Read 2:    23,058,605 bp
Quality-trimmed:                       0 bp (0.0%)
  Read 1:             0 bp
  Read 2:             0 bp
Total written (filtered):     46,631,078 bp (81.3%)
  Read 1:    28,461,891 bp
  Read 2:    18,169,187 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 109357 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	1	1.7	0	1
9	1	0.4	0	1
10

Finished in 0.486 s (4.318 µs/read; 13.90 M reads/minute).

=== Summary ===

Total read pairs processed:            112,618
  Read 1 with adapter:                 109,503 (97.2%)
  Read 2 with adapter:                 102,994 (91.5%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           12,401 (11.0%)
Pairs written (passing filters):       100,217 (89.0%)

Total basepairs processed:    57,435,180 bp
  Read 1:    34,348,490 bp
  Read 2:    23,086,690 bp
Quality-trimmed:                       0 bp (0.0%)
  Read 1:             0 bp
  Read 2:             0 bp
Total written (filtered):     46,718,895 bp (81.3%)
  Read 1:    28,515,186 bp
  Read 2:    18,203,709 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 109503 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
10	1	0.1	0	1
11	2	0.0	0	2


Finished in 0.588 s (4.275 µs/read; 14.04 M reads/minute).

=== Summary ===

Total read pairs processed:            137,450
  Read 1 with adapter:                 133,616 (97.2%)
  Read 2 with adapter:                 125,984 (91.7%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           14,914 (10.9%)
Pairs written (passing filters):       122,536 (89.1%)

Total basepairs processed:    70,099,500 bp
  Read 1:    41,922,250 bp
  Read 2:    28,177,250 bp
Quality-trimmed:                       0 bp (0.0%)
  Read 1:             0 bp
  Read 2:             0 bp
Total written (filtered):     57,126,283 bp (81.5%)
  Read 1:    34,867,786 bp
  Read 2:    22,258,497 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 133616 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	2	2.1	0	2
9	1	0.5	0	1
10

Finished in 1.449 s (3.897 µs/read; 15.40 M reads/minute).

=== Summary ===

Total read pairs processed:            371,795
  Read 1 with adapter:                 361,865 (97.3%)
  Read 2 with adapter:                 340,718 (91.6%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           40,027 (10.8%)
Pairs written (passing filters):       331,768 (89.2%)

Total basepairs processed:   189,615,450 bp
  Read 1:   113,397,475 bp
  Read 2:    76,217,975 bp
Quality-trimmed:                       0 bp (0.0%)
  Read 1:             0 bp
  Read 2:             0 bp
Total written (filtered):    154,669,634 bp (81.6%)
  Read 1:    94,405,636 bp
  Read 2:    60,263,998 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 361865 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	1	5.7	0	1
9	5	1.4	0	5
10

Finished in 1.377 s (3.985 µs/read; 15.06 M reads/minute).

=== Summary ===

Total read pairs processed:            345,448
  Read 1 with adapter:                 336,214 (97.3%)
  Read 2 with adapter:                 317,650 (92.0%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           36,167 (10.5%)
Pairs written (passing filters):       309,281 (89.5%)

Total basepairs processed:   176,178,480 bp
  Read 1:   105,361,640 bp
  Read 2:    70,816,840 bp
Quality-trimmed:                       0 bp (0.0%)
  Read 1:             0 bp
  Read 2:             0 bp
Total written (filtered):    144,189,236 bp (81.8%)
  Read 1:    88,006,094 bp
  Read 2:    56,183,142 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 336214 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
8	1	5.3	0	1
9	3	1.3	0	3
10

Finished in 0.504 s (4.207 µs/read; 14.26 M reads/minute).

=== Summary ===

Total read pairs processed:            119,832
  Read 1 with adapter:                 116,476 (97.2%)
  Read 2 with adapter:                 110,758 (92.4%)

== Read fate breakdown ==
Pairs that were too short:                   0 (0.0%)
Pairs discarded as untrimmed:           12,111 (10.1%)
Pairs written (passing filters):       107,721 (89.9%)

Total basepairs processed:    61,114,320 bp
  Read 1:    36,548,760 bp
  Read 2:    24,565,560 bp
Quality-trimmed:                       0 bp (0.0%)
  Read 1:             0 bp
  Read 2:             0 bp
Total written (filtered):     50,218,813 bp (82.2%)
  Read 1:    30,651,862 bp
  Read 2:    19,566,951 bp

=== First read: Adapter 1 ===

Sequence: CCTACGGGRSGCAGCAG; Type: regular 5'; Length: 17; Trimmed: 116476 times

Minimum overlap: 8
No. of allowed errors:
1-17 bp: 0

Overview of removed sequences
length	count	expect	max.err	error counts
9	2	0.5	0	2
11	3	0.0	0	3
1

  context['result_data'] = context['result_data'].append(df)


  context['result_data'] = context['result_data'].append(df)


  context['result_data'] = context['result_data'].append(df)


  context['result_data'] = context['result_data'].append(df)
