# Prepare data for Pipeline

## Setup and settings

In [1]:
# Importing packages
import os
import yaml
import logging
from Bio.Seq import Seq
from qiime2 import Artifact, Visualization
from qiime2.plugins import demux
from qiime2.plugins.cutadapt.methods import trim_paired

from utils import *

### Receiving the parameters

The following cell can receive parameters using the [papermill](https://papermill.readthedocs.io/en/latest/) tool.

In [2]:
# Default parameter values for this notebook; papermill can override any of
# them at run time.

# Path of a YAML parameters file. Not read by the visible cells: the
# development cell below sets its own path before opening the file.
params_path = os.path.join('..', 'params', 'ana-flavia-superlactacao.yaml')
# Name of the experiment; used as the folder name under <base_dir>/experiments.
experiment_name = 'jenneffer-vs-01'
# Root folder under which the experiment folders are created.
base_dir = os.path.join('/', 'home', 'lauro', 'nupeb', 'redemicro')
# Manifest file handed to Artifact.import_data to locate the fastq files.
manifest_file = os.path.join(base_dir, 'data', 'raw', 'manifest', 'not-hist-vs-manifest.csv')
# Image folder; re-assigned to an experiment-specific path in the
# "Defining names and paths" cell.
img_folder = os.path.abspath(os.path.join(base_dir, 'imgs'))
# When True, existing .qza/.qzv files are regenerated instead of being loaded.
replace_files = False
# Primer-trimming settings. A falsy value skips the cutadapt step; otherwise a
# dict with the keys 'forward_primer', 'reverse_primer' and 'overlap' is expected.
trim = None

In [3]:
# Parameters
# Run-specific values that override the defaults declared in the previous cell
# (presumably injected by papermill — confirm against the pipeline runner).
experiment_name = "ana-flavia-NCxNR-trim"
base_dir = "/home/lauro/nupeb/rede-micro/redemicro-ana-flavia-nutri"
manifest_file = "/home/lauro/nupeb/rede-micro/redemicro-ana-flavia-nutri/data/raw/manifest/manifest-ana-flavia-NCxNR.csv"
# NOTE(review): metadata_file, class_col, classifier_file, phred, trunc_f,
# trunc_r and overlap are not used by the visible cells of this notebook;
# presumably they are consumed by later pipeline steps — confirm.
metadata_file = "/home/lauro/nupeb/rede-micro/redemicro-ana-flavia-nutri/data/raw/metadata/metadata-ana-flavia-NCxNR.tsv"
class_col = "group-id"
classifier_file = "/home/lauro/nupeb/dados_brutos_rede_genoma/16S_classifiers_qiime2/silva-138-99-nb-classifier.qza"
replace_files = False
phred = 20
trunc_f = 0
trunc_r = 0
overlap = 12
# Passed to cutadapt as the number of cores in the trimming cell.
threads = 6
# Primer-trimming settings read by the cutadapt cell. Note that this
# "overlap" (minimum primer/read overlap for cutadapt) is a separate value
# from the top-level `overlap` parameter above.
trim = {
    "overlap": 8,
    "forward_primer": "CCTACGGGRSGCAGCAG",
    "reverse_primer": "GGACTACHVGGGTWTCTAAT",
}


In [4]:
# Hard-coded parameters for development and debugging.
# With `production = True` this cell is a no-op. When set to False, the
# experiment name, base folder, manifest path and replace flag are re-read from
# a fixed YAML file; every other parameter (e.g. `trim`) keeps its current value.
production = True
if not production:
    params_path = os.path.join('..', 'params', 'ana-flavia-hipotese-01')
    # The default `params_path` of this notebook carries a '.yaml' suffix while
    # this one does not; fall back to the suffixed name when the bare name is
    # not an existing file, so either naming works.
    if not os.path.isfile(params_path) and os.path.isfile(params_path + '.yaml'):
        params_path += '.yaml'
    with open(params_path, 'r') as stream:
        params = yaml.safe_load(stream)
    # The file can be closed as soon as the YAML is parsed; a missing key
    # raises KeyError so an incomplete params file is noticed immediately.
    experiment_name = params['experiment_name']
    base_dir = params['base_dir']
    manifest_file = params['manifest_file']
    replace_files = params['replace_files']

### Defining names and paths

In [5]:
# Disabled one-off helper, kept for reference only (nothing in this cell runs).
# It rewrites a manifest whose data lines are "sample-id,forward,reverse" into
# a manifest with one line per read file and the header
# "sample-id,absolute-filepath,direction", skipping the original header line.
#
# new_manifest = '/home/lauro/nupeb/redemicro/data/raw/manifest/karina-manifest.csv'
# with open(manifest_file, 'r') as oldm, open(new_manifest, 'w') as newm:
#     header = 'sample-id,absolute-filepath,direction\n'
#     newm.write(header)
#     for line in oldm.readlines()[1:]:
#         sid, forward, reverse = line[:-1].split(',')
#         fline = ','.join((sid, forward, 'forward')) + '\n'
#         rline = ','.join((sid, reverse, 'reverse')) + '\n'
#         newm.write(fline)
#         newm.write(rline)

In [6]:
# Experiment-specific output folders: one for the QIIME 2 artifacts and one
# (absolute path) for the generated images.
out_dir = os.path.join(base_dir, 'experiments', experiment_name, 'qiime-artifacts')
img_folder = os.path.abspath(os.path.join(base_dir, 'experiments', experiment_name, 'imgs'))

# Full paths of the artifacts (.qza) and visualizations (.qzv) written by this
# notebook, before and after primer trimming.
demux_file = os.path.join(out_dir, 'demux-paired.qza')
demux_view = os.path.join(out_dir, 'demux-paired.qzv')
demux_file_trim = os.path.join(out_dir, 'demux-paired-trim.qza')
demux_view_trim = os.path.join(out_dir, 'demux-paired-trim.qzv')

# Create the artifacts folder if it does not exist yet
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
    print(f'New qiime-artifacts folder path created: {out_dir}')

# Create the images folder if it does not exist yet
if not os.path.isdir(img_folder):
    os.makedirs(img_folder)
    print(f'New img folder path created: {img_folder}')

## Step execution

This step imports all `fastq` files listed in the manifest into a **QIIME2 Artifact** object of type `SampleData[PairedEndSequencesWithQuality]` and saves it to a new `qza` file.

In [7]:
# Reuse the saved demux artifact when it exists and no replacement was requested;
# otherwise import the reads listed in the manifest and save them.
if os.path.isfile(demux_file) and not replace_files:
    artifact = Artifact.load(demux_file)
else:
    # Build the artifact from the paired-end (Phred33) manifest
    artifact = Artifact.import_data(
        'SampleData[PairedEndSequencesWithQuality]',
        manifest_file,
        view_type='PairedEndFastqManifestPhred33',
    )

    # Write the artifact to the qza file so later runs can load it
    artifact.save(demux_file)

In [8]:
# Reuse the saved visualization when it exists and no replacement was requested;
# otherwise summarize the artifact and save the result.
if os.path.isfile(demux_view) and not replace_files:
    demux_view_obj = Visualization.load(demux_view)
else:
    # Generate a visualization of the artifact
    demux_summary = demux.visualizers.summarize(artifact)
    demux_view_obj = demux_summary.visualization

    # Write the visualization to the qzv file
    demux_view_obj.save(demux_view)

## Step report


In [9]:
# Print the short text form of the visualization (its type and UUID)
print(demux_view_obj)

<visualization: Visualization uuid: 64b9840f-fbb2-40f6-9010-834cea5a6ed5>


In [10]:
# Render the visualization: leaving the object as the last expression of the
# cell lets the notebook display it
demux_view_obj

[cutadapt](https://docs.qiime2.org/2022.2/plugins/available/cutadapt/trim-paired/)

We trim the forward primer and the reverse complement of the reverse primer from the forward reads. We trim the reverse primer and reverse complement of the forward primer from the reverse reads.

In [11]:
# Primer trimming with cutadapt; skipped entirely when `trim` is falsy.
# `trim` must provide the keys 'forward_primer', 'reverse_primer' and 'overlap'.
if trim:
    # trim_paired takes lists of sequences, so each primer is wrapped in a
    # single-element list.
    forward_primer = [trim['forward_primer']]  # e.g. ['CCTACGGGRSGCAGCAG']
    reverse_primer = [trim['reverse_primer']]  # e.g. ['GGACTACHVGGGTWTCTAAT']
    forward_reverse_complement = [str(Seq(forward_primer[0]).reverse_complement())]
    reverse_reverse_complement = [str(Seq(reverse_primer[0]).reverse_complement())]

    # Forward reads: forward primer as `front_f`, reverse complement of the
    # reverse primer as `adapter_f`. Reverse reads: the mirror image.
    res = trim_paired(
        demultiplexed_sequences=artifact,
        front_f=forward_primer,
        front_r=reverse_primer,
        adapter_f=reverse_reverse_complement,
        adapter_r=forward_reverse_complement,
        cores=threads,
        overlap=trim['overlap'],
        indels=False,
        match_read_wildcards=True,
        match_adapter_wildcards=True,
        error_rate=0.15,
        # Read pairs in which no primer was found are dropped from the output
        discard_untrimmed=True,
    ).trimmed_sequences

    # `demux_file_trim` and `demux_view_trim` are already defined in the
    # "Defining names and paths" cell, so they are not re-assigned here.
    res.save(demux_file_trim)

    # Summarize the trimmed reads once and reuse the result. A bare expression
    # inside an `if` block is not rendered by the notebook, so computing the
    # summary a second time only repeated the work.
    demux_view_trim_obj = demux.visualizers.summarize(res).visualization
    demux_view_trim_obj.save(demux_view_trim)

Running external command line application. This may print messages to stdout and/or stderr.
The commands to be run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: cutadapt --cores 6 --error-rate 0.15 --times 1 --overlap 8 --minimum-length 1 -o /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-4urj2e_f/210421121673_0_L001_R1_001.fastq.gz -p /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-4urj2e_f/210421121673_1_L001_R2_001.fastq.gz --adapter ATTAGAWACCCBDGTAGTCC --front CCTACGGGRSGCAGCAG -A CTGCTGCSYCCCGTAGG -G GGACTACHVGGGTWTCTAAT --no-indels --match-read-wildcards --discard-untrimmed /tmp/qiime2-archive-hmux4h22/89488938-d969-489e-a3fa-6dd23632cd28/data/210421121673_0_L001_R1_001.fastq.gz /tmp/qiime2-archive-hmux4h22/89488938-d969-489e-a3fa-6dd23632cd28/data/210421121673_1_L001_R2_001.fastq.gz



This is cutadapt 3.5 with Python 3.8.12
Command line parameters: --cores 6 --error-rate 0.15 --times 1 --overlap 8 --minimum-length 1 -o /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-4urj2e_f/210421121673_0_L001_R1_001.fastq.gz -p /tmp/q2-CasavaOneEightSingleLanePerSampleDirFmt-4urj2e_f/210421121673_1_L001_R2_001.fastq.gz --adapter ATTAGAWACCCBDGTAGTCC --front CCTACGGGRSGCAGCAG -A CTGCTGCSYCCCGTAGG -G GGACTACHVGGGTWTCTAAT --no-indels --match-read-wildcards --discard-untrimmed /tmp/qiime2-archive-hmux4h22/89488938-d969-489e-a3fa-6dd23632cd28/data/210421121673_0_L001_R1_001.fastq.gz /tmp/qiime2-archive-hmux4h22/89488938-d969-489e-a3fa-6dd23632cd28/data/210421121673_1_L001_R2_001.fastq.gz
Processing reads on 6 cores in paired-end mode ...


Finished in 5.47 s (15 µs/read; 4.06 M reads/minute).

=== Summary ===

Total read pairs processed:            370,042
  Read 1 with adapter:                 366,460 (99.0%)
  Read 2 with adapter:                 362,566 (98.0%)

== Read fate breakdown ==
Pairs that were too short:                  10 (0.0%)
Pairs discarded as untrimmed:           10,985 (3.0%)
Pairs written (passing filters):       359,047 (97.0%)

Total basepairs processed:   188,721,420 bp
  Read 1:   112,862,810 bp
  Read 2:    75,858,610 bp
Total written (filtered):    167,383,121 bp (88.7%)
  Read 1:   102,168,436 bp
  Read 2:    65,214,685 bp

=== First read: Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 39 times

Minimum overlap: 8
No. of allowed errors:
1-5 bp: 0; 6-12 bp: 1; 13-19 bp: 2; 20 bp: 3

Bases preceding removed adapters:
  A: 0.0%
  C: 38.5%
  G: 61.5%
  T: 0.0%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
8	7	5.6	

Finished in 7.65 s (14 µs/read; 4.39 M reads/minute).

=== Summary ===

Total read pairs processed:            559,807
  Read 1 with adapter:                 554,474 (99.0%)
  Read 2 with adapter:                 548,879 (98.0%)

== Read fate breakdown ==
Pairs that were too short:                   8 (0.0%)
Pairs discarded as untrimmed:           16,102 (2.9%)
Pairs written (passing filters):       543,697 (97.1%)

Total basepairs processed:   285,501,570 bp
  Read 1:   170,741,135 bp
  Read 2:   114,760,435 bp
Total written (filtered):    253,467,691 bp (88.8%)
  Read 1:   154,706,569 bp
  Read 2:    98,761,122 bp

=== First read: Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 163 times

Minimum overlap: 8
No. of allowed errors:
1-5 bp: 0; 6-12 bp: 1; 13-19 bp: 2; 20 bp: 3

Bases preceding removed adapters:
  A: 0.0%
  C: 28.8%
  G: 69.9%
  T: 0.0%
  none/other: 1.2%

Overview of removed sequences
length	count	expect	max.err	error counts
8	12	8.

Finished in 4.95 s (14 µs/read; 4.19 M reads/minute).

=== Summary ===

Total read pairs processed:            345,762
  Read 1 with adapter:                 342,549 (99.1%)
  Read 2 with adapter:                 339,148 (98.1%)

== Read fate breakdown ==
Pairs that were too short:                   3 (0.0%)
Pairs discarded as untrimmed:            9,768 (2.8%)
Pairs written (passing filters):       335,991 (97.2%)

Total basepairs processed:   176,338,620 bp
  Read 1:   105,457,410 bp
  Read 2:    70,881,210 bp
Total written (filtered):    156,638,059 bp (88.8%)
  Read 1:    95,606,276 bp
  Read 2:    61,031,783 bp

=== First read: Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 67 times

Minimum overlap: 8
No. of allowed errors:
1-5 bp: 0; 6-12 bp: 1; 13-19 bp: 2; 20 bp: 3

Bases preceding removed adapters:
  A: 0.0%
  C: 17.9%
  G: 80.6%
  T: 1.5%
  none/other: 0.0%
    The adapter is preceded by 'G' extremely often.
    The provided adapter seq

Finished in 9.80 s (14 µs/read; 4.36 M reads/minute).

=== Summary ===

Total read pairs processed:            712,750
  Read 1 with adapter:                 706,137 (99.1%)
  Read 2 with adapter:                 700,151 (98.2%)

== Read fate breakdown ==
Pairs that were too short:                   8 (0.0%)
Pairs discarded as untrimmed:           19,096 (2.7%)
Pairs written (passing filters):       693,646 (97.3%)

Total basepairs processed:   363,502,500 bp
  Read 1:   217,388,750 bp
  Read 2:   146,113,750 bp
Total written (filtered):    323,379,504 bp (89.0%)
  Read 1:   197,382,010 bp
  Read 2:   125,997,494 bp

=== First read: Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 88 times

Minimum overlap: 8
No. of allowed errors:
1-5 bp: 0; 6-12 bp: 1; 13-19 bp: 2; 20 bp: 3

Bases preceding removed adapters:
  A: 0.0%
  C: 33.0%
  G: 67.0%
  T: 0.0%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
8	13	10.

Finished in 5.26 s (14 µs/read; 4.17 M reads/minute).

=== Summary ===

Total read pairs processed:            365,427
  Read 1 with adapter:                 362,073 (99.1%)
  Read 2 with adapter:                 358,323 (98.1%)

== Read fate breakdown ==
Pairs that were too short:                   8 (0.0%)
Pairs discarded as untrimmed:           10,379 (2.8%)
Pairs written (passing filters):       355,040 (97.2%)

Total basepairs processed:   186,367,770 bp
  Read 1:   111,455,235 bp
  Read 2:    74,912,535 bp
Total written (filtered):    165,521,152 bp (88.8%)
  Read 1:   101,031,128 bp
  Read 2:    64,490,024 bp

=== First read: Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 30 times

Minimum overlap: 8
No. of allowed errors:
1-5 bp: 0; 6-12 bp: 1; 13-19 bp: 2; 20 bp: 3

Bases preceding removed adapters:
  A: 0.0%
  C: 10.0%
  G: 90.0%
  T: 0.0%
  none/other: 0.0%
    The adapter is preceded by 'G' extremely often.
    The provided adapter seq

Finished in 1.49 s (15 µs/read; 3.87 M reads/minute).

=== Summary ===

Total read pairs processed:             95,852
  Read 1 with adapter:                  94,937 (99.0%)
  Read 2 with adapter:                  93,876 (97.9%)

== Read fate breakdown ==
Pairs that were too short:                   1 (0.0%)
Pairs discarded as untrimmed:            2,858 (3.0%)
Pairs written (passing filters):        92,993 (97.0%)

Total basepairs processed:    48,884,520 bp
  Read 1:    29,234,860 bp
  Read 2:    19,649,660 bp
Total written (filtered):     43,353,503 bp (88.7%)
  Read 1:    26,461,168 bp
  Read 2:    16,892,335 bp

=== First read: Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 54 times

Minimum overlap: 8
No. of allowed errors:
1-5 bp: 0; 6-12 bp: 1; 13-19 bp: 2; 20 bp: 3

Bases preceding removed adapters:
  A: 1.9%
  C: 61.1%
  G: 37.0%
  T: 0.0%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
8	16	1.5

Finished in 4.65 s (15 µs/read; 4.09 M reads/minute).

=== Summary ===

Total read pairs processed:            316,849
  Read 1 with adapter:                 313,871 (99.1%)
  Read 2 with adapter:                 310,260 (97.9%)

== Read fate breakdown ==
Pairs that were too short:                   3 (0.0%)
Pairs discarded as untrimmed:            9,492 (3.0%)
Pairs written (passing filters):       307,354 (97.0%)

Total basepairs processed:   161,592,990 bp
  Read 1:    96,638,945 bp
  Read 2:    64,954,045 bp
Total written (filtered):    143,285,520 bp (88.7%)
  Read 1:    87,458,002 bp
  Read 2:    55,827,518 bp

=== First read: Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 63 times

Minimum overlap: 8
No. of allowed errors:
1-5 bp: 0; 6-12 bp: 1; 13-19 bp: 2; 20 bp: 3

Bases preceding removed adapters:
  A: 0.0%
  C: 14.3%
  G: 84.1%
  T: 1.6%
  none/other: 0.0%
    The adapter is preceded by 'G' extremely often.
    The provided adapter seq

Finished in 3.58 s (14 µs/read; 4.17 M reads/minute).

=== Summary ===

Total read pairs processed:            248,647
  Read 1 with adapter:                 246,218 (99.0%)
  Read 2 with adapter:                 243,214 (97.8%)

== Read fate breakdown ==
Pairs that were too short:                   8 (0.0%)
Pairs discarded as untrimmed:            7,759 (3.1%)
Pairs written (passing filters):       240,880 (96.9%)

Total basepairs processed:   126,809,970 bp
  Read 1:    75,837,335 bp
  Read 2:    50,972,635 bp
Total written (filtered):    112,280,927 bp (88.5%)
  Read 1:    68,531,695 bp
  Read 2:    43,749,232 bp

=== First read: Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 113 times

Minimum overlap: 8
No. of allowed errors:
1-5 bp: 0; 6-12 bp: 1; 13-19 bp: 2; 20 bp: 3

Bases preceding removed adapters:
  A: 3.5%
  C: 42.5%
  G: 44.2%
  T: 9.7%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
8	21	3.

Finished in 2.95 s (14 µs/read; 4.22 M reads/minute).

=== Summary ===

Total read pairs processed:            207,454
  Read 1 with adapter:                 205,489 (99.1%)
  Read 2 with adapter:                 203,168 (97.9%)

== Read fate breakdown ==
Pairs that were too short:                   4 (0.0%)
Pairs discarded as untrimmed:            6,212 (3.0%)
Pairs written (passing filters):       201,238 (97.0%)

Total basepairs processed:   105,801,540 bp
  Read 1:    63,273,470 bp
  Read 2:    42,528,070 bp
Total written (filtered):     93,813,343 bp (88.7%)
  Read 1:    57,261,839 bp
  Read 2:    36,551,504 bp

=== First read: Adapter 1 ===

Sequence: ATTAGAWACCCBDGTAGTCC; Type: regular 3'; Length: 20; Trimmed: 53 times

Minimum overlap: 8
No. of allowed errors:
1-5 bp: 0; 6-12 bp: 1; 13-19 bp: 2; 20 bp: 3

Bases preceding removed adapters:
  A: 3.8%
  C: 47.2%
  G: 49.1%
  T: 0.0%
  none/other: 0.0%

Overview of removed sequences
length	count	expect	max.err	error counts
8	12	3.2