# Prepare data for Pipeline

## Setup and settings

In [1]:
# Importing packages
import os
import yaml
import logging
import pandas as pd
from Bio.Seq import Seq
from qiime2 import Artifact, Visualization
from qiime2.plugins import demux
from qiime2.plugins.cutadapt.methods import trim_paired

### Receiving the parameters

The following cell can receive parameters using the [papermill](https://papermill.readthedocs.io/en/latest/) tool.

In [None]:
# Default parameter values; papermill may override any of them at run time.
params_path = os.path.join('..', 'params', 'ana-flavia-superlactacao.yaml')
experiment_name = 'jenneffer-vs-01'
base_dir = os.path.join('/', 'home', 'lauro', 'nupeb', 'redemicro')
manifest_file = os.path.join(base_dir, 'data', 'raw', 'manifest', 'not-hist-vs-manifest.csv')
img_folder = os.path.abspath(os.path.join(base_dir, 'imgs'))
# When True, existing artifact/visualization files are regenerated
replace_files = False
# Falsy value skips primer trimming; otherwise a mapping that provides the
# 'forward_primer', 'reverse_primer' and 'overlap' entries read by the trim step
trim = None
# Number of cores passed to cutadapt in the trim step. A default is needed
# here: without it the name is undefined unless it is injected from outside.
threads = 1

In [None]:
# Development/debugging override: set `production` to False to replace the
# received parameters with the values stored in a local YAML file.
production = True
if not production:
    # NOTE(review): unlike the default `params_path`, this path carries no
    # '.yaml' extension -- confirm the actual file name.
    params_path = os.path.join('..', 'params', 'ana-flavia-hipotese-01')
    with open(params_path, 'r') as stream:
        params = yaml.safe_load(stream)

    # The YAML content is fully parsed, so the file can already be closed
    # before the individual settings are read.
    experiment_name = params['experiment_name']
    base_dir = params['base_dir']
    manifest_file = params['manifest_file']
    replace_files = params['replace_files']

### Defining names and paths

In [None]:
# Disabled one-off helper (kept for reference, never executed).
# It rewrites `manifest_file`, whose rows hold three comma-separated fields
# (sample id, forward path, reverse path), into a new manifest with the header
# 'sample-id,absolute-filepath,direction': each input row becomes one
# 'forward' line and one 'reverse' line. The first input line is skipped.
#
# new_manifest = '/home/lauro/nupeb/redemicro/data/raw/manifest/karina-manifest.csv'
# with open(manifest_file, 'r') as oldm, open(new_manifest, 'w') as newm:
#     header = 'sample-id,absolute-filepath,direction\n'
#     newm.write(header)
#     for line in oldm.readlines()[1:]:
#         sid, forward, reverse = line[:-1].split(',')
#         fline = ','.join((sid, forward, 'forward')) + '\n'
#         rline = ','.join((sid, reverse, 'reverse')) + '\n'
#         newm.write(fline)
#         newm.write(rline)

In [None]:
# Folders that receive the QIIME 2 artifacts and the images of this experiment.
# NOTE(review): this overwrites the `img_folder` value set in the parameters
# cell -- confirm that ignoring the received value is intended.
out_dir = os.path.join(base_dir, 'experiments', experiment_name, 'qiime-artifacts')
img_folder = os.path.abspath(os.path.join(base_dir, 'experiments', experiment_name, 'imgs'))

# Create the artifacts folder when it does not exist yet
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)
    print(f'New qiime-artifacts folder path created: {out_dir}')

# Create the images folder when it does not exist yet
if not os.path.isdir(img_folder):
    os.makedirs(img_folder)
    print(f'New img folder path created: {img_folder}')

# Full paths of the raw and primer-trimmed artifacts (.qza) and of their
# visualizations (.qzv), all stored inside `out_dir`
demux_file, demux_view, demux_file_trim, demux_view_trim = (
    os.path.join(out_dir, file_name)
    for file_name in (
        'demux-paired.qza',
        'demux-paired.qzv',
        'demux-paired-trim.qza',
        'demux-paired-trim.qzv',
    )
)

## Step execution

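This step imports all `fastq` files listed in the manifest into a **QIIME2 Artifact** object and saves it to a new `qza` file of type `SampleData[PairedEndSequencesWithQuality]` (or `SampleData[SequencesWithQuality]` when the manifest contains a single read direction).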

In [None]:
# Read the manifest and count the distinct values of its 'direction' column
# to decide between a single-end and a paired-end import.
manifest_df = pd.read_csv(manifest_file)
n_directions = len(manifest_df['direction'].unique())
if n_directions == 1:
    # One direction only: single-end reads
    d_type = 'SampleData[SequencesWithQuality]'
    v_type = 'SingleEndFastqManifestPhred33'
elif n_directions == 2:
    # Two directions: paired-end reads
    d_type = 'SampleData[PairedEndSequencesWithQuality]'
    v_type = 'PairedEndFastqManifestPhred33'
else:
    # Stop here: printing only would leave `d_type`/`v_type` undefined and the
    # failure would resurface later as an unrelated NameError.
    raise ValueError(f'ERROR: invalid number of directions {n_directions}')

In [None]:
# Reuse the stored artifact when it is already on disk and no replacement was
# requested; otherwise import the reads again and store the result.
if os.path.isfile(demux_file) and not replace_files:
    artifact = Artifact.load(demux_file)
else:
    # Build the artifact from the files listed in the manifest
    artifact = Artifact.import_data(d_type, manifest_file, view_type=v_type)

    # Persist it as a qza file
    artifact.save(demux_file)

In [None]:
# Reuse the stored visualization when it is already on disk and no replacement
# was requested; otherwise summarize the artifact and store the result.
if os.path.isfile(demux_view) and not replace_files:
    demux_view_obj = Visualization.load(demux_view)
else:
    # Generate a summary visualization of the artifact
    demux_summary = demux.visualizers.summarize(artifact)

    # Write the visualization to its qzv file
    demux_summary.visualization.save(demux_view)

    demux_view_obj = demux_summary.visualization

## Step report


In [None]:
# Print the plain-text representation of the visualization object
print(demux_view_obj)

In [None]:
# Render Visualization: as the last expression of the cell, the object is
# displayed by the notebook itself
demux_view_obj

[cutadapt](https://docs.qiime2.org/2022.2/plugins/available/cutadapt/trim-paired/)

We trim the forward primer and the reverse complement of the reverse primer from the forward reads. We trim the reverse primer and reverse complement of the forward primer from the reverse reads.

In [None]:
# Output paths of the primer-trimmed sequences and of their summary
demux_file_trim = os.path.join(out_dir, 'demux-paired-trim.qza')
demux_view_trim = os.path.join(out_dir, 'demux-paired-trim.qzv')

# Trimming runs only when a `trim` configuration was supplied and the trimmed
# artifact is missing (or existing files must be replaced).
if trim and (not os.path.isfile(demux_file_trim) or replace_files):
    forward_primer = [trim['forward_primer']]  # e.g. ['CCTACGGGRSGCAGCAG']
    reverse_primer = [trim['reverse_primer']]  # e.g. ['GGACTACHVGGGTWTCTAAT']
    forward_reverse_complement = [str(Seq(forward_primer[0]).reverse_complement())]
    reverse_reverse_complement = [str(Seq(reverse_primer[0]).reverse_complement())]

    # Use the `threads` value when one is defined in the notebook namespace;
    # otherwise fall back to a single core instead of failing with a NameError.
    n_cores = globals().get('threads', 1)

    # Forward reads: forward primer at the 5' end and reverse complement of the
    # reverse primer at the 3' end. Reverse reads: the mirrored pair.
    res = trim_paired(
        demultiplexed_sequences=artifact,
        front_f=forward_primer,
        front_r=reverse_primer,
        adapter_f=reverse_reverse_complement,
        adapter_r=forward_reverse_complement,
        cores=n_cores,
        overlap=trim['overlap'],
        indels=False,
        match_read_wildcards=True,
        match_adapter_wildcards=True,
        error_rate=0.15,
        discard_untrimmed=True,
    ).trimmed_sequences

    # Store the trimmed sequences
    res.save(demux_file_trim)

    # Summarize the trimmed sequences once and store that visualization
    trim_summary = demux.visualizers.summarize(res)
    trim_summary.visualization.save(demux_view_trim)