In [2]:
import os
import sys
import pandas as pd
import yaml

root_path = os.path.abspath(os.path.join(os.getcwd()))

In [15]:
# PARAMS

# New base forlder with fastq files
fastq_folder = os.path.join(os.sep, 'mnt', 'nupeb', 'rede-micro', 'datasets', 'karina-16S', 'datasets', 'decompressed_reads')

# YAML file with parameters to use as template
params_template = os.path.join(root_path, 'params', 'ana-flavia-all.yaml')

# Classifier QZA path
classifier_path = os.path.join(os.sep,'mnt','nupeb','rede-micro','datasets','16S_classifiers_qiime2','silva-138-99-nb-classifier.qza')

# Groups to be selected
selected_groups = ('STD-NC', 'STD-NR', 'HSD-NC', 'HSD-NR') # ('NC', 'NR', 'STD-NC', 'STD-NR', 'HSD-NC', 'HSD-NR')

selected_txt = 'x'.join(selected_groups)
print(f'Selected groups: {selected_txt}')

Selected groups: STD-NCxSTD-NRxHSD-NCxHSD-NR


In [4]:
!ls {params_template}

/mnt/nupeb/rede-micro/redemicro-ana-flavia-nutri/params/ana-flavia-all.yaml


In [5]:
!ls /mnt/nupeb/rede-micro/

andressa_lbtm
bkp_andressa
datasets
esther_download.ipynb
fastqs
hfd-mv
microbiom
qiime2-2021.11-py38-linux-conda.yml
qiime2-2022.2-py38-linux-conda.yml
qiime2-2023.5-py38-linux-conda.yml
qiime2-amplicon-2024.2-py38-linux-conda.yml
qiime2-classifiers
redemicro-ana-flavia-nutri
redemicro-jennefer
redemicro-miliane-nutri
redemicro-thayane


In [6]:
# Get metadata file
metadata_path = os.path.join(root_path, 'data', 'raw', 'metadata-all-ana.tsv')
metadata_df = pd.read_csv(metadata_path, sep='\t')
groups_ids = metadata_df['group-id'].unique().tolist()

print(f'ALL GROUPS: {groups_ids}')

ALL GROUPS: ['NC', 'NR', 'STD-NC', 'STD-NR', 'HSD-NC', 'HSD-NR']


In [7]:
mask = metadata_df['group-id'].isin(selected_groups)
new_metadata_df = metadata_df.loc[mask, :].reset_index(drop=True)
sample_ids = new_metadata_df['sample-id']

new_metadata_df.head()

Unnamed: 0,sample-id,sample-name,group-id,group-desc
0,S210421121682,N7.01,STD-NC,Grupo dieta padrão
1,S210421121683,N7.02,STD-NC,Grupo dieta padrão
2,S210421121684,N8.01,STD-NC,Grupo dieta padrão
3,S210421121685,N7.03,STD-NR,Grupo dieta padrão
4,S210421121686,N7.04,STD-NR,Grupo dieta padrão


In [8]:
# Get manifest
manifest_path = os.path.join(root_path, 'data', 'raw', 'manifest-all-ana.csv')
manifest_df = pd.read_csv(manifest_path)
file_names = manifest_df.loc[:, 'absolute-filepath'].str.split('/').str[-1]
new_paths = fastq_folder + os.path.sep + file_names

In [9]:
manifest_df['absolute-filepath'] = new_paths
manifest_df

Unnamed: 0,sample-id,absolute-filepath,direction
0,S210421121673,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,forward
1,S210421121673,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,reverse
2,S210421121674,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,forward
3,S210421121674,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,reverse
4,S210421121675,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,forward
...,...,...,...
143,S210707163915,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,reverse
144,S210707163916,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,forward
145,S210707163916,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,reverse
146,S210707163917,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,forward


In [10]:
new_manifest_df = manifest_df[manifest_df['sample-id'].isin(sample_ids)].reset_index(drop=True)
new_manifest_df.head()

Unnamed: 0,sample-id,absolute-filepath,direction
0,S210421121682,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,forward
1,S210421121682,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,reverse
2,S210421121683,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,forward
3,S210421121683,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,reverse
4,S210421121684,/mnt/nupeb/rede-micro/datasets/karina-16S/data...,forward


In [11]:
output_folder = os.path.join(os.path.dirname(manifest_path), 'selected_groups')

if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

new_manifest_path = os.path.join(output_folder, f'manifest_{selected_txt}.csv')
new_metadata_path = os.path.join(output_folder, f'metadata_{selected_txt}.tsv')

# Write the new manifest file
new_manifest_df.to_csv(new_manifest_path, index=False)

# Write the new metadata file
new_metadata_df.to_csv(new_metadata_path, index=False, sep='\t')

In [12]:
with open(params_template, 'r') as file:
    params = yaml.safe_load(file)
params

{'experiment_name': 'ana-flavia-all',
 'base_dir': '/mnt/nupeb/rede-micro/redemicro-ana-flavia-nutri',
 'manifest_file': '/mnt/nupeb/rede-micro/redemicro-ana-flavia-nutri/data/raw/manifest-all-ana.csv',
 'metadata_file': '/mnt/nupeb/rede-micro/redemicro-ana-flavia-nutri/data/raw/metadata-all-ana.tsv',
 'class_col': 'group-id',
 'classifier_file': '/mnt/nupeb/rede-micro/datasets/16S_classifiers_qiime2/silva-138-99-nb-classifier.qza',
 'top_n': 20,
 'replace_files': False,
 'phred': 20,
 'trunc_f': 0,
 'trunc_r': 0,
 'overlap': 12,
 'threads': 6,
 'trim': {'overlap': 8,
  'forward_primer': 'CCTACGGGRSGCAGCAG',
  'reverse_primer': 'GGACTACHVGGGTWTCTAAT'}}

In [34]:
exp_name = f'ana-flavia-{selected_txt}'

params['experiment_name'] = exp_name
params['base_dir'] = root_path
params['manifest_file'] = new_manifest_path
params['metadata_file'] = new_metadata_path
params['classifier_file'] = classifier_path


new_params_file_path = os.path.join(os.path.dirname(params_template), f'{exp_name}.yaml')
print(new_params_file_path)

with open(new_params_file_path, 'w') as file:
    yaml.dump(params, file)

# with open(new_params_file_path, 'r') as file:
#     params = yaml.safe_load(file)

/mnt/nupeb/rede-micro/redemicro-ana-flavia-nutri/params/ana-flavia-STD-NCxSTD-NRxHSD-NCxHSD-NR.yaml
