# Usecase 1: Age prediction


## Setup

In [1]:
import os

import numpy as np
import pandas as pd
import qiime2 as q2

import src.meta_proc_subr14 as proc_subr
from src.meta_fetch import _fetch_all_supp_material, _fetch_sra_metadata, save_file
from src.seq_fetch_n_process import (
    cluster_sequences,
    fetch_sequences,
    filter_sequences,
    rarefy_sequences_w_fixed_seed,
)
from src.denoise_sequences import denoise_sequences
from src.seq_trim import trim_sequences

%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
######## USER INPUTS ########
# NCBI BioProject accession whose SRA metadata and sequences are fetched.
bioproject_id = "PRJEB5482"
# Directory that receives all downloaded and derived files of this use case.
path_to_data = "../data/u1_subramanian14"
# E-mail address handed to the SRA metadata fetch helper
# (placeholder value - replace with your own address).
email = "my@mail.com"
# Degree of parallelism passed to the fetch, denoise and cluster helpers.
n_jobs = 6
# Version tag passed to save_file; it shows up in the name of the saved
# metadata file (e.g. metadata_proc_v01.tsv).
tag = "01"
# Random seed passed to the rarefaction helper.
seed = 148
######## END USER INPUTS #####

In [3]:
# Create the data directory (including missing parents). `exist_ok=True`
# makes the cell idempotent and avoids the check-then-create race of a
# separate `os.path.exists` test.
os.makedirs(path_to_data, exist_ok=True)

## Fetch and process metadata

In [4]:
# --- SRA run metadata (takes ~3 min) ---
# Wrap the BioProject accession in a QIIME 2 "NCBIAccessionIDs" artifact,
# which is the form handed to the fetch helper below.
sra_ids = pd.Series(data=[bioproject_id], name="ID")
ids = q2.Artifact.import_data("NCBIAccessionIDs", sra_ids)

# Fetch the raw SRA metadata and run the study-specific processing on it.
md_sra = proc_subr._process_sra_metadata(
    _fetch_sra_metadata(path_to_data, ids, email, n_jobs)
)

# --- supplementary material of the publication ---
url_supp = "".join(
    [
        "https://static-content.springer.com/esm/",
        "art%3A10.1038%2Fnature13421/MediaObjects/",
        "41586_2014_BFnature13421_MOESM97_ESM.xlsx",
    ]
)
path2supp = _fetch_all_supp_material(path_to_data, url_supp)
md_supp = proc_subr.process_supp_metadata(path2supp)

Metadata was read from file ../data/u1_subramanian14/metadata.qza


  for idx, row in parser.parse():


Shape before merge tab4: (996, 13)
Shape before merge tab1: (50, 2)
Shape after merge: (996, 14)


  for idx, row in parser.parse():
  tab4_df.loc[
  tab4_df.loc[
  tab4_df["abx_7d_prior"] = tab4_df["abx_7d_prior"].replace(
  df.loc[df[host_id].isin(host_abx), "abx_ever"] = True
  for idx, row in parser.parse():


In [5]:
# Left-join the supplementary metadata onto the SRA metadata (keyed on
# "sample_id") and apply the study-specific post-processing to the result.
md_all = proc_subr._postprocess_all_metadata(
    md_sra.merge(md_supp, on="sample_id", how="left")
)

# Persist the processed metadata; the returned path is reused when denoising.
path_to_md = save_file(md_all, path_to_data, tag)

# Number of rows in the processed metadata; used further down as the
# denominator of the prevalence threshold when filtering.
nb_samples = len(md_all.index)

print(md_all.shape)
md_all.head()

Saved processed metadata to: ../data/u1_subramanian14/metadata_proc_v01.tsv
Saved unique project IDs to: ../data/u1_subramanian14/runids/PRJEB5482.tsv
(2811, 38)


Unnamed: 0_level_0,experiment_id,biosample_id,bioproject_id,study_id,sample_accession,library_layout,instrument,platform,public,geo_location_name,...,health_status_at_sampling,diet_milk,diet_weaning,abx_ever,zygosity,age_months,age_months_rounded05,age_months_rounded1,study_name,study_cohort_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERR500819,ERX466176,SAMEA2470685,PRJEB5482,ERP004898,ERS440148,PAIRED,Illumina MiSeq,ILLUMINA,True,"Bangladesh, Dhaka, Mirpur",...,healthy,bd,False,False,no twins,0.098564,0.0,0.0,subramanian14,subramanian14
ERR500823,ERX466180,SAMEA2470686,PRJEB5482,ERP004898,ERS440149,PAIRED,Illumina MiSeq,ILLUMINA,True,"Bangladesh, Dhaka, Mirpur",...,healthy,bd,False,False,no twins,1.511318,1.5,2.0,subramanian14,subramanian14
ERR500824,ERX466181,SAMEA2470687,PRJEB5482,ERP004898,ERS440150,PAIRED,Illumina MiSeq,ILLUMINA,True,"Bangladesh, Dhaka, Mirpur",...,healthy,bd,False,False,no twins,2.365542,2.5,2.0,subramanian14,subramanian14
ERR500825,ERX466182,SAMEA2470688,PRJEB5482,ERP004898,ERS440151,PAIRED,Illumina MiSeq,ILLUMINA,True,"Bangladesh, Dhaka, Mirpur",...,healthy,bd,False,False,no twins,3.384039,3.5,3.0,subramanian14,subramanian14
ERR500826,ERX466183,SAMEA2470689,PRJEB5482,ERP004898,ERS440152,PAIRED,Illumina MiSeq,ILLUMINA,True,"Bangladesh, Dhaka, Mirpur",...,healthy,bd,False,False,no twins,3.844006,4.0,4.0,subramanian14,subramanian14


## Fetch and process sequences
We follow the approach outlined in [the original publication by Subramanian et al. 2014](https://doi.org/10.1038/nature13421), with one addition: the sequences are denoised with dada2 before clustering. The steps are:
1) fetching sequences from NCBI SRA
2) denoising sequences with dada2
3) clustering sequences that share >= 97% identity with the 13_8 99% Greengenes reference, then clustering the remaining sequences de novo
4) filtering such that only OTUs present at or above a level of confident detection (= 0.1% relative abundance) in at least two fecal samples are retained
5) rarefying the resulting OTU table to 2,000 sequences per sample


#### 1. fetch

In [6]:
# Fetch the raw sequences of the project into `path_to_data`, using `n_jobs`
# as the degree of parallelism. Per the logged output, sequences that are
# already present on disk are not fetched again.
fetch_sequences(n_jobs, path_to_data)

Analysing PRJEB5482 ...
Imported ../data/u1_subramanian14/runids/PRJEB5482.tsv as NCBIAccessionIDsDirFmt to ../data/u1_subramanian14/runids/PRJEB5482.qza
../data/u1_subramanian14/PRJEB5482 found - not fetching again
...finished fetching sequences of PRJEB5482!


#### 2. denoise (7 min)

In [7]:
# JSON file passed to the denoising helper as `path2trunc_len`
# (presumably holds the per-study truncation lengths - confirm in
# src.denoise_sequences).
path_trunc_len = f"{path_to_data}/trunc_len.json"
# Denoise the fetched sequences; the logged command shows that this runs
# DADA2 (run_dada.R) on forward and reverse reads.
denoise_sequences(
    path2md=path_to_md,  # processed metadata saved by save_file above
    path2trunc_len=path_trunc_len,
    path2seq=path_to_data,  # directory containing the fetched sequences
    threads=n_jobs,
)

Denoising: subramanian14...
Running external command line application(s). This may print messages to stdout and/or stderr.
The command(s) being run are below. These commands cannot be manually re-run as they will depend on temporary files that no longer exist.

Command: run_dada.R --input_directory /var/folders/k3/lydcv_vx7cb6tzmkf7k8xvsw0000gp/T/tmp0rknptxj/forward --input_directory_reverse /var/folders/k3/lydcv_vx7cb6tzmkf7k8xvsw0000gp/T/tmp0rknptxj/reverse --output_path /var/folders/k3/lydcv_vx7cb6tzmkf7k8xvsw0000gp/T/tmp0rknptxj/output.tsv.biom --output_track /var/folders/k3/lydcv_vx7cb6tzmkf7k8xvsw0000gp/T/tmp0rknptxj/track.tsv --filtered_directory /var/folders/k3/lydcv_vx7cb6tzmkf7k8xvsw0000gp/T/tmp0rknptxj/filt_f --filtered_directory_reverse /var/folders/k3/lydcv_vx7cb6tzmkf7k8xvsw0000gp/T/tmp0rknptxj/filt_r --truncation_length 139 --truncation_length_reverse 127 --trim_left 0 --trim_left_reverse 0 --max_expected_errors 2.0 --max_expected_errors_reverse 2.0 --truncation_quality_

Lade nötiges Paket: Rcpp


DADA2: 1.30.0 / Rcpp: 1.0.13 / RcppParallel: 5.1.9 
2) Filtering ................................................................................................................................................................................................................................................................................................................................................................................................................................................................
3) Learning Error Rates
139150676 total bases in 1001084 reads from 173 samples will be used for learning the error rates.
127137668 total bases in 1001084 reads from 173 samples will be used for learning the error rates.
3) Denoise samples .....................................................................................................................................................................................................................................................

#### 3. cluster (2.5 min)

In [8]:
# Cluster the denoised sequences into OTUs. Per the logged output this uses
# the Greengenes 13_8 99% OTU reference (downloaded once, then reused) and
# saves the OTU table and OTU sequences under `path_to_data`.
cluster_sequences(path_to_data=path_to_data, n_threads=n_jobs)

../data/u1_subramanian14/gg_13_8_99_otus.qza found - not fetching again
Clustering sequences ...
Saved FeatureTable[Frequency] to: ../data/u1_subramanian14/otu_table_subr14.qza
Saved FeatureData[Sequence] to: ../data/u1_subramanian14/otu_seq_subr14.qza
Saved FeatureData[Sequence] to: ../data/u1_subramanian14/otu_seq_subr14_new_ref.qza


#### 4. filter

In [10]:
# Filter the OTU table; per the logged output the result is saved as
# otu_table_subr14_filt.qza. `min_prevalence` is passed as the fraction
# 2 / nb_samples, where nb_samples is the row count of the processed metadata.
# NOTE(review): if the clustered table holds fewer samples than the metadata
# has rows, this fraction corresponds to fewer than two samples of the table -
# confirm how filter_sequences interprets min_prevalence.
filter_sequences(path_to_data=path_to_data, min_prevalence=2 / nb_samples)

Filtering sequences ...
Saved FeatureTable[Frequency] to: ../data/u1_subramanian14/otu_table_subr14_filt.qza


#### 5. rarefy

In [10]:
# Background on rarefying with a fixed random seed (biom-format PR #916):
# https://github.com/biocore/biom-format/pull/916/files

In [11]:
# Rarefy the filtered OTU table; `seed` is handed to the helper so that the
# subsampling uses a fixed random seed.
path_to_otu = os.path.join(path_to_data, "otu_table_subr14_filt.qza")
rarefied_table = rarefy_sequences_w_fixed_seed(path_to_otu=path_to_otu, seed=seed)

# Assert that rarefaction worked: EVERY remaining sample must sum to exactly
# 2000 sequences. (Comparing only the smallest unique depth would let samples
# with more than 2000 sequences slip through, and an empty table would fail
# with an unhelpful IndexError.)
sample_depths = np.asarray(rarefied_table.sum(axis="sample"))
assert sample_depths.size > 0, "rarefied table contains no samples"
assert (sample_depths == 2000).all(), (
    f"unexpected per-sample depths after rarefaction: {np.unique(sample_depths)}"
)

# save to file (tab-separated)
path_to_rar = os.path.join(path_to_data, "otu_table_subr14_rar.tsv")
rarefied_table.to_dataframe().to_csv(path_to_rar, sep="\t")

rarefied_table.shape

In [12]:
# Display the dimensions of the rarefied table.
rarefied_table.shape

(679, 333)

Check whether we are close to the original publication, in which 1,222 97%-identity OTUs were found.

Shape of the rarefied OTU table here: (679, 333), i.e. 679 OTUs across 333 samples, assuming biom's (observations, samples) ordering.