# Use case 1: Age prediction


## Setup

In [1]:
import os

import pandas as pd
import qiime2 as q2

import src.meta_proc_subr14 as proc_subr
from src.meta_fetch import _fetch_all_supp_material, _fetch_sra_metadata

%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
######## USER INPUTS ########
# Accession that is wrapped into the "ID" series and imported as
# NCBIAccessionIDs for the SRA metadata fetch.
bioproject_id = "PRJEB5482"
# Directory handed to the fetch helpers; it is created below if it does not
# exist yet.
path2data = "../data/u1_subramanian14"
# E-mail address passed to the SRA metadata fetch.
# NOTE(review): looks like a placeholder -- presumably should be replaced
# with a real contact address before running; confirm.
email = "my@mail.com"
# Passed to the SRA metadata fetch; presumably the number of parallel
# jobs -- TODO confirm against `_fetch_sra_metadata`.
n_jobs = 6
######## END USER INPUTS #####

In [3]:
# Create the data directory (including any missing parents).
# `exist_ok=True` makes the call idempotent without a separate existence
# check, so there is no window between "check" and "create" in which another
# process could create the directory first. If the path already exists but is
# not a directory, this raises instead of silently continuing.
os.makedirs(path2data, exist_ok=True)

## Fetch and process metadata

In [None]:
# --- SRA metadata (the fetch takes ~3 min) ---
# Wrap the accession in a one-element Series named "ID" and import it as a
# QIIME 2 artifact of type NCBIAccessionIDs.
sra_ids = pd.Series([bioproject_id], name="ID")
ids = q2.Artifact.import_data("NCBIAccessionIDs", sra_ids)

# Fetch the raw SRA metadata and hand it straight to the study-specific
# processing step.
md_sra = proc_subr._process_sra_metadata(
    _fetch_sra_metadata(path2data, ids, email, n_jobs)
)

# --- supplementary material of the publication ---
url_supp = (
    "https://static-content.springer.com/esm/"
    "art%3A10.1038%2Fnature13421/MediaObjects/"
    "41586_2014_BFnature13421_MOESM97_ESM.xlsx"
)
path2supp = _fetch_all_supp_material(path2data, url_supp)
md_supp = proc_subr.process_supp_metadata(path2supp)

In [None]:
# Left-join the supplementary metadata onto the SRA metadata by `sample_id`
# and run the post-processing step on the combined table.
md_all = proc_subr._postprocess_all_metadata(
    md_sra.merge(md_supp, on="sample_id", how="left")
)
print(md_all.shape)
md_all.head()

## Fetch and process sequences

We follow the general approach outlined in [the original publication by Subramanian et al. 2014](https://doi.org/10.1038/nature13421), namely:

1) fetching sequences from NCBI SRA. The publication describes the read processing as follows: "Reads of 250 nucleotides in length were trimmed to 162 nucleotides, then all reads were processed using previously described custom scripts, and overlapped to 253-nucleotide fragments spanning the entire V4 amplicon" (the custom scripts are those of Faith, J. J. et al., "The long-term stability of the human gut microbiota", reference 15 of the publication).

2) clustering sequences that share >= 97% identity into OTUs matched against the Greengenes reference, and clustering the remaining sequences de novo
3) filtering such that only OTUs present at or above a level of confident detection (0.1% relative abundance) in at least two fecal samples are retained
4) rarefying the resulting OTU table to 2,000 sequences per sample


In [None]:
# For rarefying with a fixed random seed, see: https://github.com/biocore/biom-format/pull/916/files