# Usecase 1: Age prediction


## Setup

In [1]:
import os

import pandas as pd
import qiime2 as q2

import src.meta_proc_subr14 as proc_subr
from src.seq_fetch_n_process import (
    cluster_sequences,
    fetch_sequences,
    filter_sequences,
    rarefy_sequences_w_fixed_seed,
)
from src.meta_fetch import _fetch_all_supp_material, _fetch_sra_metadata, save_file
from src.seq_trim import trim_sequences

%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
######## USER INPUTS ########
# NCBI BioProject accession; wrapped into the accession-ID artifact that the
# SRA metadata fetch receives
bioproject_id = "PRJEB5482"
# working directory: created if missing and handed to the fetch/processing
# helpers in the cells below
path_to_data = "../data/u1_subramanian14"
# e-mail address passed to the SRA metadata fetch
# (presumably a placeholder - replace with your own address)
email = "my@mail.com"
# degree of parallelism passed to the metadata fetch, sequence fetch, trimming
# and clustering steps
n_jobs = 6
# tag passed to save_file when the merged metadata is written
tag = "01"
# random seed passed to the rarefaction step
seed = 148
######## END USER INPUTS #####

In [3]:
# Create the data directory (including missing parents). exist_ok=True makes
# re-running this cell a no-op and avoids the check-then-create race of a
# separate os.path.exists() test; it still raises FileExistsError if the path
# exists but is a regular file, instead of silently continuing.
os.makedirs(path_to_data, exist_ok=True)

## Fetch and process metadata

In [None]:
# fetch SRA metadata (takes ~3 min)
# the BioProject accession is wrapped in a Series named "ID" and imported as a
# QIIME 2 artifact of type "NCBIAccessionIDs", which is what the fetch helper
# receives
sra_ids = pd.Series([bioproject_id], name="ID")
ids = q2.Artifact.import_data("NCBIAccessionIDs", sra_ids)

# raw SRA metadata, then study-specific processing of it
md_sra = _fetch_sra_metadata(path_to_data, ids, email, n_jobs)
md_sra = proc_subr._process_sra_metadata(md_sra)

# fetch supp. material
# URL of the supplementary Excel file (.xlsx) of the publication
# (DOI 10.1038/nature13421), split over several lines for readability
url_supp = (
    "https://static-content.springer.com/esm/"
    "art%3A10.1038%2Fnature13421/MediaObjects/"
    "41586_2014_BFnature13421_MOESM97_ESM.xlsx"
)
# path2supp is the helper's return value (presumably the local path of the
# downloaded file - confirm in src.meta_fetch); it is then parsed into md_supp
path2supp = _fetch_all_supp_material(path_to_data, url_supp)
md_supp = proc_subr.process_supp_metadata(path2supp)

In [None]:
# merge
# left join on "sample_id": every SRA metadata row is kept, rows without a
# match in the supplementary table get missing values in its columns
md_all = md_sra.merge(md_supp, how="left", on="sample_id")
md_all = proc_subr._postprocess_all_metadata(md_all)

# save to file
# path_to_md (return value of save_file) is passed to the trimming step later
path_to_md = save_file(md_all, path_to_data, tag)

# get number of samples
# = number of rows of the merged metadata; used later to express the
# prevalence filter threshold as a fraction
nb_samples = md_all.shape[0]

# quick sanity check: dimensions and first rows of the merged metadata
print(md_all.shape)
md_all.head()

## Fetch and process sequences

The sequences are processed following the general approach outlined in [the original publication by Subramanian et al. 2014](https://doi.org/10.1038/nature13421), namely:
1) fetching sequences from NCBI SRA
2) trimming sequences to a length of at most 162 nucleotides and merging overlapping forward and reverse reads
3) clustering sequences sharing >= 97% identity against the 13_8 99% Greengenes reference; the remaining sequences are clustered de novo
4) filtering such that only OTUs present at or above the level of confident detection (0.1% relative abundance) in at least two fecal samples are retained
5) rarefying the resulting OTU table to 2,000 sequences per sample


#### 1. fetch

In [None]:
# step 1: fetch the sequences, passing the parallelism setting and the data
# directory from the user inputs
fetch_sequences(n_jobs, path_to_data)

#### 2. trim

In [None]:
# step 2: trim the sequences; receives path_to_md (returned by save_file when
# the merged metadata was written), the data directory and the thread count
trim_sequences(path2md=path_to_md, path2seq=path_to_data, threads=n_jobs)

#### 3. cluster

In [None]:
# step 3: cluster the sequences found under the data directory, using n_jobs
# threads
cluster_sequences(path_to_data=path_to_data, n_threads=n_jobs)

#### 4. filter

In [None]:
# step 4: filter; min_prevalence is given as a fraction, here 2 samples out of
# nb_samples (the row count of the merged metadata, so the metadata merge cell
# must have been run before this one)
filter_sequences(path_to_data=path_to_data, min_prevalence=2 / nb_samples)

#### 5. rarefy

In [None]:
# Rarefaction is run with a fixed random seed for reproducibility; see:
# https://github.com/biocore/biom-format/pull/916/files

In [None]:
# step 5: rarefy the OTU table stored in the data directory, using the
# user-defined seed; the result is kept in rarefied_table
# NOTE(review): the file name ends in "_filt_rel" - presumably the output of
# the filter step above; confirm it is the table intended for rarefaction
path_to_otu = os.path.join(path_to_data, "otu_table_subr14_filt_rel.qza")
rarefied_table = rarefy_sequences_w_fixed_seed(path_to_otu=path_to_otu, seed=seed)