# Usecase 1: Age prediction


TODOs:
* transform shell scripts into Python API scripts

## Setup

In [1]:
import os

import numpy as np
import pandas as pd
import qiime2 as q2

import src.meta_proc_subr14 as proc_subr
from src.meta_fetch import _fetch_all_supp_material, _fetch_sra_metadata, save_file
from src.seq_fetch_n_process import (
    cluster_wq_sequences,
    fetch_sequences,
    filter_sequences,
    rarefy_sequences_w_fixed_seed,
)
from src.seq_trim import trim_sequences
from qiime2.plugins import demux

# reload imported modules automatically before each cell runs, so that edits
# to the project code are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# render matplotlib figures inline in the notebook
%matplotlib inline

In [2]:
######## USER INPUTS ########
# accession wrapped as an "NCBIAccessionIDs" artifact for the SRA metadata fetch
bioproject_id = "PRJEB5482"
# directory that holds the downloaded and processed data (created if missing)
path_to_data = "../data/u1_subramanian14"
# e-mail address passed to the SRA metadata fetch
# NOTE(review): looks like a placeholder - presumably to be replaced by the user
email = "my@mail.com"
# number of jobs/threads passed to the fetch, trim and cluster steps
n_jobs = 6
# tag passed to save_file when the merged metadata is written
tag = "01"
# seed passed to rarefy_sequences_w_fixed_seed in the rarefaction step
seed = 148
######## END USER INPUTS #####

In [3]:
# Create the data directory (including parents) if it is missing.
# exist_ok=True replaces the racy "check, then create" pattern and makes the
# cell safe to re-run; it still raises if the path exists but is not a directory.
os.makedirs(path_to_data, exist_ok=True)

## Fetch and process metadata

In [4]:
# --- SRA metadata (takes ~3 min) ---
# wrap the accession in a QIIME 2 artifact of NCBI accession IDs
sra_ids = pd.Series(data=[bioproject_id], name="ID")
ids = q2.Artifact.import_data("NCBIAccessionIDs", sra_ids)

# fetch the raw SRA metadata first, then hand it to the processing step
md_sra_raw = _fetch_sra_metadata(path_to_data, ids, email, n_jobs)
md_sra = proc_subr._process_sra_metadata(md_sra_raw)

# --- supplementary material ---
# the URL is assembled from three pieces to keep the lines short
url_supp = "".join(
    [
        "https://static-content.springer.com/esm/",
        "art%3A10.1038%2Fnature13421/MediaObjects/",
        "41586_2014_BFnature13421_MOESM97_ESM.xlsx",
    ]
)
path2supp = _fetch_all_supp_material(path_to_data, url_supp)
md_supp = proc_subr.process_supp_metadata(path2supp)

  for idx, row in parser.parse():


Shape before merge tab4: (996, 13)
Shape before merge tab1: (50, 2)
Shape after merge: (996, 14)


  for idx, row in parser.parse():
  tab4_df.loc[
  tab4_df.loc[
  tab4_df["abx_7d_prior"] = tab4_df["abx_7d_prior"].replace(
  df.loc[df[host_id].isin(host_abx), "abx_ever"] = True
  for idx, row in parser.parse():


In [None]:
# left-join the supplementary metadata onto the SRA metadata via "sample_id",
# then run the post-processing step on the joined table
md_merged = pd.merge(md_sra, md_supp, how="left", on="sample_id")
md_all = proc_subr._postprocess_all_metadata(md_merged)

# write the metadata to disk and keep the returned path for the trimming step
path_to_md = save_file(md_all, path_to_data, tag)

# number of rows in the metadata table (reused in the filtering step)
nb_samples = len(md_all)

print(md_all.shape)
md_all.head()

## Fetch and process sequences

The sequence processing follows the general approach outlined in [the original publication by Subramanian et al. 2014](https://doi.org/10.1038/nature13421), namely:
1) fetching sequences from NCBI SRA
2) trimming sequences to at most 162 nucleotides in length and overlapping forward and reverse reads
3) clustering sequences sharing >= 97% identity against the 13_8 99% Greengenes reference, with the remaining sequences clustered de novo
4) filtering such that only OTUs present at or above the level of confident detection (0.1% relative abundance) in at least two fecal samples are retained
5) rarefying the resulting OTU table to 2,000 sequences per sample


#### 1. fetch

In [None]:
# step 1 of the outline above (fetching sequences from NCBI SRA);
# the job count and the data directory are passed positionally
fetch_sequences(n_jobs, path_to_data)

In [None]:
# check size
# Build the artifact path from the user inputs instead of repeating them as a
# literal, so the cell keeps working when path_to_data is changed.
# NOTE(review): assumes the fetched reads live in a sub-folder named after the
# BioProject accession - confirm against fetch_sequences.
path_to_paired = os.path.join(path_to_data, bioproject_id, "paired_reads.qza")
paired_reads = q2.Artifact.load(path_to_paired)

# summarize the demultiplexed reads and store the visualization next to the artifact
(sum_paired,) = demux.actions.summarize(data=paired_reads)
# swap only the file extension; str.replace would touch every ".qza" in the path
path_summary = os.path.splitext(path_to_paired)[0] + "_summary.qzv"
sum_paired.save(path_summary)
print(f"Saved paired summary in: {path_summary}")

448 samples with 18858.3 median forward and reverse reads

#### 2. trim

In [None]:
# step 2 of the outline above (trimming); receives the path of the saved
# metadata file, the data directory and the number of threads
trim_sequences(path2md=path_to_md, path2seq=path_to_data, threads=n_jobs)

In [None]:
# check size
# Build the artifact path from path_to_data instead of repeating the directory
# as a literal, so the cell keeps working when the user input is changed.
path_to_trim = os.path.join(path_to_data, "trimmed_subramanian14.qza")
trimmed_reads = q2.Artifact.load(path_to_trim)

# summarize the trimmed reads and store the visualization next to the artifact
(sum_reads,) = demux.actions.summarize(data=trimmed_reads)
# swap only the file extension; str.replace would touch every ".qza" in the path
path_summary = os.path.splitext(path_to_trim)[0] + "_summary.qzv"
sum_reads.save(path_summary)
print(f"Saved demux summary in: {path_summary}")

448 samples with 18349.5 median forward and reverse reads

#### 3. cluster

In [None]:
# step 3 of the outline above (clustering); receives the data directory and
# the number of threads
cluster_wq_sequences(path_to_data=path_to_data, n_threads=n_jobs)

In [None]:
# check size
# Derive the artifact path from path_to_data rather than repeating the
# directory as a literal (the single-argument os.path.join was a no-op).
path_to_otu = os.path.join(path_to_data, "otu_table_subr14_wq.qza")
otu_table = q2.Artifact.load(path_to_otu)
# shape of the clustered OTU table viewed as a DataFrame
otu_table.view(pd.DataFrame).shape

#### 4. filter

In [None]:
# step 4 of the outline above: prevalence threshold expressed as a fraction,
# i.e. two samples out of nb_samples
# NOTE(review): nb_samples is the row count of the metadata table, which may
# differ from the number of samples in the OTU table - confirm this is intended
min_prevalence_two_samples = 2 / nb_samples
filter_sequences(
    path_to_data=path_to_data,
    min_prevalence=min_prevalence_two_samples,
)

In [None]:
# check size
# Derive the artifact path from path_to_data rather than repeating the
# directory as a literal (the single-argument os.path.join was a no-op).
path_to_otu = os.path.join(path_to_data, "otu_table_subr14_filt.qza")
otu_table = q2.Artifact.load(path_to_otu)
# shape of the filtered OTU table viewed as a DataFrame
otu_table.view(pd.DataFrame).shape

#### 5. rarefy

In [None]:
# rarefy the filtered OTU table with the fixed seed from the user inputs
path_to_otu = os.path.join(path_to_data, "otu_table_subr14_filt.qza")
rarefied_table = rarefy_sequences_w_fixed_seed(path_to_otu=path_to_otu, seed=seed)

# assert that rarefaction worked: EVERY sample must sum to the target depth.
# Comparing only the smallest unique sum (np.unique(...)[0]) would silently
# accept tables in which some samples are deeper than the target.
rarefaction_depth = 2000
sample_depths = np.asarray(rarefied_table.sum(axis="sample"))
assert sample_depths.size > 0, "rarefied table contains no samples"
assert (sample_depths == rarefaction_depth).all(), (
    f"expected {rarefaction_depth} sequences in every sample, "
    f"found depths: {np.unique(sample_depths)}"
)

# save to file (transposed, tab-separated)
path_to_rar = os.path.join(path_to_data, "otu_table_subr14_rar.tsv")
df_rarefied_table = rarefied_table.to_dataframe().transpose()
df_rarefied_table.to_csv(path_to_rar, sep="\t")

df_rarefied_table.shape

Shape of the rarefied table (`df_rarefied_table.shape`): (850, 448)

## Merge metadata and sequences

In [None]:
# show the shapes of both inputs before joining them
for frame in (df_rarefied_table, md_all):
    print(frame.shape)

# join metadata and rarefied table on their indices (default inner join)
ft_merged = md_all.merge(df_rarefied_table, left_index=True, right_index=True)
print(ft_merged.shape)

In [None]:
# save the merged metadata + rarefied table as a tab-separated file
# inside the data directory and report where it was written
output_filename = os.path.join(path_to_data, "ft_subr14_sra_subset.tsv")
ft_merged.to_csv(output_filename, sep="\t")
print(f"Saved feature table to {output_filename}")

## Describe

In [None]:
# number of rows per study sub-cohort, with missing values counted as well
ft_merged["study_subcohort"].value_counts(dropna=False)

In [None]:
# number of distinct hosts in the merged table
ft_merged["host_id"].nunique()

The authors of this study did not upload all the sequences needed to reproduce their RF training and testing; the complete dataset would comprise 50 unique infants. In the study's original training set, 12 healthy infants with 1,222 97%-identity OTUs were used to train the model, and 25 twins & triplets plus 13 singletons (38 infants in total) were then used to test it. However, only a fraction of the test set (25 out of 38 infants) was actually uploaded to SRA.