# Usecase 3: Microbiome load prediction data preparation

This notebook prepares the dataset for the microbiome load prediction use case, following the general data preparation approach outlined in [the original publication by Nishijima et al. 2024](https://doi.org/10.1016/j.cell.2024.10.022).

This notebook can be run in the following conda environment (the `pip install` command must be launched from the root of this repository):
```shell
mamba env create -f environment_prep_data.yml
conda activate ritme_examples_prep_data
pip install -e .
qiime dev refresh-cache
```

## Setup

In [1]:
import numpy as np
import pandas as pd
import qiime2 as q2

from src.process_u3 import process_feature_table

# reload imported modules automatically before executing each cell, so that
# edits to the project's source files are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

# render matplotlib figures inline in the notebook
%matplotlib inline

  import pkg_resources


## Fetch data

In [2]:
# run the repository's fetch script; it reports when the data is already present
! ./../../src/fetch_mlp_data.sh

Data already fetched in ../../data/u3_mlp_nishijima24, skipping.


In [3]:
# directory (relative to this notebook) holding the raw inputs and the
# processed outputs written by the cells below
path_to_data = "../../data/u3_mlp_nishijima24"

## Create Galaxy dataset

### Metadata

In [4]:
# read the Galaxy load table (first column is used as the index) and append
# the log10-transformed count as an additional column
galaxy_md = pd.read_csv(
    f"{path_to_data}/GALAXY_load.tsv",
    sep="\t",
    index_col=0,
).assign(count_log10=lambda md: np.log10(md["count"]))

print(galaxy_md.shape)
galaxy_md.head()

(1894, 3)


Unnamed: 0_level_0,count,cohort,count_log10
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALCO_PYGJU,59046650000.0,AlcoChallenge,10.771195
ALCO_ZYTNB,80143750000.0,AlcoChallenge,10.90387
ALCO_QXCHU,59725960000.0,AlcoChallenge,10.776163
ALCO_GNRDS,87255750000.0,AlcoChallenge,10.940794
ALCO_YVDGT,54209860000.0,AlcoChallenge,10.734078


In [5]:
# write the Galaxy metadata (including count_log10) as a tab-separated file
galaxy_md_file = f"{path_to_data}/md_galaxy.tsv"
galaxy_md.to_csv(galaxy_md_file, sep="\t")

### Feature table

In [6]:
# build the Galaxy mOTU feature table from the raw profile
galaxy_profile_name = "GALAXY_mOTUs_v25"
galaxy_motus = process_feature_table(path_to_data, galaxy_profile_name)
print(galaxy_motus.shape)

# NOTE(review): the export of this table is currently disabled, whereas the
# MetaCardis table further below is written to disk - confirm this is intended.
# galaxy_motus.to_csv(f"{path_to_data}/galaxy_otu_table.tsv", sep="\t")
# galaxy_motus.head()

Original shape (1894, 14213)
(1894, 14213)


In [7]:
# distinct lengths of the feature IDs: "unclassified" has 12 characters and
# "ref_mOTU_v25_XXXXX" has 18; other lengths presumably stem from other ID prefixes
{len(feature_id) for feature_id in galaxy_motus.columns}

{12, 18, 19}

In [8]:
# sanity check: each sample (row) must sum to 1 after rounding to 4 decimals,
# i.e. the table holds relative abundances
galaxy_row_totals = galaxy_motus.sum(axis=1)
assert (galaxy_row_totals.round(4) == 1.0).all()

In [9]:
# feature table and metadata must contain exactly the same sample IDs
galaxy_table_ids = set(galaxy_motus.index)
galaxy_md_ids = set(galaxy_md.index)
assert not (galaxy_table_ids - galaxy_md_ids)
assert not (galaxy_md_ids - galaxy_table_ids)

### Taxonomy

In [10]:
# mOTU -> GTDB lineage mapping, indexed by the mOTU identifier (first column);
# read from the shared data directory instead of repeating the hard-coded path
taxonomy_mapping = pd.read_csv(
    f"{path_to_data}/motus2GTDB.txt", sep="\t", index_col=0
)

# replace spaces in taxon names with underscores (literal match, not a regex)
for col in taxonomy_mapping.columns:
    taxonomy_mapping[col] = taxonomy_mapping[col].str.replace(
        " ", "_", regex=False
    )

taxonomy_mapping.head()

Unnamed: 0_level_0,Kingdom,Phylum,Class,Order,Family,Genus,Species
mOTU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Bacteria,Spirochaetota,Leptospirae,Leptospirales,Leptospiraceae,Leptospira,Leptospira_alexanderi
2,Bacteria,Spirochaetota,Leptospirae,Leptospirales,Leptospiraceae,Leptospira,Leptospira_weilii
3,Bacteria,Spirochaetota,Leptospirae,Leptospirales,Leptospiraceae,Leptospira,Leptospira_weilii
4,Bacteria,Bacteroidota,Bacteroidia,Flavobacteriales,Weeksellaceae,Chryseobacterium,Chryseobacterium_rhizosphaerae
5,Bacteria,Bacteroidota,Bacteroidia,Flavobacteriales,Weeksellaceae,Chryseobacterium,Chryseobacterium_gallinarum


In [11]:
# rank prefix prepended to each taxon name when building the lineage string
prefix_matching = {
    "Kingdom": "k__",
    "Phylum": "p__",
    "Class": "c__",
    "Order": "o__",
    "Family": "f__",
    "Genus": "g__",
    "Species": "s__",
}

# one "; "-joined lineage string per mOTU; ranks without a value are skipped
tax_df = pd.DataFrame(index=taxonomy_mapping.index)
tax_df["Taxon"] = taxonomy_mapping.apply(
    lambda ranks: "; ".join(
        f"{prefix_matching[rank]}{name}"
        for rank, name in ranks.items()
        if pd.notna(name)
    ),
    axis=1,
)
# create correct index: zero-padded mOTU number with the feature-table ID prefix
# NOTE(review): every mapping entry receives the "ref_" prefix - confirm that
# the mapping file does not contain mOTUs that carry another prefix in the table
tax_df.index = [f"ref_mOTU_v25_{int(x):05d}" for x in tax_df.index.tolist()]

# placeholder lineage for "unclassified" and for all features without taxonomy
unknown_class = (
    "k__undef; p__undef; c__undef; o__undef; f__undef; g__undef; s__undef"
)

# features of the Galaxy table that have no taxonomy entry; membership is
# tested against a set and duplicates are dropped while keeping column order
known_ids = set(tax_df.index) | {"unclassified"}
no_tax_given = [
    x for x in dict.fromkeys(galaxy_motus.columns.tolist()) if x not in known_ids
]

# append all placeholder rows in one step instead of enlarging the frame
# row by row, which copies the whole frame on every insertion
undef_df = pd.DataFrame(
    {"Taxon": unknown_class}, index=["unclassified", *no_tax_given]
)
tax_df = pd.concat([tax_df, undef_df])
tax_df.index.name = "Feature ID"
tax_df.head()

Unnamed: 0_level_0,Taxon
Feature ID,Unnamed: 1_level_1
ref_mOTU_v25_00001,k__Bacteria; p__Spirochaetota; c__Leptospirae;...
ref_mOTU_v25_00002,k__Bacteria; p__Spirochaetota; c__Leptospirae;...
ref_mOTU_v25_00003,k__Bacteria; p__Spirochaetota; c__Leptospirae;...
ref_mOTU_v25_00004,k__Bacteria; p__Bacteroidota; c__Bacteroidia; ...
ref_mOTU_v25_00005,k__Bacteria; p__Bacteroidota; c__Bacteroidia; ...


In [12]:
# import the taxonomy frame as a QIIME 2 artifact and write it to disk;
# the save call stays last so that the written path is shown as cell output
taxonomy_file = f"{path_to_data}/u3_taxonomy.qza"
tax_art = q2.Artifact.import_data("FeatureData[Taxonomy]", tax_df)
tax_art.save(taxonomy_file)

'../../data/u3_mlp_nishijima24/u3_taxonomy.qza'

No phylogenetic tree can be constructed because the nucleotide sequences of these mOTUs are not available; consequently, trac cannot be trained on this dataset.

## Create Metacardis dataset

### Metadata

In [13]:
# read the MetaCardis load table (first column is used as the index) and,
# according to the publication, append the log10-transformed count
metacardis_md = pd.read_csv(
    f"{path_to_data}/MetaCardis_load.tsv",
    sep="\t",
    index_col=0,
).assign(count_log10=lambda md: np.log10(md["count"]))

print(metacardis_md.shape)
metacardis_md.head()

(1812, 3)


Unnamed: 0_level_0,count,cohort,count_log10
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M0x10MCx1134,196108000000.0,Severe/morbid obesity,11.292495
M0x10MCx1135,143275000000.0,Type II diabetes,11.15617
M0x10MCx1138,72625700000.0,Type II diabetes,10.86109
M0x10MCx1140,71153850000.0,Severe/morbid obesity,10.852198
M0x10MCx1143,36033520000.0,Type II diabetes,10.556707


In [14]:
# write the MetaCardis metadata (including count_log10) as a tab-separated file
metacardis_md_file = f"{path_to_data}/md_metacardis.tsv"
metacardis_md.to_csv(metacardis_md_file, sep="\t")

### Feature table

In [15]:
# build the MetaCardis mOTU feature table from the raw profile
metacardis_profile_name = "MetaCardis_mOTUs_v25"
metacardis_motus = process_feature_table(path_to_data, metacardis_profile_name)
print(metacardis_motus.shape)

# persist the table as a tab-separated file, then preview the first rows
metacardis_table_file = f"{path_to_data}/metacardis_otu_table.tsv"
metacardis_motus.to_csv(metacardis_table_file, sep="\t")

metacardis_motus.head()

Original shape (1812, 14213)
(1812, 14213)


Unnamed: 0,ref_mOTU_v25_10354,unclassified,ref_mOTU_v25_04788,ref_mOTU_v25_03694,ref_mOTU_v25_06702,ref_mOTU_v25_10941,ref_mOTU_v25_10763,ref_mOTU_v25_06703,ref_mOTU_v25_10828,ref_mOTU_v25_04651,...,ref_mOTU_v25_08216,ref_mOTU_v25_03508,ref_mOTU_v25_08217,ref_mOTU_v25_08215,ref_mOTU_v25_07201,ref_mOTU_v25_10802,ref_mOTU_v25_10650,ref_mOTU_v25_11363,ref_mOTU_v25_07158,ref_mOTU_v25_07159
M0x10MCx1134,0,0.035435,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M0x10MCx1135,0,0.032264,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M0x10MCx1138,0,0.027916,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M0x10MCx1140,0,0.042573,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M0x10MCx1143,0,0.02291,0.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# sanity check: each sample (row) must sum to 1 after rounding to 4 decimals,
# i.e. the table holds relative abundances
metacardis_row_totals = metacardis_motus.sum(axis=1)
assert (metacardis_row_totals.round(4) == 1.0).all()

In [17]:
# feature table and metadata must contain exactly the same sample IDs
metacardis_table_ids = set(metacardis_motus.index)
metacardis_md_ids = set(metacardis_md.index)
assert not (metacardis_table_ids - metacardis_md_ids)
assert not (metacardis_md_ids - metacardis_table_ids)

In [18]:
# distinct lengths of the feature IDs (column names) in the MetaCardis table
{len(feature_id) for feature_id in metacardis_motus.columns}

{12, 18, 19}

### Taxonomy

The taxonomy was already processed above as `tax_art`; the same mapping can be used for both datasets.

No phylogenetic tree can be constructed because the nucleotide sequences of these mOTUs are not available; consequently, trac cannot be trained on this dataset.