In [2]:
import os
import pandas as pd
import qiime2 as q2
import requests

from qiime2 import Visualization

data_dir = 'poop_data/Metagenomics'
    
%matplotlib inline

In [22]:
def fetch_ipath(ids: list, img_output_path: str, verbose: bool = False):
    """Fetches a enriched pathways map from iPATH3 for given IDs."""
    url = 'https://pathways.embl.de/mapping.cgi'
    
    # remove colon from EC names
    if ':' in ids[0]:
        ids = [x.replace(':', '') for x in ids]
    
    if verbose:
        print(f'Fetching iPATH3 diagram for ids: {ids}')
    params = {
        'default_opacity': 0.6,
        'export_type': 'svg',
        'selection': '\n'.join(ids)
    }   
    response = requests.get(url=url, params=params)
    
    with open(img_output_path, 'wb') as img:
        img.write(response.content)

In [3]:
# path to the picrust2 conda environment - do not change!
picrust_env = '/opt/conda/envs/picrust2/bin'

In [4]:
%%script env picrust_env="$picrust_env" data_dir="$data_dir" bash

# append the env location to PATH so that qiime
# can find all required executables
export PATH=$picrust_env:$PATH

$picrust_env/qiime picrust2 full-pipeline \
    --i-seq $'poop_data/Taxonomy'/rep-seqs-filtered_new.qza \
    --i-table $'poop_data/Taxonomy'/table-filtered_new.qza \
    --output-dir $data_dir/picrust2_results \
    --p-placement-tool sepp \
    --p-threads 2 \
    --p-hsp-method pic \
    --p-max-nsti 2 

QIIME is caching your current deployment for improved performance. This may take a few moments and should only happen once per deployment.


Saved FeatureTable[Frequency] to: poop_data/Metagenomics/picrust2_results/ko_metagenome.qza
Saved FeatureTable[Frequency] to: poop_data/Metagenomics/picrust2_results/ec_metagenome.qza
Saved FeatureTable[Frequency] to: poop_data/Metagenomics/picrust2_results/pathway_abundance.qza


In [10]:
ko = q2.Artifact.load(f'{data_dir}/picrust2_results/ko_metagenome.qza').view(pd.DataFrame)
ec = q2.Artifact.load(f'{data_dir}/picrust2_results/ec_metagenome.qza').view(pd.DataFrame)
pa = q2.Artifact.load(f'{data_dir}/picrust2_results/pathway_abundance.qza').view(pd.DataFrame)

In [4]:
ko.head()

Unnamed: 0,K00001,K00002,K00003,K00004,K00005,K00007,K00008,K00009,K00010,K00011,...,K19777,K19778,K19779,K19780,K19784,K19785,K19787,K19788,K19789,K19791
10317.000002929,5142.037624,140.257551,8244.813771,260.071999,6475.56947,0.000155,7614.452333,1306.043795,4242.991858,0.0,...,0.0,0.0,0.0,0.0,142.764138,0.0,0.0,0.0,318.690457,0.0
10317.00000293,2563.326126,200.321778,3945.306551,418.262647,2311.037174,12.167427,3720.771736,908.086201,2010.539789,1e-06,...,1.961244e-13,2.599677e-12,1.84189e-15,2.480888e-15,426.041781,0.0,0.0,0.0,248.319698,0.111276
10317.000003283,5190.805498,58.909884,4097.621029,468.584498,4336.160197,6.093604,5776.90503,2098.635538,3172.779917,0.0,...,1363.63,1363.63,272.0552,272.5722,1590.939135,0.0,0.0,0.0,1642.154746,0.0
10317.000004079,3883.610863,28.50543,8046.769281,460.999838,7298.148013,79.780128,5404.44952,1809.185532,2847.144885,0.001873,...,22.87358,2.94,0.5865537,0.5876683,197.758132,0.0,0.0,0.0,266.304296,0.005843
10317.000016169,3416.63877,164.936054,4919.361142,155.497359,3372.75469,0.012271,4126.667736,1607.9822,3702.155707,0.0,...,176.26,176.26,35.16529,35.23212,351.099496,0.0,0.0,0.0,307.766296,0.0


In [11]:
metadata = pd.read_csv(f'poop_data/metadata.tsv', sep='\t', header=0, index_col=0)

In [12]:
metadata.index = metadata.index.astype(str)

In [13]:
ko_meta = ko.merge(metadata[['HEA_ibd']], left_index=True, right_index=True)
ec_meta = ec.merge(metadata[['HEA_ibd']], left_index=True, right_index=True)
pa_meta = pa.merge(metadata[['HEA_ibd']], left_index=True, right_index=True)

In [7]:
metadata.sort_values('GEN_dog')


Unnamed: 0_level_0,GEN_age_cat,GEN_age_corrected,GEN_bmi_cat,GEN_bmi_corrected,GEN_cat,GEN_collection_timestamp,GEN_country,GEN_dog,GEN_elevation,GEN_geo_loc_name,...,HEA_lung_disease,HEA_mental_illness,HEA_migraine,HEA_seasonal_allergies,HEA_sibo,HEA_skin_condition,HEA_sleep_duration,HEA_smoking_frequency,HEA_thyroid,HEA_weight_change
sampleid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10317.000050,50s,59.0,Overweight,29.98,False,2016-06-27 09:00:00,United Kingdom,False,33.1,United Kingdom:England,...,False,False,False,False,True,False,5-6 hours,Never,False,Increased more than 10 pounds
10317.000053,60s,61.0,Overweight,29.44,False,2016-08-05 10:20:00,United Kingdom,False,163.2,United Kingdom:England,...,True,False,False,False,False,False,7-8 hours,Never,False,Remained stable
10317.000053,40s,47.0,Obese,32.18,False,2016-08-06 08:05:00,United Kingdom,False,172.6,United Kingdom:Wales,...,False,False,False,False,False,False,7-8 hours,Never,False,Remained stable
10317.000027,50s,54.0,Overweight,26.61,False,2016-01-25 09:00:00,USA,False,25.8,USA:VA,...,False,False,False,False,False,False,7-8 hours,Never,False,Decreased more than 10 pounds
10317.000053,60s,64.0,Normal,19.21,False,2016-07-26 15:30:00,United Kingdom,False,79.5,United Kingdom:England,...,False,False,False,False,True,False,7-8 hours,Never,False,Remained stable
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10317.000043,Not provided,,Underweight,14.79,False,2016-04-23 10:15:00,United Kingdom,True,6.4,United Kingdom:Unspecified,...,False,False,True,True,False,False,8 or more hours,Not provided,False,Remained stable
10317.000054,60s,61.0,Overweight,28.12,False,2016-07-28 08:20:00,United Kingdom,True,104.3,United Kingdom:England,...,False,True,True,True,False,False,7-8 hours,Never,False,Remained stable
10317.000046,50s,54.0,Normal,22.46,False,2016-07-26 05:00:00,USA,True,83.6,USA:MA,...,True,False,False,True,False,True,6-7 hours,Never,False,Remained stable
10317.000051,30s,31.0,Normal,20.37,False,2016-07-06 18:15:00,USA,True,34.9,USA:PA,...,False,True,False,True,False,True,7-8 hours,Never,False,Remained stable


In [14]:
# collapse samples per sample_type - calculate average abundance

ko_meta_avg = ko_meta.groupby('HEA_ibd').mean()
ec_meta_avg = ec_meta.groupby('HEA_ibd').mean()
pa_meta_avg = pa_meta.groupby('HEA_ibd').mean()

In [15]:
ko_meta_avg.head()

Unnamed: 0_level_0,K00001,K00002,K00003,K00004,K00005,K00007,K00008,K00009,K00010,K00011,...,K19777,K19778,K19779,K19780,K19784,K19785,K19787,K19788,K19789,K19791
HEA_ibd,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
False,3218.840292,72.392216,4957.599135,393.304786,3924.407475,55.034151,4547.619892,1405.649248,2663.206223,0.789319,...,699.343784,600.732319,102.443945,117.213335,1075.281444,6e-06,5.768177e-08,0.001996791,927.353273,0.005656
True,3345.947811,63.304153,4401.595549,431.745587,3717.662482,90.632875,3915.028907,1572.999386,2615.062583,0.271169,...,597.76817,695.290508,102.524699,102.716064,1252.307983,2.5e-05,5.982661e-07,3.9055959999999996e-57,1046.515287,0.002014


In [16]:
# find top x% of the most abundant KOs, ECs and pathways in each sample type

def find_most_abundant(df: pd.DataFrame, frac):
    if 0 < frac < 1:
        frac = int(frac * len(df.columns))
    print(f'Saving {frac} most abundant features...')
    most_abundant = {
        smp: df.loc[smp, :].sort_values(ascending=False)[:frac]
        for smp in df.index
    }
    return most_abundant

ko_most_abundant = find_most_abundant(ko_meta_avg, 0.01)
ec_most_abundant = find_most_abundant(ec_meta_avg, 0.03)
pa_most_abundant = find_most_abundant(pa_meta_avg, 5)

Saving 101 most abundant features...
Saving 85 most abundant features...
Saving 5 most abundant features...


In [19]:
ko_most_abundant

{False: K03088    44633.067968
 K01990    38923.179052
 K02004    38462.180223
 K06147    38038.284858
 K01992    37872.265057
               ...     
 K00945     9618.599847
 K06131     9608.866493
 K01153     9550.129203
 K07053     9492.372542
 K02032     9464.503162
 Length: 101, dtype: float64,
 True: K03088    38664.931473
 K01990    34185.787550
 K02004    34134.973744
 K01992    34008.205506
 K06147    32060.938588
               ...     
 K00655     8782.604054
 K02470     8761.470416
 K12373     8755.799267
 K06131     8650.027008
 K01265     8645.158939
 Name: True, Length: 101, dtype: float64}

In [20]:
print(f'10 most abundant KOs in the treatment group are: {ko_most_abundant[True].index[:10].tolist()}\n'
      f'10 most abundant KOs in the non-treatment group are: {ko_most_abundant[False].index[:10].tolist()}\n')

10 most abundant KOs in the treatment group are: ['K03088', 'K01990', 'K02004', 'K01992', 'K06147', 'K02003', 'K02529', 'K07024', 'K02015', 'K00059']
10 most abundant KOs in the non-treatment group are: ['K03088', 'K01990', 'K02004', 'K06147', 'K01992', 'K02003', 'K02529', 'K07024', 'K03497', 'K02015']



In [18]:
print(f'4 most abundant pathways in the treatment group are: {pa_most_abundant[True].index[:4].tolist()}\n'
      f'4 most abundant pathways in the non-treatment group are: {pa_most_abundant[False].index[:4].tolist()}\n')

4 most abundant pathways in the treatment group are: ['NONOXIPENT-PWY', 'PWY-7111', 'PWY-5101', 'PWY-7663']
4 most abundant pathways in the non-treatment group are: ['NONOXIPENT-PWY', 'PWY-7111', 'PWY-5101', 'PWY-7663']



In [23]:
for smp in ko_most_abundant.keys():
    fetch_ipath(ko_most_abundant[smp].index.tolist(), f'{data_dir}/kos_{smp}.svg')
    fetch_ipath(ec_most_abundant[smp].index.str.replace(':', '').tolist(), f'{data_dir}/ecs_{smp}.svg')

In [24]:
! qiime composition add-pseudocount \
    --i-table $data_dir/picrust2_results/pathway_abundance.qza \
    --o-composition-table $data_dir/picrust2_results/pathway_abundance_abund.qza

[32mSaved FeatureTable[Composition] to: poop_data/Metagenomics/picrust2_results/pathway_abundance_abund.qza[0m
[0m

In [27]:
! qiime composition ancom \
    --i-table $data_dir/picrust2_results/pathway_abundance_abund.qza \
    --m-metadata-file $'poop_data/metadata.tsv' \
    --m-metadata-column HEA_ibd \
    --p-transform-function log \
    --o-visualization $data_dir/pa_ancom_ibd.qzv

[32mSaved Visualization to: poop_data/Metagenomics/pa_ancom_ibd.qzv[0m
[0m

In [28]:
Visualization.load(f'{data_dir}/pa_ancom_ibd.qzv')