# Notebook Setup

In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import sqlite3
import seaborn as sns
import patsy
import statsmodels.formula.api as sm
from sklearn.linear_model import LassoCV, LassoLarsCV, Lasso
from sklearn.model_selection import cross_val_predict
from lifelines import KaplanMeierFitter
from matplotlib.ticker import StrMethodFormatter
from statsmodels.stats.multitest import fdrcorrection
import itertools

import matplotlib as mpl

import rpy2.ipython
%load_ext rpy2.ipython.rmagic

from scripts.lib.stats import raise_low, lrt_phreg, phreg_aic, mannwhitneyu
from scripts.lib.plotting import boxplot_with_points, load_style, residuals_plot
from skbio.diversity.alpha import chao1, simpson_e
from skbio.stats import subsample_counts
from skbio import DistanceMatrix
from skbio.stats.ordination import pcoa

# Seed NumPy's global random generator once, up front.
# NOTE(review): the later `.sample(...)` calls pass no random_state, so they
# presumably draw from this generator -- confirm.
np.random.seed(1)


def concat(list_of_lists):
    """Flatten one level of nesting and return the items as a single list."""
    return list(itertools.chain.from_iterable(list_of_lists))


def richness(x):
    """Number of entries of `x` that are strictly greater than zero."""
    return (x > 0).sum()


def incidence(x):
    """Fraction of entries of `x` that are strictly greater than zero."""
    return (x > 0).mean()

In [None]:
# Load the 'paper' style and expose three of its entries as notebook-level names.
loaded_style = load_style('paper')

color_map, mark_map, assign_significance_symbol = (
    loaded_style[key]
    for key in ('color_map', 'mark_map', 'assign_significance_symbol')
)

In [None]:
# Load the tables stored in the C2013 results database.
from scripts.lib.data import load_data
loaded_data = load_data('res/C2013.results.db')
# Publish every entry of `loaded_data` as a notebook-level variable.
# NOTE(review): `mouse`, `conc`, `count`, `rabund`, `abund` and `taxonomy` are
# used by later cells but never assigned anywhere in the visible notebook, so
# they presumably come from this call -- confirm against the keys printed below.
gl = globals()
gl.update(loaded_data)

# Show which names were made available.
print(loaded_data.keys())

In [None]:
# Analysis set: C2013 mice, joined with `conc`, keeping only rows whose
# `dens` value is present.
meta = (
    mouse.loc[mouse.cohort == 'C2013']
    .join(conc)
    .dropna(subset=['dens'])
    .sample(frac=1)  # shuffle the row order
)
miceA = meta.index

# The three abundance tables, restricted to those mice (in the shuffled order).
cdata = count.loc[miceA]
rdata = rabund.loc[miceA]
adata = abund.loc[miceA]


def _summarize_taxon(values):
    """Mean abundance and incidence of one `rabund` column over the selected mice."""
    return pd.Series({'mean_abund': np.mean(values),
                      'incidence': incidence(values)})


# One row per column of `rabund`, with columns 'mean_abund' and 'incidence'.
taxa_details = rdata.apply(_summarize_taxon).T

# DESeq2

In [None]:
%%R -i meta -i cdata -o res_treatment -o res_sex -o res_interact

# Fit a DESeq2 model to the count table and hand three result tables
# (res_treatment, res_sex, res_interact) back to Python via the -o flags above.
library('DESeq2')

# Encode site and treatment as factors; 'control' is made the reference
# level of treatment.
meta$site = factor(meta$site)
meta$treatment = factor(meta$treatment)
meta$treatment = relevel(meta$treatment, 'control')

# cdata is indexed by mouse on the Python side, so it is transposed here
# before being passed as countData.
dd <- DESeqDataSetFromMatrix(countData=t(cdata),
                     colData=meta,
                     design=~treatment * sex + site)
ddr <- DESeq(dd, test="Wald", fitType="parametric", quiet=TRUE)
# Print the fitted coefficient names so the contrasts below can be checked by eye.
print(resultsNames(ddr))
# Explicit contrasts: acarbose vs. control, and male vs. female.
# NOTE(review): the design contains a treatment:sex interaction, so these
# contrasts presumably describe the effect at the reference level of the
# other factor rather than an average effect -- confirm.
res_treatment <- as(results(ddr, cooksCutoff = FALSE, contrast=c('treatment', 'acarbose', 'control')), 'data.frame')
res_sex <- as(results(ddr, cooksCutoff = FALSE, contrast=c('sex', 'male', 'female')), 'data.frame')
# No contrast/name is given here, so results() falls back to its default term.
# NOTE(review): presumably that is the treatment:sex interaction (the last
# name printed by resultsNames above) -- confirm, or pass name= explicitly.
res_interact <- as(results(ddr, cooksCutoff = FALSE), 'data.frame')

In [None]:
# Merge the three DESeq2 result tables into the per-taxon summary, keep the
# sufficiently abundant / prevalent taxa, and attach FDR-adjusted p-values
# and taxonomy.

# The frames returned from R are relabelled positionally with the taxon ids.
# NOTE(review): this assumes the columns of `cdata` are in the same order as
# `taxa_details.index` -- confirm.
for res in [res_treatment, res_sex, res_interact]:
    res.set_index(taxa_details.index, inplace=True)

taxa_details['treatment_effect'] = res_treatment.log2FoldChange
taxa_details['treatment_p'] = res_treatment.pvalue
taxa_details['sex_effect'] = res_sex.log2FoldChange
taxa_details['sex_p'] = res_sex.pvalue
taxa_details['interact_effect'] = res_interact.log2FoldChange
taxa_details['interact_p'] = res_interact.pvalue


def _fdr_adjust(pvalues):
    """Return FDR-adjusted p-values, leaving missing p-values missing.

    The adjustment is computed over the non-missing entries only, so a
    single missing p-value cannot turn every adjusted value into NaN.
    With no missing entries this is the same as `fdrcorrection(pvalues)[1]`.
    """
    raw = np.asarray(pvalues, dtype=float)
    adjusted = np.full(raw.shape, np.nan)
    tested = ~np.isnan(raw)
    if tested.any():
        adjusted[tested] = fdrcorrection(raw[tested])[1]
    return adjusted


# Keep taxa with mean abundance above 1e-4 that are present in more than 5%
# of samples.  `.copy()` makes `result` an independent frame, so the column
# assignments below never touch (or warn about) `taxa_details`.
_keep = (taxa_details.mean_abund > 0.0001) & (taxa_details.incidence > 0.05)
result = taxa_details.loc[_keep].copy()

result['treatment_padj'] = _fdr_adjust(result.treatment_p)
result['sex_padj'] = _fdr_adjust(result.sex_p)
result['interact_padj'] = _fdr_adjust(result.interact_p)

result = result.join(taxonomy)
# Blank out missing genus labels; fillna covers both None and the NaN that a
# left join leaves for unmatched rows.
result['genus'] = result['genus'].fillna('')

result.shape

In [None]:
# Taxa with a treatment effect at adjusted p < 0.05: how many there are, and
# the median over samples of their summed relative abundance.
# NOTE(review): this sums over every sample in `rabund`, not only the `rdata`
# subset the model was fitted on -- confirm that is intended.
print("OTUs affect by treatment")
_significant = result.loc[result.treatment_padj < 0.05].index
d = rabund.loc[:, _significant]
print(d.shape[1])
print(d.sum(axis=1).median())

In [None]:
# Taxa with a sex effect at adjusted p < 0.05: how many there are, and the
# median over samples of their summed relative abundance.
# NOTE(review): this sums over every sample in `rabund`, not only the `rdata`
# subset the model was fitted on -- confirm that is intended.
print("OTUs affect by sex")
_significant = result.loc[result.sex_padj < 0.05].index
d = rabund.loc[:, _significant]
print(d.shape[1])
print(d.sum(axis=1).median())

In [None]:
# Taxa with a sex-by-treatment interaction at adjusted p < 0.05: how many
# there are, and the median over samples of their summed relative abundance.
# NOTE(review): this sums over every sample in `rabund`, not only the `rdata`
# subset the model was fitted on -- confirm that is intended.
print("OTUs affect by sex-by-treatment interaction")
_significant = result.loc[result.interact_padj < 0.05].index
d = rabund.loc[:, _significant]
print(d.shape[1])
print(d.sum(axis=1).median())

In [None]:
# Otu0004 by sex and treatment for the mice whose site is 'UT', drawn on a
# symmetric-log y axis.
# NOTE(review): newer seaborn releases spell `split` as `dodge`, and newer
# matplotlib releases spell `linthreshy` as `linthresh`; both keywords need
# updating if the plotting libraries are upgraded.
_ut_samples = rdata.join(meta).loc[meta.site == 'UT']
sns.stripplot(x='sex', y='Otu0004', hue='treatment', data=_ut_samples,
              split=True, jitter=True)
plt.yscale('symlog', linthreshy=0.01)

# Per-taxon results for the two OTUs, shown one column per OTU.
result.loc[['Otu0001', 'Otu0004']].T

# LASSO

In [None]:
# Are there any particularly influential outliers?
# Pairwise scatter plots of acetate, butyrate, lactate, propionate and
# Otu0001 for the rows with a butyrate value, coloured by an outlier flag
# that starts out False for every row.
_data = (
    meta
    .join(abund)
    .dropna(subset=['butyrate'])
    .assign(is_outlier=False)
)
# To highlight a specific sample, flag it here, e.g.:
#_data['is_outlier'].loc['JLc0836'] = True

sns.pairplot(
    _data,
    hue='is_outlier',
    vars=['acetate', 'butyrate', 'lactate', 'propionate', 'Otu0001'],
)

In [None]:
# LASSO of the propionate residuals on taxon abundances.
feat = 'propionate'

# NOTE(review): rows are filtered on 'butyrate' even though `feat` is a
# different column -- confirm this is intended.
_data = (
    meta
    .join(adata)                             # estimated absolute abundance, not relative
    .dropna(subset=['Otu0001', 'butyrate'])
    # .drop('JLc0836')                       # would drop the outlier sample
    .sample(frac=1.0)                        # shuffle rows for CV purposes
)

# Regress log-transformed `feat` on the full treatment/sex/site design and
# keep the residuals as the LASSO response.
_formula = 'np.log(raise_low({})) ~ treatment * sex * site'.format(feat)
fit = sm.ols(_formula, data=_data).fit()
resid = fit.resid

# Candidate predictors: taxa above the abundance and incidence thresholds.
top_taxa = taxa_details.loc[
    (taxa_details.mean_abund > 0.0001) & (taxa_details.incidence > 0.05)
].index

# Only the rows that received a residual are used.
_X = _data.loc[resid.index, top_taxa]

# Choose the penalty by 10-fold CV and report it.
lasso_cv = LassoCV(cv=10)
lasso_cv.fit(_X, resid)
print(lasso_cv.alpha_)

# Out-of-fold predictions at the chosen penalty, compared with the residuals.
lasso = Lasso(alpha=lasso_cv.alpha_)
pred = cross_val_predict(lasso, _X, resid, cv=10)
plt.scatter(pred, resid)
print(sp.stats.spearmanr(pred, resid))

# Record the fitted coefficients in `result`, aligned on taxon id.
_coef = pd.Series(lasso_cv.coef_, index=top_taxa, name='coef')
result[feat] = _coef

# Taxa with a non-zero coefficient, largest first.
(
    taxonomy
    .join(_coef)
    .dropna(subset=['coef'])
    .sort_values('coef', ascending=False)
    .loc[lambda x: x.coef != 0]
)

In [None]:
# LASSO of the lactate residuals on taxon abundances.
feat = 'lactate'

# NOTE(review): rows are filtered on 'butyrate' even though `feat` is a
# different column -- confirm this is intended.
_data = (
    meta
    .join(adata)                             # estimated absolute abundance, not relative
    .dropna(subset=['Otu0001', 'butyrate'])
    # .drop('JLc0836')                       # would drop the outlier sample
    .sample(frac=1.0)                        # shuffle rows for CV purposes
)

# Regress log-transformed `feat` on the full treatment/sex/site design and
# keep the residuals as the LASSO response.
_formula = 'np.log(raise_low({})) ~ treatment * sex * site'.format(feat)
fit = sm.ols(_formula, data=_data).fit()
resid = fit.resid

# Candidate predictors: taxa above the abundance and incidence thresholds.
top_taxa = taxa_details.loc[
    (taxa_details.mean_abund > 0.0001) & (taxa_details.incidence > 0.05)
].index

# Only the rows that received a residual are used.
_X = _data.loc[resid.index, top_taxa]

# Choose the penalty by 10-fold CV and report it.
lasso_cv = LassoCV(cv=10)
lasso_cv.fit(_X, resid)
print(lasso_cv.alpha_)

# Out-of-fold predictions at the chosen penalty, compared with the residuals.
lasso = Lasso(alpha=lasso_cv.alpha_)
pred = cross_val_predict(lasso, _X, resid, cv=10)
plt.scatter(pred, resid)
print(sp.stats.spearmanr(pred, resid))

# Record the fitted coefficients in `result`, aligned on taxon id.
_coef = pd.Series(lasso_cv.coef_, index=top_taxa, name='coef')
result[feat] = _coef

# Taxa with a non-zero coefficient, largest first.
(
    taxonomy
    .join(_coef)
    .dropna(subset=['coef'])
    .sort_values('coef', ascending=False)
    .loc[lambda x: x.coef != 0]
)



In [None]:
# LASSO of the butyrate residuals on taxon abundances.
feat = 'butyrate'

# Rows must have both an Otu0001 value and a butyrate value.
_data = (
    meta
    .join(adata)                             # estimated absolute abundance, not relative
    .dropna(subset=['Otu0001', 'butyrate'])
    # .drop('JLc0836')                       # would drop the outlier sample
    .sample(frac=1.0)                        # shuffle rows for CV purposes
)

# Regress log-transformed `feat` on the full treatment/sex/site design and
# keep the residuals as the LASSO response.
_formula = 'np.log(raise_low({})) ~ treatment * sex * site'.format(feat)
fit = sm.ols(_formula, data=_data).fit()
resid = fit.resid

# Candidate predictors: taxa above the abundance and incidence thresholds.
top_taxa = taxa_details.loc[
    (taxa_details.mean_abund > 0.0001) & (taxa_details.incidence > 0.05)
].index

# Only the rows that received a residual are used.
_X = _data.loc[resid.index, top_taxa]

# Choose the penalty by 10-fold CV and report it.
lasso_cv = LassoCV(cv=10)
lasso_cv.fit(_X, resid)
print(lasso_cv.alpha_)

# Out-of-fold predictions at the chosen penalty, compared with the residuals.
lasso = Lasso(alpha=lasso_cv.alpha_)
pred = cross_val_predict(lasso, _X, resid, cv=10)
plt.scatter(pred, resid)
print(sp.stats.spearmanr(pred, resid))

# Record the fitted coefficients in `result`, aligned on taxon id.
_coef = pd.Series(lasso_cv.coef_, index=top_taxa, name='coef')
result[feat] = _coef

# Taxa with a non-zero coefficient, largest first.
(
    taxonomy
    .join(_coef)
    .dropna(subset=['coef'])
    .sort_values('coef', ascending=False)
    .loc[lambda x: x.coef != 0]
)

In [None]:
# LASSO of the acetate residuals on taxon abundances.
feat = 'acetate'

# NOTE(review): rows are filtered on 'butyrate' even though `feat` is a
# different column -- confirm this is intended.
_data = (
    meta
    .join(adata)                             # estimated absolute abundance, not relative
    .dropna(subset=['Otu0001', 'butyrate'])
    # .drop('JLc0836')                       # would drop the outlier sample
    .sample(frac=1.0)                        # shuffle rows for CV purposes
)

# Regress log-transformed `feat` on the full treatment/sex/site design and
# keep the residuals as the LASSO response.
_formula = 'np.log(raise_low({})) ~ treatment * sex * site'.format(feat)
fit = sm.ols(_formula, data=_data).fit()
resid = fit.resid

# Candidate predictors: taxa above the abundance and incidence thresholds.
top_taxa = taxa_details.loc[
    (taxa_details.mean_abund > 0.0001) & (taxa_details.incidence > 0.05)
].index

# Only the rows that received a residual are used.
_X = _data.loc[resid.index, top_taxa]

# Choose the penalty by 10-fold CV and report it.
lasso_cv = LassoCV(cv=10)
lasso_cv.fit(_X, resid)
print(lasso_cv.alpha_)

# Out-of-fold predictions at the chosen penalty, compared with the residuals.
lasso = Lasso(alpha=lasso_cv.alpha_)
pred = cross_val_predict(lasso, _X, resid, cv=10)
plt.scatter(pred, resid)
print(sp.stats.spearmanr(pred, resid))

# Record the fitted coefficients in `result`, aligned on taxon id.
_coef = pd.Series(lasso_cv.coef_, index=top_taxa, name='coef')
result[feat] = _coef

# Taxa with a non-zero coefficient, largest first.
(
    taxonomy
    .join(_coef)
    .dropna(subset=['coef'])
    .sort_values('coef', ascending=False)
    .loc[lambda x: x.coef != 0]
)

# Export

In [None]:
# Export the per-OTU table to build/otu_details.xlsx with header comments,
# number formats and conditional formatting.
result['log_mean_abund'] = np.log10(result.mean_abund)

output_columns = ['incidence', 'mean_abund', 'log_mean_abund',
        'treatment_effect', 'treatment_padj',
        'sex_effect', 'sex_padj',
        'interact_effect', 'interact_padj',
        'propionate', 'butyrate', 'acetate', 'lactate',
        'phylum', 'class', 'order', 'family', 'genus']

# The propionate column is coloured with a two-colour scale that assumes no
# negative coefficients.  Check that before the writer is opened so a failed
# check does not leave an unclosed writer or a half-formatted file behind.
assert result['propionate'].min() >= 0

# Sheet layout: row 1 is the header and column A the index, so the data
# occupy rows 2 .. len(result) + 1 and output_columns start at column B.
last_row = len(result) + 1


def _column_range(letter):
    """A1-style range covering the header and every data row of one column."""
    return '{0}1:{0}{1}'.format(letter, last_row)


# Icon set used for the DESeq2 adjusted p-value columns.
pvalue_fmt = {'type': 'icon_set',
     'icon_style': '3_traffic_lights',
     'icons': [{'criteria': '<', 'type': 'number', 'value': 0.05},
               {'criteria': '<', 'type': 'number', 'value': 0.001}]}

# Diverging colour scale used for the effect-size columns.
effect_fmt = {'type': '3_color_scale',
              'min_color': '#7787d6',
              'min_type': 'percentile',
              'min_value': 0,
              'mid_color': '#ffffff',
              'mid_type': 'percentile',
              'mid_value': 50,
              'max_color': '#ffe46d',
              'max_type': 'percentile',
              'max_value': 100,
              }

# One-sided colour scale for propionate (since minimum is 0).
propionate_fmt = {'type': '2_color_scale',
                  'min_color': '#ffffff',
                  'min_type': 'percentile',
                  'min_value': 0,
                  'max_color': '#ffe46d',
                  'max_type': 'percentile',
                  'max_value': 100,
                  }

# The context manager closes (and thereby writes) the workbook even if one of
# the formatting calls raises; it replaces the explicit writer.save() call.
with pd.ExcelWriter('build/otu_details.xlsx', engine='xlsxwriter') as writer:
    # Convert the dataframe to an XlsxWriter Excel object.
    result[output_columns].to_excel(writer, sheet_name='Sheet1')

    # Get the xlsxwriter workbook and worksheet objects.
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']

    # Add comments to column headers
    worksheet.write_comment('B1', 'Fraction of all samples with at least one read assigned to the OTU')
    worksheet.write_comment('C1', 'Mean relative abundance')
    worksheet.write_comment('D1', 'Log-base-10 of mean relative abundance')
    worksheet.write_comment('E1', 'Coefficient on treatment term in DESeq2 analysis')
    worksheet.write_comment('G1', 'Coefficient on sex term in DESeq2 analysis')
    worksheet.write_comment('I1', 'Coefficient on sex-by-treatment term in DESeq2 analysis')
    worksheet.write_comment('K1', 'Coefficient on OTU term in propionate LASSO analysis')
    worksheet.write_comment('L1', 'Coefficient on OTU term in butyrate LASSO analysis')
    worksheet.write_comment('M1', 'Coefficient on OTU term in acetate LASSO analysis')
    worksheet.write_comment('N1', 'Coefficient on OTU term in lactate LASSO analysis')

    # Format incidence column
    worksheet.set_column('B:B', None, workbook.add_format({'num_format': '0%'}))
    worksheet.conditional_format(_column_range('B'),
                                 {'type': 'data_bar', 'min_value': 0, 'max_value': 1,
                                  'bar_color': '#555555'})

    # Format mean abundance column
    worksheet.set_column('C:C', None, workbook.add_format({'num_format': '0.0E+0'}))

    # Format transformed mean abundance column
    worksheet.set_column('D:D', None, workbook.add_format({'num_format': '0.0'}))
    worksheet.conditional_format(_column_range('D'),
                                 {'type': '2_color_scale',
                                  'min_color': '#ffffff', 'max_color': '#ffe46d'})

    # Format DESeq2 coefficient columns
    deseq_effect_fmt = workbook.add_format({'num_format': '0.0'})
    for _letter in ('E', 'G', 'I'):
        worksheet.set_column('{0}:{0}'.format(_letter), None, deseq_effect_fmt)

    # Format LASSO coefficient columns
    lasso_effect_fmt = workbook.add_format({'num_format': '0.000000'})
    worksheet.set_column('K:N', None, lasso_effect_fmt)

    # Format DESeq2 p-values
    for _letter in ('F', 'H', 'J'):
        worksheet.conditional_format(_column_range(_letter), pvalue_fmt)
    # Narrow the cell width
    for _letter in ('F', 'H', 'J'):
        worksheet.set_column('{0}:{0}'.format(_letter), 2)

    # Format effect sizes (propionate, column K, is handled separately below)
    for _letter in ('E', 'G', 'I', 'L', 'M', 'N'):
        worksheet.conditional_format(_column_range(_letter), effect_fmt)

    # Special formatting for propionate (since minimum is 0)
    worksheet.conditional_format(_column_range('K'), propionate_fmt)

    # Filter drop-downs over the header row, spanning the index column plus
    # every exported column and all data rows.
    worksheet.autofilter(0, 0, *result[output_columns].shape)