# Notebook Setup

In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import sqlite3
import seaborn as sns
import patsy
from sklearn.decomposition import PCA
from lifelines import KaplanMeierFitter
from matplotlib.ticker import StrMethodFormatter
from statsmodels.stats.multitest import fdrcorrection

import rpy2.ipython
%load_ext rpy2.ipython.rmagic

from scripts.lib.stats import raise_low, lrt_phreg, phreg_aic
from scripts.lib.plotting import boxplot_with_points, load_style

In [None]:
%%R

library('survival')
library('survminer')

# Standardize a numeric vector to zero mean and unit standard deviation, so that
# model coefficients are expressed per one standard deviation of the covariate.
# Missing values are ignored when computing the mean and sd (and remain NA in the
# result); without na.rm a single NA would turn the entire vector into NA.
stnd = function(x){
    return((x - mean(x, na.rm=TRUE)) / sd(x, na.rm=TRUE))
    }

In [None]:
loaded_style = load_style('paper')

# Unpack the entries of the 'paper' style that this notebook uses into
# notebook-level names, in the same order as the keys listed below.
(color_map, mark_map, assign_significance_symbol,
 savefig, fullwidth, halfwidth) = (
    loaded_style[style_key]
    for style_key in ('color_map', 'mark_map', 'assign_significance_symbol',
                      'savefig', 'fullwidth', 'halfwidth')
)

In [None]:
from scripts.lib.data import load_data
# Load the project's result tables (presumably a SQLite database, given the path).
loaded_data = load_data('res/C2013.results.db')
# Inject every entry of `loaded_data` into the notebook namespace.
# NOTE(review): later cells use names such as `mouse` and `conc` that are not
# defined anywhere else in this notebook, so they presumably come from here.
gl = globals()
gl.update(loaded_data)

# Show which names were just made available.
print(loaded_data.keys())

# Comparison to the full population.

# Null model

In [None]:
# Analysis table: one row per C2013 control/acarbose mouse that has a butyrate
# measurement and known collection and death/censor ages.
# NOTE(review): 'TJL' is admitted by the site filter here, but the R cells only
# declare the levels 'UT' and 'UM' for site -- confirm whether TJL mice are meant
# to be part of the analysis.
data = (
    mouse
    .loc[lambda m: m.cohort.isin(['C2013'])
                   & m.treatment.isin(['control', 'acarbose'])
                   & m.site.isin(['UM', 'UT', 'TJL'])]
    .join(conc)
    .dropna(subset=['age_at_death_or_censor', 'age_at_collect', 'butyrate'])
    [['cohort', 'sex', 'treatment', 'site',
      'age_at_collect', 'age_at_death_or_censor', 'dead',
      'propionate', 'butyrate', 'acetate']]
    # .sample(frac=1, replace=True)  # If I want to check a few bootstrap samples...
    # The survival routines expect a 0/1 integer event indicator.
    .assign(dead=lambda m: m.dead.astype(int))
)

# Number of mice with a known death/censor age in each design cell.
# (`.count()` skips missing values; the previous `notnull().count()` counted
# every row because a boolean mask never contains missing values.)
data.groupby(['cohort', 'site', 'sex', 'treatment']).age_at_death_or_censor.count()

In [None]:
%%R -i data

# Fix the reference levels so coefficients are contrasts against
# control / male / UT.
data$treatment = factor(data$treatment, levels=c('control', 'acarbose'))
data$sex = factor(data$sex, levels=c('male', 'female'))
# NOTE(review): the Python-side filter also admits site 'TJL'; any such rows become
# NA here and would be dropped from the fits by R's default NA handling -- confirm
# that this is intended.
data$site = factor(data$site, levels=c('UT', 'UM'))

# Null (design-only) Cox model in counting-process form: each mouse enters the
# risk set at age_at_collect and leaves at age_at_death_or_censor, with `dead`
# as the event indicator.
fit1 = coxph(Surv(age_at_collect, age_at_death_or_censor, data$dead) ~
                 treatment * sex * site,
            data=data)

print(fit1)
print(anova(fit1))

The survival data we collected trends towards many of the known features of survival in the ITP study.
-   Acarbose trends towards increasing longevity in both male and female mice (less so in the latter).
-   Female control mice live longer than male mice at UT, but not at UM. (In our control sample, male mice at UM live longer than their female counterparts.)
-   Control, male mice live longer at UM than at UT (this effect is significant).

The effects of treatment, sex, and site are all significant or nearly so.

# SCFAs as predictors

## Analysis of individual SCFAs

In [None]:
%%R

# Design model plus standardized propionate: the propionate coefficient is the
# log hazard ratio per one standard deviation of concentration.
fit_propionate = coxph(Surv(age_at_collect, age_at_death_or_censor, data$dead) ~
                 stnd(propionate)
                 + treatment * sex * site,
            data=data)

summary(fit_propionate)

In [None]:
%%R

# Design model plus standardized butyrate: the butyrate coefficient is the
# log hazard ratio per one standard deviation of concentration.
fit_butyrate = coxph(Surv(age_at_collect, age_at_death_or_censor, data$dead) ~
                 stnd(butyrate)
                 + treatment * sex * site,
            data=data)

summary(fit_butyrate)

In [None]:
%%R

# Design model plus standardized acetate: the acetate coefficient is the
# log hazard ratio per one standard deviation of concentration.
fit_acetate = coxph(Surv(age_at_collect, age_at_death_or_censor, data$dead) ~
                 stnd(acetate)
                 + treatment * sex * site,
            data=data)

summary(fit_acetate)

## Joint SCFA Analysis

In [None]:
%%R

# Joint model: all three standardized SCFAs together with the full
# treatment x sex x site design used in fit1.
fit2 = coxph(Surv(age_at_collect, age_at_death_or_censor, data$dead) ~
                 stnd(propionate) + stnd(butyrate) + stnd(acetate)
                 + treatment * sex * site,
            data=data)

summary(fit2)

Notice that the acarbose effect disappeared (consistent with an effect mediated by SCFAs).

In [None]:
%%R

# Analysis-of-deviance table for the joint model; terms are added in the order
# in which they appear in the formula.
anova(fit2)

In [None]:
%%R

# Likelihood-ratio comparison of the design-only model (fit1) against the
# model that additionally contains the three SCFAs (fit2).
anova(fit1, fit2)

Including SCFA information improves the fit of the model by a likelihood ratio test.

In [None]:
%%R

# Can we make a better null model and then compare the SCFA-included model to that one?
# What about if we use pseudo-iterative regression to make both models as good as possible?

# Reduced design model: site-by-sex interaction plus a treatment main effect.
fit_minimal_design = coxph(Surv(age_at_collect, age_at_death_or_censor, data$dead) ~
                 site * sex + treatment,
                    data=data)

# Same site-by-sex terms, but with the three standardized SCFAs in place of the
# treatment term (treatment is deliberately absent from this formula).
fit_minimal_scfa = coxph(Surv(age_at_collect, age_at_death_or_censor, data$dead) ~
                 stnd(propionate) + stnd(butyrate) + stnd(acetate) + site * sex,
                    data=data)

# The two models are not nested, so they are compared by AIC.
AIC(fit_minimal_design, fit_minimal_scfa)

### Check Model Assumptions

In [None]:
%%R

# Test the proportional-hazards assumption for each term of fit2 (plus a global
# test), then plot the residual-versus-time diagnostics for each term.
phtest2 = cox.zph(fit2, global=TRUE)
print(phtest2)
plot(phtest2)

None of the design parameters show evidence of deviation from
proportionality assumptions.

In [None]:
%%R
# Deviance residuals of fit2 plotted against the linear predictor
# (no smoothing line).
fit2_coxdiag_plots = ggcoxdiagnostics(fit2, type='deviance', linear.predictions=TRUE, sline=FALSE)

# Write the same figure as PNG and as PDF.
png('fig/phreg_residuals.png', width=4, height=4, units='in', res=72)
print(fit2_coxdiag_plots)
dev.off()
pdf('fig/phreg_residuals.pdf', width=4, height=4)
print(fit2_coxdiag_plots)
dev.off()

# Also render the figure inline in the notebook.
print(fit2_coxdiag_plots)


Little evidence of deviations from the linearity assumption.

### Check Extreme Points

There are two individuals for which longevity predictions are extreme (left-most and right-most points).
We can try to remove these points to check that they aren't overly influential.

In [None]:
%%R

# Row positions (within `data`) of the mice with the smallest and largest
# linear predictor under fit2.
linear_predictor = predict(fit2)
extreme_in_fit = c(which.min(linear_predictor), which.max(linear_predictor))

# coxph drops rows with missing values and records their positions in
# fit2$na.action, so a position within the predictions is not necessarily the
# same row position in `data`. Map back through the rows that were kept; when
# nothing was dropped this is the identity mapping.
kept_rows = setdiff(seq_len(nrow(data)), as.integer(fit2$na.action))
stopifnot(length(kept_rows) == length(linear_predictor))

extreme_predictions = kept_rows[extreme_in_fit]
names(extreme_predictions) = names(extreme_in_fit)
print(extreme_predictions)

In [None]:
%%R

# Sensitivity analysis: refit the joint SCFA model without the two mice that
# have the most extreme linear predictors.
data1 = data[-extreme_predictions,]

# The SCFAs are standardized exactly as in fit2 so that the coefficients are on
# the same per-standard-deviation scale and can be compared with fit2 directly.
# (Rescaling a covariate does not change its z statistic or p-value.)
fit3 = coxph(Surv(age_at_collect, age_at_death_or_censor, data1$dead) ~
                 stnd(propionate) + stnd(butyrate) + stnd(acetate)
                 + treatment * sex * site,
            data=data1)

print(fit3)

While the p-value for the acetate coefficient is no longer
less than 0.05, all coefficients are nearly identical.

 (Plots of deviance residuals follow.)

In [None]:
%%R

# Deviance residuals against the linear predictor for the model refitted
# without the two extreme mice (fit3).
ggcoxdiagnostics(fit3, type='deviance', linear.predictions=TRUE)

No evidence of deviation from linearity assumptions, nor highly influential outliers.

### Log-transformed SCFAs

Potentially lessens the effects of (not-demonstrated) non-linearity or outliers.

In [None]:
%%R

# Same design as fit2, but with log-transformed instead of standardized SCFA
# concentrations. NOTE(review): log() requires strictly positive concentrations
# -- confirm that there are no zero measurements.
fit4 = coxph(Surv(age_at_collect, age_at_death_or_censor, data$dead) ~
                 log(propionate) + log(butyrate) + log(acetate)
                 + treatment * sex * site,
            data=data)

print(fit4)

Log-transformed SCFAs aren't predictive (except propionate)

### PCA of SCFAs

In [None]:
%%R

# Principal components of the three standardized SCFA concentrations.
pca_fit = princomp(~ stnd(propionate) + stnd(butyrate) + stnd(acetate), data=data)
# Append the per-mouse component scores to the analysis table; the next model
# refers to them as Comp.1, Comp.2 and Comp.3.
data2 = cbind(data, predict(pca_fit))

print(loadings(pca_fit))
plot(pca_fit)

It would appear that the concentrations of SCFAs can be described with
-   a high SCFAs component (-PC1),
-   a high propionate + lower everything else component (PC2),
-   and a high butyrate + low acetate component (-PC3).

In [None]:
%%R

# Design model with the three SCFA principal-component scores as covariates
# instead of the individual SCFA concentrations.
fit1_pca = coxph(Surv(age_at_collect, age_at_death_or_censor, data2$dead) ~
                 Comp.1 + Comp.2 + Comp.3
                 + treatment * sex * site,
            data=data2)

print(fit1_pca)

The high total SCFA component and the high acetate component are associated with
decreased longevity, while the high propionate component is associated with increased
survival.

Is this a worthwhile analysis?  Probably not.

# Effect Size in Context?

In [None]:
# Worked example: predicted survival curves for two hand-picked acarbose-treated
# UM males, next to the SCFA profiles of all mice in that subgroup.
d = data[data.sex.isin(['male']) & data.treatment.isin(['acarbose']) & data.site.isin(['UM'])]
mouseA, colorA = 'UM45080', 'darkred'
mouseB, colorB = 'UM46249', 'darkblue'

fig, axs = plt.subplots(1, 2, figsize=(6, 2.5))

# Right panel: butyrate vs. propionate, shaded by acetate, with the two example
# mice circled in their curve colors.
art = axs[1].scatter('butyrate', 'propionate', c='acetate', data=d, edgecolor='black', linewidth=0.5, cmap='gray_r')
fig.colorbar(art, label='acetate')
axs[1].set_xlabel('butyrate')
axs[1].set_ylabel('propionate')
axs[1].scatter([d.butyrate[mouseA]], [d.propionate[mouseA]], marker='o',
               color=colorA, s=100, lw=0.75, label='Mouse A', facecolors='none')
axs[1].scatter([d.butyrate[mouseB]], [d.propionate[mouseB]], marker='o',
               color=colorB, s=100, lw=0.75, label='Mouse B', facecolors='none')

d_subset_data = d.loc[[mouseA, mouseB]]

# Proportional-hazards model fitted on the full analysis table: the complete
# treatment x sex x site design plus the three raw SCFA concentrations.
en, ex = patsy.dmatrices(('age_at_death_or_censor ~ 1'
                               ' + C(treatment, Treatment("control"))'
                               ' * C(sex, Treatment("female"))'
                               ' * C(site, Treatment("UM"))'
                               ' + propionate '
                               ' + butyrate'
                               ' + acetate'
                              ),
                              data=data, NA_action='raise')
endog_di, exog_di = en.design_info, ex.design_info
# Rebuild the design matrix as a DataFrame and drop the intercept column, which
# a Cox model does not use (the baseline hazard takes its place).
ex = patsy.build_design_matrices([exog_di], data=data,
                                 return_type='dataframe',
                                 NA_action='raise')[0].drop('Intercept', axis='columns')

# `entry` gives the left-truncation time: a mouse is only at risk from the age
# at which its sample was collected.
fit = sm.PHReg(en, ex, status=data.dead, entry=data.age_at_collect).fit()

# Encode the two example mice with the same design information as the fit.
design = patsy.build_design_matrices([exog_di], data=d_subset_data,
                                     return_type='dataframe',
                                     NA_action='raise')[0].drop('Intercept', axis='columns')

# Predict once and reuse the result. Row 0 is mouse A and row 1 is mouse B,
# following the order used to build `d_subset_data`.
prediction = fit.predict(exog=design)

mouseA_logHR = prediction.predicted_values[0]
# One-standard-error offsets of the log hazard ratio (not currently plotted).
# NOTE(review): "_min" is logHR + SE and "_max" is logHR - SE; presumably they
# are named for the survival curve they would bound -- confirm before use.
mouseA_logHR_min = mouseA_logHR + prediction.standard_errors[0]
mouseA_logHR_max = mouseA_logHR - prediction.standard_errors[0]

mouseB_logHR = prediction.predicted_values[1]
mouseB_logHR_min = mouseB_logHR + prediction.standard_errors[1]
mouseB_logHR_max = mouseB_logHR - prediction.standard_errors[1]

# Despite the name, `hazards` holds the baseline *cumulative* hazard at each
# event time; the assertion below checks exactly that.
times, hazards, survival = fit.baseline_cumulative_hazard[0]
assert np.all(hazards == fit.baseline_cumulative_hazard_function[0](times))


def hazard_to_survival(hazard):
    """Convert a cumulative hazard H(t) into the survival function S(t) = exp(-H(t)).

    The input is already cumulative, so it must not be accumulated again;
    summing it a second time overstates the hazard and makes the survival
    curve fall far too quickly.
    """
    return np.exp(-hazard)


# Left panel: predicted survival for each example mouse. The baseline
# cumulative hazard is scaled by the mouse's hazard ratio, exp(log HR).
axs[0].plot(times, hazard_to_survival(hazards * np.exp(mouseA_logHR)),
            color=colorA, drawstyle='steps')
axs[0].plot(times, hazard_to_survival(hazards * np.exp(mouseB_logHR)),
            color=colorB, drawstyle='steps')
axs[0].set_xlabel('days')
axs[0].set_ylabel('survival')

# Panel letters just above the top-left corner of each axes.
for panel, ax in zip(['A', 'B'], axs):
    ax.annotate(panel, xy=(0.02, 1.03), xycoords='axes fraction', fontweight='heavy')

savefig(fig, 'fig/survival_predict')

In [None]:
# Hazard ratio of mouse A relative to mouse B (difference of log hazard ratios,
# exponentiated).
np.exp(mouseA_logHR - mouseB_logHR)

In [None]:
# SCFA concentrations and ages of the two example mice, for reference.
d_subset_data[['butyrate', 'propionate', 'acetate', 'age_at_collect', 'age_at_death_or_censor']]

In [None]:
# Approximate predicted median lifespan of each example mouse: the event time at
# which the predicted survival curve is closest to 0.5.
for log_hr in (mouseA_logHR, mouseB_logHR):
    predicted_survival = hazard_to_survival(hazards * np.exp(log_hr))
    closest_to_half = np.abs(predicted_survival - 0.5).argmin()
    print(times[closest_to_half])