In [1]:
import numpy as np
import scipy as sp
import pandas as pd
from sklearn import linear_model


### meta-Poisson model
For meta cells $i=1,...,M$, let $S_i$ be the set of cells for meta cell $i$:
- Average cell state-specific ATAC: $\theta_i \sim N(0,0.25)$ (set min=-3 and max=3 for stability)
 
 For cells $c \in S_i$: 
 - True ATAC state: $\lambda_c \sim LogNormal(\theta_i, \sigma_\lambda^2)$ (to make sure $\lambda_c$ is positive; values of $\sigma_\lambda^2$ will be varied)
 - ATAC: $X_c \sim Poi(\lambda_c)$
 - RNA: $Y_c \sim Poi(\exp(\lambda_c b + N(0, 0.01)))$

Observation: 
- When $\sigma_\lambda^2$ is small, performance is good
- When $\sigma_\lambda^2$ is big, performance is bad

### method 5: Bayesian estimate of $\lambda_c$
- method 4 estimate $\lambda_c$ as $\frac{1}{|S_i|} \sum_{c' \in S_i} X_{c'}$. This may be problematic when 
$\sigma_\lambda^2$ is large. Therefore, for method 5, I propose an Bayesian estimator $E[\lambda_c | X_c]$.

Model: for a given meta cell $i$, and for $c \in S_i$
- $\lambda_c \sim Gamma(\alpha_i, \beta_i)
- $X_c \sim Poi(\lambda_c)$

Fact: $\lambda_c | X_c \sim Gamma (X_c + \alpha_i, \beta_i + 1)$. Thus, we can estimate
$\hat{\lambda}_c = \frac{X_c + \alpha_i}{\beta_i + 1}$. 

Step 1: for each meta cell, estimate $(\alpha_i, \beta_i)$.
- Let $E[X_c], var[X_c]$ be computed across all cells in meta-cell $i$.
- Since $E[X_c]=\frac{\alpha_i}{\beta_i}$ and $var[X_c] = \frac{\alpha_i}{\beta_i} (1+\frac{1}{\beta_i})$.
- We can estimate $\hat{\alpha}_i=\frac{E[X_c]^2}{var[X_c]-E[X_c]}$, $\hat{\beta}_i=\frac{E[X_c]}{var[X_c]-E[X_c]}$.

Step 2: 
- Thus, $\hat{\lambda}_c = \frac{X_c + \hat{\alpha}_i}{\hat{\beta}_i + 1} = \frac{X_c (var[X_c] - E[X_c]) + E[X_c]^2}{var[X_c]}$. 

Details: 
- Since $var[X_c]$ is supposed to be greater than $E[X_c]$, we set $var[X_c] = max(var[X_c] , E[X_c])$
- Let $w_1 = \frac{var[X_c] - E[X_c]}{var[X_c]}$ and $w_2 = \frac{E[X_c]}{var[X_c]}$
- Then $w_1+w_2=1$ and $\hat{\lambda}_c = w_1 X_cw_E[X_c]^2}{var[X_c]}$.

In [12]:
n_meta = 1000
n_cell_per_meta = 25
n_cell = n_meta * n_cell_per_meta
n_rep = 25

# for coef_b in [0, 1]:
for coef_b in [1]:
    for sigma_lambda in [0.01, 0.1, 0.2, 0.5]:
        print('====================================================')
        print('coef_b=%0.1f, sigma_lambda = %0.2e' % (coef_b, sigma_lambda))
        print('====================================================')

        dic_data = {}
        for i_rep in range(n_rep):
            np.random.seed(i_rep)
            v_theta = (np.random.randn(n_meta) * 0.5).clip(min=-3, max=3)
            v_lambda = np.zeros(n_meta*n_cell_per_meta)
            for i in range(n_meta):
                temp_v = np.exp(np.random.randn(n_cell_per_meta) * sigma_lambda + v_theta[i])
                v_lambda[i*n_cell_per_meta : (i+1)*n_cell_per_meta] = temp_v

            v_x = np.random.poisson(v_lambda)
            v_y = np.random.poisson(np.exp(v_lambda*coef_b + np.random.randn(n_cell)*0.1))
            dic_data[i_rep] = {
                'v_lambda' : v_lambda,
                'v_x' : v_x,
                'v_y' : v_y
            }

        # method 1: Poisson regression of v_y against v_x
        v_beta_hat = []
        for i_rep in range(n_rep):
            v_lambda = dic_data[i_rep]['v_lambda']
            v_x = dic_data[i_rep]['v_x']
            v_y = dic_data[i_rep]['v_y']
            clf = linear_model.PoissonRegressor()
            res = clf.fit(v_x.reshape([n_cell, 1]), v_y)
            v_beta_hat.append(clf.coef_[0])
        print('method 1: Poisson regression of v_y against v_x')
        beta_mean = np.mean(v_beta_hat)
        beta_se = np.std(v_beta_hat) / np.sqrt(n_rep)
        print('beta_mean=%0.3f, beta_se=%0.3f, beta_z=%0.3f' % (beta_mean, beta_se, beta_mean/beta_se))
        print('')

        # method 2 (oracle): Poisson regression of v_y against v_lambda
        v_beta_hat = []
        for i_rep in range(n_rep):
            v_lambda = dic_data[i_rep]['v_lambda']
            v_x = dic_data[i_rep]['v_x']
            v_y = dic_data[i_rep]['v_y']
            clf = linear_model.PoissonRegressor()
            res = clf.fit(v_lambda.reshape([n_cell, 1]), v_y)
            v_beta_hat.append(clf.coef_[0])
        print('method 2 (oracle): Poisson regression of v_y against v_lambda')
        beta_mean = np.mean(v_beta_hat)
        beta_se = np.std(v_beta_hat) / np.sqrt(n_rep)
        print('beta_mean=%0.3f, beta_se=%0.3f, beta_z=%0.3f' % (beta_mean, beta_se, beta_mean/beta_se))
        print('')

        # method 3 (oracle): Poisson regression of v_y against v_lambda_bar (meta_cell_averaged v_lambda)
        v_beta_hat = []
        for i_rep in range(n_rep):
            v_lambda = dic_data[i_rep]['v_lambda']
            v_x = dic_data[i_rep]['v_x']
            v_y = dic_data[i_rep]['v_y']
            v_lambda_bar = np.zeros(n_cell)
            for i in range(n_meta):
                v_lambda_bar[i*n_cell_per_meta : (i+1)*n_cell_per_meta] = \
                    np.mean(v_lambda[i*n_cell_per_meta : (i+1)*n_cell_per_meta])
            clf = linear_model.PoissonRegressor()
            res = clf.fit(v_lambda_bar.reshape([n_cell, 1]), v_y)
            v_beta_hat.append(clf.coef_[0])
        print('method 3 (oracle): Poisson regression of v_y against v_lambda_bar'
              ' (meta_cell_averaged v_lambda)')
        beta_mean = np.mean(v_beta_hat)
        beta_se = np.std(v_beta_hat) / np.sqrt(n_rep)
        print('beta_mean=%0.3f, beta_se=%0.3f, beta_z=%0.3f' % (beta_mean, beta_se, beta_mean/beta_se))
        print('')

        # method 4: Poisson regression of v_y against v_x_bar (meta_cell_averaged v_x)
        v_beta_hat = []
        for i_rep in range(n_rep):
            v_lambda = dic_data[i_rep]['v_lambda']
            v_x = dic_data[i_rep]['v_x']
            v_y = dic_data[i_rep]['v_y']
            v_x_bar = np.zeros(n_cell)
            for i in range(n_meta):
                v_x_bar[i*n_cell_per_meta : (i+1)*n_cell_per_meta] = \
                    np.mean(v_x[i*n_cell_per_meta : (i+1)*n_cell_per_meta])
            clf = linear_model.PoissonRegressor()
            res = clf.fit(v_x_bar.reshape([n_cell, 1]), v_y)
            v_beta_hat.append(clf.coef_[0])
        print('method 4: Poisson regression of v_y against v_x_bar (meta_cell_averaged v_x)')
        beta_mean = np.mean(v_beta_hat)
        beta_se = np.std(v_beta_hat) / np.sqrt(n_rep)
        print('beta_mean=%0.3f, beta_se=%0.3f, beta_z=%0.3f' % (beta_mean, beta_se, beta_mean/beta_se))
        print('')
        
        # method 5: posterior estimate of X_c 
        v_beta_hat = []
        for i_rep in range(n_rep):
            v_lambda = dic_data[i_rep]['v_lambda']
            v_x = dic_data[i_rep]['v_x']
            v_y = dic_data[i_rep]['v_y']
            v_x_hat = np.zeros(n_cell)
            for i in range(n_meta):
                temp_mean = np.mean(v_x[i*n_cell_per_meta : (i+1)*n_cell_per_meta]).clip(min=1e-6)
                temp_var = max( np.var(v_x[i*n_cell_per_meta : (i+1)*n_cell_per_meta]), temp_mean)
                w1 = (temp_var - temp_mean) / temp_var
                w2 = temp_mean / temp_var
                v_x_hat[i*n_cell_per_meta : (i+1)*n_cell_per_meta] = \
                    v_x[i*n_cell_per_meta : (i+1)*n_cell_per_meta] * w1 + temp_mean * w2
            clf = linear_model.PoissonRegressor()
            res = clf.fit(v_x_hat.reshape([n_cell, 1]), v_y)
            v_beta_hat.append(clf.coef_[0])
        print('method 5: posterior estimate of X_c ')
        beta_mean = np.mean(v_beta_hat)
        beta_se = np.std(v_beta_hat) / np.sqrt(n_rep)
        print('beta_mean=%0.3f, beta_se=%0.3f, beta_z=%0.3f' % (beta_mean, beta_se, beta_mean/beta_se))
        print('')

coef_b=1.0, sigma_lambda = 1.00e-02
method 1: Poisson regression of v_y against v_x
beta_mean=0.342, beta_se=0.021, beta_z=16.535

method 2 (oracle): Poisson regression of v_y against v_lambda
beta_mean=0.830, beta_se=0.019, beta_z=44.023

method 3 (oracle): Poisson regression of v_y against v_lambda_bar (meta_cell_averaged v_lambda)
beta_mean=0.830, beta_se=0.019, beta_z=43.955

method 4: Poisson regression of v_y against v_x_bar (meta_cell_averaged v_x)
beta_mean=0.793, beta_se=0.022, beta_z=35.355

method 5: posterior estimate of X_c 
beta_mean=0.752, beta_se=0.022, beta_z=34.448

coef_b=1.0, sigma_lambda = 1.00e-01
method 1: Poisson regression of v_y against v_x
beta_mean=0.366, beta_se=0.021, beta_z=17.542

method 2 (oracle): Poisson regression of v_y against v_lambda
beta_mean=0.858, beta_se=0.016, beta_z=52.325

method 3 (oracle): Poisson regression of v_y against v_lambda_bar (meta_cell_averaged v_lambda)
beta_mean=0.865, beta_se=0.020, beta_z=44.212

method 4: Poisson regressi

method 5: posterior estimate of X_c 
beta_mean=0.750, beta_se=0.021, beta_z=35.489

