# SoupX

In [16]:
import numpy as np
from numpy.random import multinomial, dirichlet

Parameters: G = # of genes, C = # of cells, N_mean = average UMI/cell, rho = contamination fraction

In [17]:
G = 50
C = 100
N_mean = 2000
rho = 0.1

Step 1: Generate ground truth endogenous profiles, each cell has its own gene expression profile (Dirichlet distribution)

In [18]:
pi_true = dirichlet(alpha=np.ones(G)*0.5, size=C)  # shape C x G
N_c = np.random.poisson(N_mean, size=C)

Step 2: Define ambient background profile

In [19]:
b = dirichlet(alpha=np.linspace(2, 0.5, G))  # biased ambient profile (biased towards first few genes)

Step 3: Generate observed counts according to SoupX model

In [20]:
n_obs = []
for c in range(C):
    # mixture of endogenous and ambient
    theta_c = (1 - rho) * pi_true[c] + rho * b
    counts = multinomial(N_c[c], theta_c)
    n_obs.append(counts)
n_obs = np.array(n_obs)

Step 4: Ground truth endogenous counts (for comparison)

In [21]:
m_true = np.array([multinomial(N_c[c], pi_true[c]) for c in range(C)])

Step 5: Naïve subtraction correction (SoupX Eq. 5 style)

In [22]:
m_est_sub = np.maximum(0, n_obs - (N_c[:, None] * rho * b))

Step 6: Likelihood-based correction (approximation)

In [23]:
pi_est = np.maximum(0, (n_obs / N_c[:, None]) - rho * b)
pi_est = pi_est / pi_est.sum(axis=1, keepdims=True)  # renormalize
m_est_like = (1 - rho) * N_c[:, None] * pi_est

Compare results

In [24]:
def mse(A, B):
    return ((A - B)**2).mean()

print("MSE (naïve subtraction):", mse(m_true, m_est_sub))
print("MSE (likelihood approx):", mse(m_true, m_est_like))

MSE (naïve subtraction): 119.84370556966155
MSE (likelihood approx): 122.35599340714147
