# Iteration 0: Cell to cell type assignment

### This is a notebook demonstrating what happens under the hood in the cell to cell type step in pciSeq

In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy.special import softmax
import gdown

In [2]:
!pip install git+https://github.com/acycliq/pciSeq.git@dev_3d
import pciSeq

Collecting git+https://github.com/acycliq/pciSeq.git@dev_3d
  Cloning https://github.com/acycliq/pciSeq.git (to revision dev_3d) to /tmp/pip-req-build-u_w9b4vu
  Running command git clone --filter=blob:none --quiet https://github.com/acycliq/pciSeq.git /tmp/pip-req-build-u_w9b4vu
  Running command git checkout -b dev_3d --track origin/dev_3d
  Switched to a new branch 'dev_3d'
  Branch 'dev_3d' set up to track remote branch 'dev_3d' from 'origin'.
  Resolved https://github.com/acycliq/pciSeq.git to commit 31fd7688f1f848d9e59931636ce39479614ea23d
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [3]:
# Google Drive id of the pickle file loaded later in the notebook
file_id = "1wY1PUvAwk_S9MhKbCyB0HzH9oxNloP0M"
# Direct-download URL built from the id above
url = f"https://drive.google.com/uc?export=download&id={file_id}"
# Local filename the download is saved under
output = "pciSeq.pickle"

In [4]:
# Fetch the pickle from Google Drive and save it as `output`.
# NOTE: this is a large download (about 1.79 GB in the recorded run) and it is repeated on every run of the cell.
gdown.download(url, output, quiet=False)

Downloading...
From (original): https://drive.google.com/uc?export=download&id=1wY1PUvAwk_S9MhKbCyB0HzH9oxNloP0M
From (redirected): https://drive.google.com/uc?export=download&id=1wY1PUvAwk_S9MhKbCyB0HzH9oxNloP0M&confirm=t&uuid=5acfcca8-3289-4e0f-9685-c474690087ce
To: /content/pciSeq.pickle
100%|██████████| 1.79G/1.79G [00:18<00:00, 96.2MB/s]


'pciSeq.pickle'

In [5]:
# Load the downloaded object.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so only load a pickle from a trusted source.
# Presumably the pickle references pciSeq classes, which would be why pciSeq is imported beforehand — TODO confirm.
obj = pd.read_pickle(output)

In [6]:
# Label of the cell that is examined in the rest of the notebook
my_cell = 9382

In [7]:
# Set the hyperparameters
rSpot = 2.0               # r parameter of the negative binomial (passed as `r` to the loglikelihood)
SpotReg = 0.1             # Spot regularization: additive term applied to the scaled single cell data
inefficiency = 0.2        # Multiplicative factor applied to the single cell data

In [8]:
# The single cell data are the second source of information. To keep things simple
# only the mean expressions of the Oligo classes (plus the Zero class) are pulled
# here, since the cell under investigation looks like an Oligo.
my_class_definitions = obj.single_cell.mean_expression.loc[
    :, ['MFOL1', 'MFOL2', 'MOL1', 'MOL2', 'MOL3', 'Zero']
]
my_class_definitions

class_name,MFOL1,MFOL2,MOL1,MOL2,MOL3,Zero
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abi3bp,0.001086,0.000414,0.000344,0.000300,0.000000,0.0
Acly,0.913409,0.525880,0.378887,0.774448,0.926447,0.0
Adcyap1,0.001629,0.001656,0.000344,0.000499,0.001082,0.0
Adora2a,0.000000,0.002899,0.001721,0.001398,0.001082,0.0
Afp,0.000271,0.000000,0.000000,0.000400,0.000000,0.0
...,...,...,...,...,...,...
Vtn,0.006515,0.008282,0.005622,0.004995,0.008112,0.0
Wfs1,0.017372,0.028157,0.014917,0.027170,0.038940,0.0
Yjefn3,0.009501,0.031884,0.027768,0.025672,0.075176,0.0
Zcchc12,0.002986,0.011180,0.005393,0.006193,0.001082,0.0


### Part 1. Initialization

In [9]:
# The initial gene counts of a cell are built from the gene reads that fall inside
# the cell boundaries. For cell 9382 those gene reads are the spots listed below,
# i.e. the rows whose label equals the cell under study.
spots_9382 = obj.spots.data.loc[lambda spots: spots['label'] == my_cell]
spots_9382

Unnamed: 0_level_0,x,y,z,plane_id,label,gene_name,score
spot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
119596,5600.880859,2461.592529,123.219452,38,9382,Hsd11b1,0.7876
119601,5597.880859,2464.592529,126.433739,39,9382,Hsd11b1,0.886
126235,5596.880859,2448.592529,94.290878,29,9382,Maob,0.9536
139976,5619.880859,2433.592529,132.862305,41,9382,Plp1,0.867
140002,5613.880859,2447.592529,123.219452,38,9382,Plp1,0.8286
140004,5601.880859,2448.592529,123.219452,38,9382,Plp1,0.5073
140005,5606.880859,2448.592529,132.862305,41,9382,Plp1,0.9727
140029,5605.880859,2458.592529,120.005165,37,9382,Plp1,0.7896
145503,5619.880859,2438.592529,126.433739,39,9382,Qk,0.5947
148852,5612.880859,2434.592529,120.005165,37,9382,Rims4,0.9326


In [10]:
# Hence we aggregate per gene and we derive the gene counts:
# one entry per gene found among the cell's spots, valued by the number of its spots
cgc = spots_9382.groupby('gene_name').size()
cgc

Unnamed: 0_level_0,0
gene_name,Unnamed: 1_level_1
Hsd11b1,2
Maob,1
Plp1,5
Qk,1
Rims4,1


In [11]:
# The cell gene counts introduced above contain only genes with gene reads > 0.
# We need however the full gene panel: reindex on the panel and give a count of 0
# to every gene that has no reads in this cell.
# NOTE(review): reindex keeps only the labels present in obj.genes.gene_panel, so a
# gene read that is not part of the panel would be dropped silently.
cgc = cgc.reindex(obj.genes.gene_panel, fill_value=0)

In [12]:
# Note also that eta (the random variable that models the gene inefficiency) is initialised as a vector of ones with length
# equal to the number of genes:
eta_bar = np.ones(obj.nG)

### Part 2. Cell to cell type

In [13]:
def negative_binomial_loglikelihood(x: np.ndarray, r: float, p: np.ndarray) -> np.ndarray:
    """Negative binomial log-likelihood without the combinatorial term.

    Evaluates ``x * log(p) + r * log(1 - p)`` elementwise. The binomial
    coefficient of the full log-pmf is left out; it depends only on ``x`` and
    ``r``, so it cancels whenever two values of ``p`` are compared for the
    same counts.

    Parameters
    ----------
    x : np.ndarray
        Observed counts. Must be broadcastable against ``p``.
    r : float
        The r parameter of the negative binomial.
    p : np.ndarray
        Probability parameter, with values in [0, 1].

    Returns
    -------
    np.ndarray
        The elementwise log-likelihood, with the broadcast shape of ``x`` and ``p``.
    """
    # Imported locally so the function does not depend on names from another cell.
    from scipy.special import xlogy

    # xlogy(x, p) is x * log(p), except that it returns 0 wherever x == 0.
    # The plain product x * np.log(p) would give 0 * -inf = nan for a zero
    # count paired with p == 0.
    log_likelihood = xlogy(x, p) + r * np.log(1 - p)

    return log_likelihood

In [14]:
# Prepare the inputs for the negative binomial.
# The class expressions are scaled by the inefficiency and by eta (one value per gene,
# broadcast across the class columns) and then shifted by the spot regularization.
ScaledExp = my_class_definitions * inefficiency * eta_bar[:, None] + SpotReg
# Probability parameter that is handed to the loglikelihood
p = ScaledExp.div(ScaledExp + rSpot)

In [15]:
# evaluate the loglikelihood: one row per gene, one column per class.
# cgc.values[:, None] turns the counts into a column so that they broadcast across the columns of p.
# NOTE(review): the counts and p are combined by position, which assumes that the rows of
# my_class_definitions follow the same gene order as obj.genes.gene_panel — TODO confirm.
oligo_contr = negative_binomial_loglikelihood(cgc.values[:, None], rSpot, p.values)
# Summing over the genes (axis 0) gives one total per class
oligo_contr.sum(axis=0)

array([-68.75759028, -79.63132499, -74.07906727, -70.58419765,
       -80.69293882, -61.08544748])

In [16]:
# Some housekeeping: results look better in a dataframe, with the genes as row labels
# and the classes as column labels
oligo_contr_df = pd.DataFrame(
    oligo_contr,
    index=obj.genes.gene_panel,
    columns=['MFOL1', 'MFOL2', 'MOL1', 'MOL2', 'MOL3', 'Zero'],
)
oligo_contr_df

Unnamed: 0,MFOL1,MFOL2,MOL1,MOL2,MOL3,Zero
Abi3bp,-0.097787,-0.097659,-0.097646,-0.097637,-0.097580,-0.09758
Acly,-0.264408,-0.195320,-0.168478,-0.239908,-0.266691,-0.09758
Adcyap1,-0.097891,-0.097896,-0.097646,-0.097675,-0.097786,-0.09758
Adora2a,-0.097580,-0.098132,-0.097908,-0.097847,-0.097786,-0.09758
Afp,-0.097632,-0.097580,-0.097580,-0.097656,-0.097580,-0.09758
...,...,...,...,...,...,...
Vtn,-0.098821,-0.099157,-0.098651,-0.098531,-0.099125,-0.09758
Wfs1,-0.100887,-0.102936,-0.100420,-0.102749,-0.104984,-0.09758
Yjefn3,-0.099389,-0.103644,-0.102863,-0.102464,-0.111849,-0.09758
Zcchc12,-0.098149,-0.099709,-0.098607,-0.098760,-0.097786,-0.09758


In [17]:
# Get the total loglikelihood of each class (the sum of its per-gene contributions)
oligo_contr_df.sum(axis=0)

Unnamed: 0,0
MFOL1,-68.75759
MFOL2,-79.631325
MOL1,-74.079067
MOL2,-70.584198
MOL3,-80.692939
Zero,-61.085447


It seems that the Zero class has the highest loglikelihood.

In [18]:
# Let's now compare the Zero class to MFOL1 (the second best): keep just these two columns
my_df = oligo_contr_df[['MFOL1', 'Zero']]

In [19]:
# Per-gene difference between the two contributions:
# positive values favour MFOL1, negative values favour Zero
diff = oligo_contr_df['MFOL1'].sub(oligo_contr_df['Zero'])
my_df = my_df.assign(diff=diff)

In [20]:
# Top 5 genes that are contributing most for the MFOL1 cell type (largest positive diff)
my_df.sort_values(by='diff', ascending=False).head(5)

Unnamed: 0,MFOL1,Zero,diff
Plp1,-7.770282,-15.320193,7.54991
Qk,-2.252219,-3.142103,0.889883
Maob,-3.115147,-3.142103,0.026956
Hsd11b1,-6.171955,-6.186625,0.01467
Rims4,-3.135156,-3.142103,0.006947


In [21]:
# Top 5 genes that are contributing most for the Zero cell type: the last 5 rows of the
# descending sort, i.e. the most negative diff (the strongest one is at the bottom)
my_df.sort_values(by='diff', ascending=False).tail(5)

Unnamed: 0,MFOL1,Zero,diff
Mog,-1.325219,-0.09758,-1.227639
Mag,-1.365887,-0.09758,-1.268306
Cryab,-1.367586,-0.09758,-1.270006
Tspan2,-1.75426,-0.09758,-1.65668
Mobp,-2.557112,-0.09758,-2.459531


##### Loglikelihood sanity checking
The loglikelihood defined above doesn't include the combinatorial factor. In case that raises the question of whether it
should be included or not, I am redoing the steps above using the full log-pmf from scipy (`scipy.stats.nbinom.logpmf`).

In [22]:
from scipy.stats import nbinom

In [23]:
# scipy's nbinom takes the success probability as its third argument and models the number of failures.
# Passing 1 - p therefore makes its log-pmf equal to x*log(p) + r*log(1 - p) plus the combinatorial term.
oligo_contr_2 = nbinom.logpmf(cgc.values[:, None], rSpot, 1-p.values)

In [24]:
# Same housekeeping as before: genes as row labels, classes as column labels
oligo_contr_df_2 = pd.DataFrame(
    oligo_contr_2,
    index=obj.genes.gene_panel,
    columns=['MFOL1', 'MFOL2', 'MOL1', 'MOL2', 'MOL3', 'Zero'],
)
oligo_contr_df_2

Unnamed: 0,MFOL1,MFOL2,MOL1,MOL2,MOL3,Zero
Abi3bp,-0.097787,-0.097659,-0.097646,-0.097637,-0.097580,-0.09758
Acly,-0.264408,-0.195320,-0.168478,-0.239908,-0.266691,-0.09758
Adcyap1,-0.097891,-0.097896,-0.097646,-0.097675,-0.097786,-0.09758
Adora2a,-0.097580,-0.098132,-0.097908,-0.097847,-0.097786,-0.09758
Afp,-0.097632,-0.097580,-0.097580,-0.097656,-0.097580,-0.09758
...,...,...,...,...,...,...
Vtn,-0.098821,-0.099157,-0.098651,-0.098531,-0.099125,-0.09758
Wfs1,-0.100887,-0.102936,-0.100420,-0.102749,-0.104984,-0.09758
Yjefn3,-0.099389,-0.103644,-0.102863,-0.102464,-0.111849,-0.09758
Zcchc12,-0.098149,-0.099709,-0.098607,-0.098760,-0.097786,-0.09758


In [25]:
# Keep the MFOL1 and Zero columns of the scipy-based contributions
my_df_2 = oligo_contr_df_2[['MFOL1', 'Zero']]

In [26]:
# Per-gene difference between the two scipy-based contributions:
# positive values favour MFOL1, negative values favour Zero
diff_2 = oligo_contr_df_2['MFOL1'].sub(oligo_contr_df_2['Zero'])
my_df_2 = my_df_2.assign(diff=diff_2)

In [27]:
# Top 5 genes that are contributing most for the MFOL1 cell type (largest positive diff)
my_df_2.sort_values(by='diff', ascending=False).head(5)

Unnamed: 0,MFOL1,Zero,diff
Plp1,-5.978523,-13.528433,7.54991
Qk,-1.559072,-2.448956,0.889883
Maob,-2.422,-2.448956,0.026956
Hsd11b1,-5.073343,-5.088013,0.01467
Rims4,-2.442008,-2.448956,0.006947


In [28]:
# Top 5 genes that are contributing most for the Zero cell type: the last 5 rows of the
# descending sort, i.e. the most negative diff (the strongest one is at the bottom)
my_df_2.sort_values(by='diff', ascending=False).tail(5)

Unnamed: 0,MFOL1,Zero,diff
Mog,-1.325219,-0.09758,-1.227639
Mag,-1.365887,-0.09758,-1.268306
Cryab,-1.367586,-0.09758,-1.270006
Tspan2,-1.75426,-0.09758,-1.65668
Mobp,-2.557112,-0.09758,-2.459531


Including the combinatorial factor in the loglikelihood changes the individual contributions.
However, when we compare two cell types, the difference per gene is the same (up to floating-point rounding), see below:

In [29]:
# Gap between the per-gene differences computed with and without the combinatorial factor,
# listed from the largest to the smallest
d = my_df_2['diff'].sub(my_df['diff'])
d.sort_values(ascending=False)

Unnamed: 0,diff
Zic1,0.000000e+00
Abi3bp,0.000000e+00
Acly,0.000000e+00
Adcyap1,0.000000e+00
Adora2a,0.000000e+00
...,...
Rims4,-8.881784e-16
Maob,-8.881784e-16
Qk,-1.332268e-15
Hsd11b1,-2.664535e-15


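##### Conclusion: Five Plp1 reads were found, but that alone isn’t enough to label the cell as an oligodendrocyte: the gain from Plp1 (about +7.55 for MFOL1 over Zero) is outweighed by the penalties for the oligodendrocyte genes that have no reads in the cell (Mobp, Tspan2, Cryab, Mag, Mog, ...).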