In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyreadr

from hepstats.modeling import bayesian_blocks

In [2]:
#Load data
def _read_rds(path):
    """Read an .rds file with pyreadr and return the object stored under the key None."""
    return pyreadr.read_r(path)[None]


def _read_indexed_csv(path):
    """Read a CSV, use its 'Unnamed: 0' column as the index and clear the index name."""
    return pd.read_csv(path).set_index('Unnamed: 0').rename_axis(None)


# *_scran_norm.rds files
AE3 = _read_rds('../data/family_datasets/data_norm/AE3_scran_norm.rds')
AE4 = _read_rds('../data/family_datasets/data_norm/AE4_scran_norm.rds')

# BIDDY_*_RPM_norm.rds files (D0 / D6 / D15, each with a second "_2" file)
D0 = _read_rds('../data/family_datasets/data_norm/BIDDY_D0_RPM_norm.rds')
D0_2 = _read_rds('../data/family_datasets/data_norm/BIDDY_D0_2_RPM_norm.rds')
D6 = _read_rds('../data/family_datasets/data_norm/BIDDY_D6_RPM_norm.rds')
D6_2 = _read_rds('../data/family_datasets/data_norm/BIDDY_D6_2_RPM_norm.rds')
D15 = _read_rds('../data/family_datasets/data_norm/BIDDY_D15_RPM_norm.rds')
D15_2 = _read_rds('../data/family_datasets/data_norm/BIDDY_D15_2_RPM_norm.rds')

# Weinreb_*_norm_lifted.csv files; their first (unnamed) column becomes the index
LK = _read_indexed_csv('../data/family_datasets/data_norm/Weinreb_LK_D2_exp1_norm_lifted.csv')
LK_2 = _read_indexed_csv('../data/family_datasets/data_norm/Weinreb_LK_D2_exp2_norm_lifted.csv')
LSK = _read_indexed_csv('../data/family_datasets/data_norm/Weinreb_LSK_D2_exp1_norm_lifted.csv')
LSK_2 = _read_indexed_csv('../data/family_datasets/data_norm/Weinreb_LSK_D2_exp2_norm_lifted.csv')
LSKmix = _read_indexed_csv('../data/family_datasets/data_norm/Weinreb_LK_LSK_D2_exp3_norm_lifted.csv')

In [None]:
# Gene names shared by the indexes of AE3, LK, LSK and LSKmix
genes = (
    AE3.index
    .intersection(LK.index)
    .intersection(LSK.index)
    .intersection(LSKmix.index)
)

# Show the LK_2 rows for those shared genes (all columns)
LK_2.loc[genes, :]

In [6]:
#Binning of gene expression in each cell or gene using bayesian block or binary binning
def binning_norm_bycell(norm:pd.DataFrame, name_data:str, binning_method:str, bycell:bool,
                        output_dir:str='../data/count_histogram/'):
    """Histogram every column or every row of ``norm`` and write the bin counts to a CSV.

    Despite its name, the function handles both orientations: with ``bycell`` true it
    walks the columns of ``norm``, otherwise it walks the rows.

    Parameters
    ----------
    norm : pd.DataFrame
        Numeric expression matrix. (Presumably genes x cells, so that columns are
        cells -- confirm against the data.)
    name_data : str
        Dataset label, used as the prefix of the output file name.
    binning_method : str
        ``'bayesian'`` takes the bin edges from ``bayesian_blocks``; any other value
        selects the two-bin "binary" split ``[0, 1e-4, max]``.
    bycell : bool
        True -> one histogram per column, file suffix ``bycell``;
        False -> one histogram per row, file suffix ``bygene``.
    output_dir : str, optional
        Directory that receives the CSV. It must already exist. Defaults to the
        location the notebook has always written to.

    Returns
    -------
    None
        The result is the file ``<output_dir>/<name_data><binning_method>_<by>.csv``
        holding one row of counts per histogrammed vector. The file name has no
        separator between ``name_data`` and ``binning_method``; that is kept
        unchanged so previously written files keep their names. When vectors end up
        with different numbers of bins, the shorter rows have empty trailing cells.
    """
    import os  # local import: not part of the notebook's import cell

    threshold = 1e-4                 # upper edge of the "not expressed" bin
    single_bin = [0, threshold]      # edges used when nothing reaches the threshold

    n_vectors = norm.shape[1] if bycell else norm.shape[0]

    counts = []
    for i in range(n_vectors):
        data = norm.iloc[:, i] if bycell else norm.iloc[i, :]
        values = np.asarray(data)

        if np.count_nonzero(values) == 0:
            # All-zero (or empty) vector: decide this *before* calling
            # bayesian_blocks, whose result was previously computed and then
            # discarded (and which likely caused the RuntimeWarning on constant data).
            bins = single_bin
        elif binning_method == 'bayesian':
            bins = bayesian_blocks(data)
        else:
            top = values.max()
            # np.histogram requires monotonically increasing edges, so the third
            # edge is only added when the maximum actually reaches the threshold.
            bins = [0, threshold, top] if top >= threshold else single_bin

        count, _ = np.histogram(values, bins)
        counts.append(count)

    by = 'bycell' if bycell else 'bygene'
    file_name = name_data + binning_method + '_' + by + '.csv'
    pd.DataFrame(counts).to_csv(os.path.join(output_dir, file_name), index=False)

In [7]:
# Datasets to bin, paired position-by-position with the labels used in output file names
norms = [
    AE3, AE4,
    D0, D0_2, D6, D6_2, D15, D15_2,
    LK, LK_2, LSK, LSK_2, LSKmix,
]
norms_name = [
    'AE3', 'AE4',
    'D0', 'D0_2', 'D6', 'D6_2', 'D15', 'D15_2',
    'LK', 'LK_2', 'LSK', 'LSK_2', 'LSKmix',
]

In [17]:
# Datasets to bin, paired position-by-position with the labels used in output file names
norms = [
    AE3, AE4,
    D0, D0_2, D6, D6_2, D15, D15_2,
    LK, LK_2, LSK, LSK_2, LSKmix,
]
norms_name = [
    'AE3', 'AE4',
    'D0', 'D0_2', 'D6', 'D6_2', 'D15', 'D15_2',
    'LK', 'LK_2', 'LSK', 'LSK_2', 'LSKmix',
]

# Bayesian-block histograms, one per column of each dataset
for label, norm in zip(norms_name, norms):
    print(label, ' shape: ', norm.shape)
    binning_norm_bycell(norm, label, binning_method='bayesian', bycell=True)

AE3  shape:  (32285, 1268)
AE4  shape:  (32285, 7094)
D0  shape:  (32285, 2887)
D0_2  shape:  (32285, 11457)
D6  shape:  (32285, 4836)
D6_2  shape:  (32285, 2909)
D15  shape:  (32285, 4106)
D15_2  shape:  (32285, 5641)
LK  shape:  (25289, 7793)
LK_2  shape:  (25289, 222)
LSK  shape:  (25289, 20456)
LSK_2  shape:  (25289, 18823)
LSKmix  shape:  (25289, 15728)


In [9]:
# Binary (two-bin) histograms, one per column of each dataset
for label, norm in zip(norms_name, norms):
    print(label, ' shape: ', norm.shape)
    binning_norm_bycell(norm, label, binning_method='binary', bycell=True)

AE3  shape:  (32285, 1268)
AE4  shape:  (32285, 7094)
D0  shape:  (32285, 2887)
D0_2  shape:  (32285, 11457)
D6  shape:  (32285, 4836)
D6_2  shape:  (32285, 2909)
D15  shape:  (32285, 4106)
D15_2  shape:  (32285, 5641)
LK  shape:  (25289, 7793)
LK_2  shape:  (25289, 222)
LSK  shape:  (25289, 20456)
LSK_2  shape:  (25289, 18823)
LSKmix  shape:  (25289, 15728)


In [8]:
# Bayesian-block histograms, one per row of each dataset
for label, norm in zip(norms_name, norms):
    print(label, ' shape: ', norm.shape)
    binning_norm_bycell(norm, label, binning_method='bayesian', bycell=False)

AE3  shape:  (32285, 1268)


  fit_vec = N_k * (np.log(N_k / T_k))


AE4  shape:  (32285, 7094)
D0  shape:  (32285, 2887)
D0_2  shape:  (32285, 11457)
D6  shape:  (32285, 4836)
D6_2  shape:  (32285, 2909)
D15  shape:  (32285, 4106)
D15_2  shape:  (32285, 5641)
LK  shape:  (25289, 7793)
LK_2  shape:  (25289, 222)
LSK  shape:  (25289, 20456)
LSK_2  shape:  (25289, 18823)
LSKmix  shape:  (25289, 15728)


In [15]:
# Binary (two-bin) histograms, one per row of each dataset
for label, norm in zip(norms_name, norms):
    print(label, ' shape: ', norm.shape)
    binning_norm_bycell(norm, label, binning_method='binary', bycell=False)

AE3  shape:  (32285, 1268)
AE4  shape:  (32285, 7094)
D0  shape:  (32285, 2887)
D0_2  shape:  (32285, 11457)
D6  shape:  (32285, 4836)
D6_2  shape:  (32285, 2909)
D15  shape:  (32285, 4106)
D15_2  shape:  (32285, 5641)
LK  shape:  (25289, 7793)
LK_2  shape:  (25289, 222)
LSK  shape:  (25289, 20456)
LSK_2  shape:  (25289, 18823)
LSKmix  shape:  (25289, 15728)
