In [1]:
%cd "../"
%pwd

import numpy as np
import matplotlib.pyplot as plt
import io 
import pandas as pd 
import pyreadr

from load_data import *
from pred_score import *

/Users/dormann/Documents/GitHub/src


# mESC cells (AE3, AE4)

In [2]:
# AE3: pyreadr exposes the single unnamed object of an .rds file under the key None
AE3_df = pyreadr.read_r('../data/family_datasets/data_norm/AE3_scran_norm.rds')[None]

# Families of interest and family info tables, converted to NumPy arrays right after loading
families_interest_AE3 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_AE3_nocellcyclesplit.RData')['fois_1']
)
families_info_AE3 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_AE3_nocellcyclesplit.RData')['family_info_1']
)

In [3]:
# Restrict the normalized data to the cells belonging to a family of interest.
# The helper's result is read by position: [0] is the filtered data, [1] the label array.
AE3_ano = select_family_interest_norm_data(families_info_AE3, families_interest_AE3, AE3_df)
# First column of the label array, cast to 32-bit integers
y_AE3 = AE3_ano[1][:, 0].astype(np.int32)
AE3_ano = AE3_ano[0]

In [4]:
# Barcodes of the annotated cells (columns of the family-of-interest subset)
cells_annotated = AE3_ano.columns
y_AE3 = pd.DataFrame(y_AE3, index=cells_annotated)

# Family label for every cell of the full dataset; 0 marks a cell without annotation
family = pd.DataFrame(np.zeros(AE3_df.shape[1]), index=AE3_df.columns)
# One vectorized write instead of a Python loop issuing a .loc assignment per cell
family.loc[cells_annotated, 0] = y_AE3[0].to_numpy()

# Transpose so that cells are rows and genes are columns
AE3 = AE3_df.T

### Prediction on all cells, evaluation on the labeled cells only

In [6]:
# Names of the pre-selected genes (presumably a single-column CSV, squeezed to one dimension)
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/AE3genes_bestMIM.csv'))

# Restrict the expression table to those genes
AE3 = AE3[gene_optimized]
print(AE3.shape)

# All-ones vector over the selected genes, handed to subsampling_genes
# NOTE(review): if subsampling_genes draws at random, no seed is set here - confirm
subset = np.ones(len(gene_optimized))
subsets = subsampling_genes(subset, 101, 0.25)

(1268, 620)


In [13]:
# Fit and predict on every cell; `family` holds 0 for the cells without annotation
model = FamiliesClusters(np.unique(family), compute_precision, True)
pred = pd.DataFrame(
    model.fit_predict(np.array(AE3), np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
pred = pred.loc[cells_annotated]
score = compute_precision(np.array(y_AE3), np.squeeze(np.array(pred)))
recovery = compute_recovery(np.array(pred))

print(score, recovery)

0.3651804670912951 0.984984984984985


In [15]:
# Ensemble of clusterings over the gene subsets, combined by voting (threshold 0.5)
model = EnsemblingHierarchical(
    np.unique(family),
    compute_precision,
    True,
    subsets=subsets,
    ensembling='voting',
    threshold_voting=0.5,
)
result = pd.DataFrame(
    model.fit_predict(X=np.array(AE3), y=np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
result = result.loc[cells_annotated]
score = compute_precision(np.array(y_AE3), np.squeeze(np.array(result)))
recovery = compute_recovery(np.array(result))

print(score, recovery)

0.3339920948616601 0.9039039039039038


In [17]:
# AE4: pyreadr exposes the single unnamed object of an .rds file under the key None
AE4_df = pyreadr.read_r('../data/family_datasets/data_norm/AE4_scran_norm.rds')[None]

# Families of interest and family info tables, converted to NumPy arrays right after loading
families_interest_AE4 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_AE4_nocellcyclesplit.RData')['fois_1']
)
families_info_AE4 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_AE4_nocellcyclesplit.RData')['family_info_1']
)

In [18]:
# Restrict the normalized data to the cells belonging to a family of interest.
# The helper's result is read by position: [0] is the filtered data, [1] the label array.
AE4_ano = select_family_interest_norm_data(families_info_AE4, families_interest_AE4, AE4_df)
# First column of the label array, cast to 32-bit integers
y_AE4 = AE4_ano[1][:, 0].astype(np.int32)
AE4_ano = AE4_ano[0]

In [19]:
# Barcodes of the annotated cells (columns of the family-of-interest subset)
cells_annotated = AE4_ano.columns
y_AE4 = pd.DataFrame(y_AE4, index=cells_annotated)

# Family label for every cell of the full dataset; 0 marks a cell without annotation
family = pd.DataFrame(np.zeros(AE4_df.shape[1]), index=AE4_df.columns)
# One vectorized write instead of a Python loop issuing a .loc assignment per cell
family.loc[cells_annotated, 0] = y_AE4[0].to_numpy()

# Transpose so that cells are rows and genes are columns
AE4 = AE4_df.T

### Prediction on all cells, evaluation on the labeled cells only

In [20]:
# Names of the pre-selected genes (presumably a single-column CSV, squeezed to one dimension)
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/AE4genes_bestANOVA.csv'))

# Restrict the expression table to those genes
AE4 = AE4[gene_optimized]
print(AE4.shape)

# All-ones vector over the selected genes, handed to subsampling_genes
# NOTE(review): if subsampling_genes draws at random, no seed is set here - confirm
subset = np.ones(len(gene_optimized))
subsets = subsampling_genes(subset, 101, 0.25)

(7094, 255)


In [21]:
# Fit and predict on every cell; `family` holds 0 for the cells without annotation
model = FamiliesClusters(np.unique(family), compute_precision, True)
pred = pd.DataFrame(
    model.fit_predict(np.array(AE4), np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
pred = pred.loc[cells_annotated]
score = compute_precision(np.array(y_AE4), np.squeeze(np.array(pred)))
recovery = compute_recovery(np.array(pred))

print(score, recovery)

0.12551561579257514 0.9850644942294636


In [22]:
# Ensemble of clusterings over the gene subsets, combined by voting (threshold 0.5)
model = EnsemblingHierarchical(
    np.unique(family),
    compute_precision,
    True,
    subsets=subsets,
    ensembling='voting',
    threshold_voting=0.5,
)
result = pd.DataFrame(
    model.fit_predict(X=np.array(AE4), y=np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
result = result.loc[cells_annotated]
score = compute_precision(np.array(y_AE4), np.squeeze(np.array(result)))
recovery = compute_recovery(np.array(result))

print(score, recovery)

0.5596707818930041 0.34419551934826886


# MEF cells (BIDDY D0, D0_2, D6, D6_2, D15, D15_2)
Let's first load all the data for the MEF cells: the BIDDY D0, D0_2, D6, D6_2, D15, and D15_2 datasets. For each dataset we import the normalized data, the families of interest, and the family info.

# D0

In [23]:
# BIDDY D0: pyreadr exposes the single unnamed object of an .rds file under the key None
D0_df = pyreadr.read_r('../data/family_datasets/data_norm/BIDDY_D0_RPM_norm.rds')[None]

# Families of interest and family info tables, converted to NumPy arrays right after loading
families_interest_D0 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_BIDDY_D0.RData')['fois_1']
)
families_info_D0 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_BIDDY_D0.RData')['family_info_1']
)

In [24]:
# Restrict the normalized data to the cells belonging to a family of interest.
# The helper's result is read by position: [0] is the filtered data, [1] the label array.
D0_ano = select_family_interest_norm_data(families_info_D0, families_interest_D0, D0_df)
# First column of the label array, cast to 32-bit integers
y_D0 = D0_ano[1][:, 0].astype(np.int32)
D0_ano = D0_ano[0]

In [25]:
# Barcodes of the annotated cells (columns of the family-of-interest subset)
cells_annotated = D0_ano.columns
y_D0 = pd.DataFrame(y_D0, index=cells_annotated)

# Family label for every cell of the full dataset; 0 marks a cell without annotation
family = pd.DataFrame(np.zeros(D0_df.shape[1]), index=D0_df.columns)
# One vectorized write instead of a Python loop issuing a .loc assignment per cell
family.loc[cells_annotated, 0] = y_D0[0].to_numpy()

# Transpose so that cells are rows and genes are columns
D0 = D0_df.T

### Prediction on all cells, evaluation on the labeled cells only

In [26]:
# Names of the pre-selected genes (presumably a single-column CSV, squeezed to one dimension)
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/D0genes_bestMIM.csv'))

# Restrict the expression table to those genes
D0 = D0[gene_optimized]
print(D0.shape)

# All-ones vector over the selected genes, handed to subsampling_genes
# NOTE(review): if subsampling_genes draws at random, no seed is set here - confirm
subset = np.ones(len(gene_optimized))
subsets = subsampling_genes(subset, 101, 0.25)

(2887, 2480)


In [27]:
# Fit and predict on every cell; `family` holds 0 for the cells without annotation
model = FamiliesClusters(np.unique(family), compute_precision, True)
pred = pd.DataFrame(
    model.fit_predict(np.array(D0), np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
pred = pred.loc[cells_annotated]
score = compute_precision(np.array(y_D0), np.squeeze(np.array(pred)))
recovery = compute_recovery(np.array(pred))

print(score, recovery)

0.4032258064516129 1.0


In [28]:
# Ensemble of clusterings over the gene subsets, combined by voting (threshold 0.5)
model = EnsemblingHierarchical(
    np.unique(family),
    compute_precision,
    True,
    subsets=subsets,
    ensembling='voting',
    threshold_voting=0.5,
)
result = pd.DataFrame(
    model.fit_predict(X=np.array(D0), y=np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
result = result.loc[cells_annotated]
score = compute_precision(np.array(y_D0), np.squeeze(np.array(result)))
recovery = compute_recovery(np.array(result))

print(score, recovery)

0.037604456824512536 0.9910714285714286


In [29]:
# BIDDY D0_2 (reuses the D0_* variable names, overwriting the D0 objects loaded earlier).
# pyreadr exposes the single unnamed object of an .rds file under the key None.
D0_df = pyreadr.read_r('../data/family_datasets/data_norm/BIDDY_D0_2_RPM_norm.rds')[None]

# Families of interest and family info tables, converted to NumPy arrays right after loading
families_interest_D0 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_BIDDY_D0_2.RData')['fois_1']
)
families_info_D0 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_BIDDY_D0_2.RData')['family_info_1']
)

In [30]:
# Restrict the normalized data to the cells belonging to a family of interest.
# The helper's result is read by position: [0] is the filtered data, [1] the label array.
D0_ano = select_family_interest_norm_data(families_info_D0, families_interest_D0, D0_df)
# First column of the label array, cast to 32-bit integers
y_D0 = D0_ano[1][:, 0].astype(np.int32)
D0_ano = D0_ano[0]

In [31]:
# Barcodes of the annotated cells (columns of the family-of-interest subset)
cells_annotated = D0_ano.columns
y_D0 = pd.DataFrame(y_D0, index=cells_annotated)

# Family label for every cell of the full dataset; 0 marks a cell without annotation
family = pd.DataFrame(np.zeros(D0_df.shape[1]), index=D0_df.columns)
# One vectorized write instead of a Python loop issuing a .loc assignment per cell
family.loc[cells_annotated, 0] = y_D0[0].to_numpy()

# Transpose so that cells are rows and genes are columns
D0 = D0_df.T

### Prediction on all cells, evaluation on the labeled cells only

In [32]:
# Names of the pre-selected genes (presumably a single-column CSV, squeezed to one dimension).
# The D0 gene list is applied to the D0_2 data here.
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/D0genes_bestMIM.csv'))

# Restrict the expression table to those genes
D0 = D0[gene_optimized]
print(D0.shape)

# All-ones vector over the selected genes, handed to subsampling_genes
# NOTE(review): if subsampling_genes draws at random, no seed is set here - confirm
subset = np.ones(len(gene_optimized))
subsets = subsampling_genes(subset, 101, 0.25)

(11457, 2480)


In [33]:
# Fit and predict on every cell; `family` holds 0 for the cells without annotation
model = FamiliesClusters(np.unique(family), compute_precision, True)
pred = pd.DataFrame(
    model.fit_predict(np.array(D0), np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
pred = pred.loc[cells_annotated]
score = compute_precision(np.array(y_D0), np.squeeze(np.array(pred)))
recovery = compute_recovery(np.array(pred))

print(score, recovery)

0.2857142857142857 1.0


In [None]:
# Ensemble of clusterings over the gene subsets, combined by voting (threshold 0.5)
model = EnsemblingHierarchical(
    np.unique(family),
    compute_precision,
    True,
    subsets=subsets,
    ensembling='voting',
    threshold_voting=0.5,
)
result = pd.DataFrame(
    model.fit_predict(X=np.array(D0), y=np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
result = result.loc[cells_annotated]
score = compute_precision(np.array(y_D0), np.squeeze(np.array(result)))
recovery = compute_recovery(np.array(result))

print(score, recovery)

# D6

In [None]:
# BIDDY D6: pyreadr exposes the single unnamed object of an .rds file under the key None
D6_df = pyreadr.read_r('../data/family_datasets/data_norm/BIDDY_D6_RPM_norm.rds')[None]

# Families of interest and family info tables (V2 files), converted to NumPy arrays
families_interest_D6 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_BIDDY_D6_V2.RData')['fois_1']
)
families_info_D6 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_BIDDY_D6_V2.RData')['family_info_1']
)

In [None]:
# Restrict the normalized data to the cells belonging to a family of interest.
# The helper's result is read by position: [0] is the filtered data, [1] the label array.
D6_ano = select_family_interest_norm_data(families_info_D6, families_interest_D6, D6_df)
# First column of the label array, cast to 32-bit integers
y_D6 = D6_ano[1][:, 0].astype(np.int32)
D6_ano = D6_ano[0]

In [None]:
# Barcodes of the annotated cells (columns of the family-of-interest subset)
cells_annotated = D6_ano.columns
y_D6 = pd.DataFrame(y_D6, index=cells_annotated)

# Family label for every cell of the full dataset; 0 marks a cell without annotation
family = pd.DataFrame(np.zeros(D6_df.shape[1]), index=D6_df.columns)
# One vectorized write instead of a Python loop issuing a .loc assignment per cell
family.loc[cells_annotated, 0] = y_D6[0].to_numpy()

# Transpose so that cells are rows and genes are columns
D6 = D6_df.T

### Prediction on all cells, evaluation on the labeled cells only

In [None]:
# Names of the pre-selected genes (presumably a single-column CSV, squeezed to one dimension)
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/D6genes_bestANOVA.csv'))

# Restrict the expression table to those genes
D6 = D6[gene_optimized]
print(D6.shape)

# All-ones vector over the selected genes, handed to subsampling_genes
# NOTE(review): if subsampling_genes draws at random, no seed is set here - confirm
subset = np.ones(len(gene_optimized))
subsets = subsampling_genes(subset, 101, 0.25)

In [None]:
# Fit and predict on every cell; `family` holds 0 for the cells without annotation
model = FamiliesClusters(np.unique(family), compute_precision, True)
pred = pd.DataFrame(
    model.fit_predict(np.array(D6), np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
pred = pred.loc[cells_annotated]
score = compute_precision(np.array(y_D6), np.squeeze(np.array(pred)))
recovery = compute_recovery(np.array(pred))

print(score, recovery)

In [None]:
# Ensemble of clusterings over the gene subsets, combined by voting (threshold 0.5)
model = EnsemblingHierarchical(
    np.unique(family),
    compute_precision,
    True,
    subsets=subsets,
    ensembling='voting',
    threshold_voting=0.5,
)
result = pd.DataFrame(
    model.fit_predict(X=np.array(D6), y=np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
result = result.loc[cells_annotated]
score = compute_precision(np.array(y_D6), np.squeeze(np.array(result)))
recovery = compute_recovery(np.array(result))

print(score, recovery)

In [None]:
# BIDDY D6_2 (reuses the D6_* variable names, overwriting the D6 objects loaded earlier).
# pyreadr exposes the single unnamed object of an .rds file under the key None.
D6_df = pyreadr.read_r('../data/family_datasets/data_norm/BIDDY_D6_2_RPM_norm.rds')[None]

# Families of interest and family info tables, converted to NumPy arrays right after loading
families_interest_D6 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_BIDDY_D6_2.RData')['fois_1']
)
families_info_D6 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_BIDDY_D6_2.RData')['family_info_1']
)

In [None]:
# Restrict the normalized data to the cells belonging to a family of interest.
# The helper's result is read by position: [0] is the filtered data, [1] the label array.
D6_ano = select_family_interest_norm_data(families_info_D6, families_interest_D6, D6_df)
# First column of the label array, cast to 32-bit integers
y_D6 = D6_ano[1][:, 0].astype(np.int32)
D6_ano = D6_ano[0]

In [None]:
# Barcodes of the annotated cells (columns of the family-of-interest subset)
cells_annotated = D6_ano.columns
y_D6 = pd.DataFrame(y_D6, index=cells_annotated)

# Family label for every cell of the full dataset; 0 marks a cell without annotation
family = pd.DataFrame(np.zeros(D6_df.shape[1]), index=D6_df.columns)
# One vectorized write instead of a Python loop issuing a .loc assignment per cell
family.loc[cells_annotated, 0] = y_D6[0].to_numpy()

# Transpose so that cells are rows and genes are columns
D6 = D6_df.T

### Prediction on all cells, evaluation on the labeled cells only

In [None]:
# Names of the pre-selected genes (presumably a single-column CSV, squeezed to one dimension).
# The D6 gene list is applied to the D6_2 data here.
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/D6genes_bestANOVA.csv'))

# Restrict the expression table to those genes
D6 = D6[gene_optimized]
print(D6.shape)

# All-ones vector over the selected genes, handed to subsampling_genes
# NOTE(review): if subsampling_genes draws at random, no seed is set here - confirm
subset = np.ones(len(gene_optimized))
subsets = subsampling_genes(subset, 101, 0.25)

In [None]:
# Fit and predict on every cell; `family` holds 0 for the cells without annotation
model = FamiliesClusters(np.unique(family), compute_precision, True)
pred = pd.DataFrame(
    model.fit_predict(np.array(D6), np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
pred = pred.loc[cells_annotated]
score = compute_precision(np.array(y_D6), np.squeeze(np.array(pred)))
recovery = compute_recovery(np.array(pred))

print(score, recovery)

In [None]:
# Ensemble of clusterings over the gene subsets, combined by voting (threshold 0.5)
model = EnsemblingHierarchical(
    np.unique(family),
    compute_precision,
    True,
    subsets=subsets,
    ensembling='voting',
    threshold_voting=0.5,
)
result = pd.DataFrame(
    model.fit_predict(X=np.array(D6), y=np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
result = result.loc[cells_annotated]
score = compute_precision(np.array(y_D6), np.squeeze(np.array(result)))
recovery = compute_recovery(np.array(result))

print(score, recovery)

# D15

In [None]:
# BIDDY D15: pyreadr exposes the single unnamed object of an .rds file under the key None
D15_df = pyreadr.read_r('../data/family_datasets/data_norm/BIDDY_D15_RPM_norm.rds')[None]

# Families of interest and family info tables (V3 files), converted to NumPy arrays
families_interest_D15 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_BIDDY_D15_V3.RData')['fois_1']
)
families_info_D15 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_BIDDY_D15_V3.RData')['family_info_1']
)

In [None]:
# Restrict the normalized data to the cells belonging to a family of interest.
# The helper's result is read by position: [0] is the filtered data, [1] the label array.
D15_ano = select_family_interest_norm_data(families_info_D15, families_interest_D15, D15_df)
# First column of the label array, cast to 32-bit integers
y_D15 = D15_ano[1][:, 0].astype(np.int32)
D15_ano = D15_ano[0]

In [None]:
# Barcodes of the annotated cells (columns of the family-of-interest subset)
cells_annotated = D15_ano.columns
y_D15 = pd.DataFrame(y_D15, index=cells_annotated)

# Family label for every cell of the full dataset; 0 marks a cell without annotation
family = pd.DataFrame(np.zeros(D15_df.shape[1]), index=D15_df.columns)
# One vectorized write instead of a Python loop issuing a .loc assignment per cell
family.loc[cells_annotated, 0] = y_D15[0].to_numpy()

# Transpose so that cells are rows and genes are columns
D15 = D15_df.T

### Prediction on all cells, evaluation on the labeled cells only

In [None]:
# Names of the pre-selected genes (presumably a single-column CSV, squeezed to one dimension)
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/D15genes_bestANOVA.csv'))

# Restrict the expression table to those genes
D15 = D15[gene_optimized]
print(D15.shape)

# All-ones vector over the selected genes, handed to subsampling_genes
# NOTE(review): if subsampling_genes draws at random, no seed is set here - confirm
subset = np.ones(len(gene_optimized))
subsets = subsampling_genes(subset, 101, 0.25)

In [None]:
# Fit and predict on every cell; `family` holds 0 for the cells without annotation
model = FamiliesClusters(np.unique(family), compute_precision, True)
pred = pd.DataFrame(
    model.fit_predict(np.array(D15), np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
pred = pred.loc[cells_annotated]
score = compute_precision(np.array(y_D15), np.squeeze(np.array(pred)))
recovery = compute_recovery(np.array(pred))

print(score, recovery)

In [None]:
# Ensemble of clusterings over the gene subsets, combined by voting (threshold 0.5)
model = EnsemblingHierarchical(
    np.unique(family),
    compute_precision,
    True,
    subsets=subsets,
    ensembling='voting',
    threshold_voting=0.5,
)
result = pd.DataFrame(
    model.fit_predict(X=np.array(D15), y=np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
result = result.loc[cells_annotated]
score = compute_precision(np.array(y_D15), np.squeeze(np.array(result)))
recovery = compute_recovery(np.array(result))

print(score, recovery)

In [None]:
# BIDDY D15_2 (reuses the D15_* variable names, overwriting the D15 objects loaded earlier).
# pyreadr exposes the single unnamed object of an .rds file under the key None.
D15_df = pyreadr.read_r('../data/family_datasets/data_norm/BIDDY_D15_2_RPM_norm.rds')[None]

# Families of interest and family info tables, converted to NumPy arrays right after loading
families_interest_D15 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_BIDDY_D15_2.RData')['fois_1']
)
families_info_D15 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_BIDDY_D15_2.RData')['family_info_1']
)

In [None]:
# Restrict the normalized data to the cells belonging to a family of interest.
# The helper's result is read by position: [0] is the filtered data, [1] the label array.
D15_ano = select_family_interest_norm_data(families_info_D15, families_interest_D15, D15_df)
# First column of the label array, cast to 32-bit integers
y_D15 = D15_ano[1][:, 0].astype(np.int32)
D15_ano = D15_ano[0]

In [None]:
# Barcodes of the annotated cells (columns of the family-of-interest subset)
cells_annotated = D15_ano.columns
y_D15 = pd.DataFrame(y_D15, index=cells_annotated)

# Family label for every cell of the full dataset; 0 marks a cell without annotation
family = pd.DataFrame(np.zeros(D15_df.shape[1]), index=D15_df.columns)
# One vectorized write instead of a Python loop issuing a .loc assignment per cell
family.loc[cells_annotated, 0] = y_D15[0].to_numpy()

# Transpose so that cells are rows and genes are columns
D15 = D15_df.T

### Prediction on all cells, evaluation on the labeled cells only

In [None]:
# Names of the pre-selected genes (presumably a single-column CSV, squeezed to one dimension).
# The D15 gene list is applied to the D15_2 data here.
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/D15genes_bestANOVA.csv'))

# Restrict the expression table to those genes
D15 = D15[gene_optimized]
print(D15.shape)

# All-ones vector over the selected genes, handed to subsampling_genes
# NOTE(review): if subsampling_genes draws at random, no seed is set here - confirm
subset = np.ones(len(gene_optimized))
subsets = subsampling_genes(subset, 101, 0.25)

In [None]:
# Fit and predict on every cell; `family` holds 0 for the cells without annotation
model = FamiliesClusters(np.unique(family), compute_precision, True)
pred = pd.DataFrame(
    model.fit_predict(np.array(D15), np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
pred = pred.loc[cells_annotated]
score = compute_precision(np.array(y_D15), np.squeeze(np.array(pred)))
recovery = compute_recovery(np.array(pred))

print(score, recovery)

In [None]:
# Ensemble of clusterings over the gene subsets, combined by voting (threshold 0.5)
model = EnsemblingHierarchical(
    np.unique(family),
    compute_precision,
    True,
    subsets=subsets,
    ensembling='voting',
    threshold_voting=0.5,
)
result = pd.DataFrame(
    model.fit_predict(X=np.array(D15), y=np.array(family)),
    index=family.index,
)

# Scores are computed on the annotated cells only
result = result.loc[cells_annotated]
score = compute_precision(np.array(y_D15), np.squeeze(np.array(result)))
recovery = compute_recovery(np.array(result))

print(score, recovery)

# LK and LSK cells: Weinreb data

# LK cells:

In [2]:
# LK, experiment 1: the 'Unnamed: 0' column becomes the row index and its axis name is cleared
LK_scran_df = (
    pd.read_csv('../data/family_datasets/data_norm/Weinreb_LK_D2_exp1_norm_lifted.csv')
    .set_index('Unnamed: 0')
    .rename_axis(None)
)

# Families of interest and clone info tables, converted to NumPy arrays right after loading
data_families_interest_LK = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LK_D2_exp1.RData')['fois_1']
)
data_families_info_LK = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LK_D2_exp1.RData')['WORK_clones']
)

# NumPy copy of the full expression table
LK_scran = np.array(LK_scran_df)

# Cast the first column of the info array to int
data_families_info_LK[:, 0] = data_families_info_LK[:, 0].astype(int)

In [3]:
# Restrict the normalized data to the cells belonging to a family of interest.
# NOTE: LK_scran_df is rebound to the filtered frame, so this cell is not idempotent on re-run.
LK_scran_df = select_family_interest_norm_data(data_families_info_LK, data_families_interest_LK, LK_scran_df)
# Element [1] is the label array: keep its first column as 32-bit integers
y_LK = LK_scran_df[1][:, 0].astype(np.int32)
# Element [0] is the filtered expression frame
LK_scran_df = LK_scran_df[0]

LK = np.array(LK_scran_df)
print(LK.shape)

(25289, 531)


In [4]:
# LK, experiment 2: the 'Unnamed: 0' column becomes the row index and its axis name is cleared.
# No all-zero gene filtering at this stage; it is applied once the two LK experiments are fused.
LK2_scran_df = (
    pd.read_csv('../data/family_datasets/data_norm/Weinreb_LK_D2_exp2_norm_lifted.csv')
    .set_index('Unnamed: 0')
    .rename_axis(None)
)

# Families of interest and clone info tables, converted to NumPy arrays right after loading
data_families_interest_LK2 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LK_D2_exp2.RData')['fois_1']
)
data_families_info_LK2 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LK_D2_exp2.RData')['WORK_clones']
)

# NumPy copy of the full expression table
LK2_scran = np.array(LK2_scran_df)

# Cast the first column of the info array to int
data_families_info_LK2[:, 0] = data_families_info_LK2[:, 0].astype(int)

In [5]:
# Restrict the normalized data to the cells belonging to a family of interest.
# NOTE: LK2_scran_df is rebound to the filtered frame, so this cell is not idempotent on re-run.
LK2_scran_df = select_family_interest_norm_data(data_families_info_LK2, data_families_interest_LK2, LK2_scran_df)
# Element [1] is the label array: keep its first column as 32-bit integers
y_LK2 = LK2_scran_df[1][:, 0].astype(np.int32)
# Element [0] is the filtered expression frame
LK2_scran_df = LK2_scran_df[0]

LK2 = np.array(LK2_scran_df)
print(LK2.shape)

(25289, 79)


In [6]:
# Fuse the two LK experiments by placing their cells (columns) side by side
LK_df = pd.concat([LK_scran_df, LK2_scran_df], axis=1)

# Keep only the genes (rows) that have at least one non-zero value
nan_genes = LK_df.any(axis=1)
LK_df = LK_df.loc[nan_genes]

# Offset the second experiment's family indices by the largest index of the first,
# so that the two sets of indices cannot overlap.
# NOTE: in-place update - re-running this cell shifts y_LK2 again.
y_LK2 += y_LK.max()
y_LK_fuse = np.concatenate([y_LK, y_LK2])

In [7]:
# Filter the genes by how widely they are expressed across the LK cells (threshold passed: 0.05).
# NOTE(review): the previous comment mentioned a 50% default - confirm how filter_norm_data
# interprets this value.
gene_expressed = filter_norm_data(LK_df, 0.05)
LK_df = LK_df[gene_expressed]

# Save the names of the retained genes (the row labels)
genes_interest = pd.DataFrame(LK_df.index.values)
genes_interest.to_csv('../data/processed_data/LKgenes_interest.csv', index=False)

# Expression values as a plain array
LK = np.array(LK_df)

# Processed table: one row per cell, with the family label appended as the last column
LKcsv = pd.DataFrame(np.column_stack((LK.T, y_LK_fuse)))
LKcsv.to_csv('../data/processed_data/LK.csv', index=False)

# LSK cells:

In [3]:
# LSK, experiment 1: the 'Unnamed: 0' column becomes the row index and its axis name is cleared
LSK_scran_df = (
    pd.read_csv('../data/family_datasets/data_norm/Weinreb_LSK_D2_exp1_norm_lifted.csv')
    .set_index('Unnamed: 0')
    .rename_axis(None)
)

# Families of interest and clone info tables, converted to NumPy arrays right after loading
data_families_interest_LSK = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LSK_D2_exp1.RData')['fois_1']
)
data_families_info_LSK = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LSK_D2_exp1.RData')['WORK_clones']
)

# NumPy copy of the full expression table
LSK_scran = np.array(LSK_scran_df)

# Cast the first column of the info array to int
data_families_info_LSK[:, 0] = data_families_info_LSK[:, 0].astype(int)

In [4]:
# Restrict the normalized data to the cells belonging to a family of interest.
# NOTE: LSK_scran_df is rebound to the filtered frame, so this cell is not idempotent on re-run.
LSK_scran_df = select_family_interest_norm_data(data_families_info_LSK, data_families_interest_LSK, LSK_scran_df)
# Element [1] is the label array: keep its first column as 32-bit integers
y_LSK = LSK_scran_df[1][:, 0].astype(np.int32)
# Element [0] is the filtered expression frame
LSK_scran_df = LSK_scran_df[0]

LSK = np.array(LSK_scran_df)
print(LSK.shape)

(25289, 781)


In [None]:
# LSK, experiment 2: the 'Unnamed: 0' column becomes the row index and its axis name is cleared
LSK2_scran_df = (
    pd.read_csv('../data/family_datasets/data_norm/Weinreb_LSK_D2_exp2_norm_lifted.csv')
    .set_index('Unnamed: 0')
    .rename_axis(None)
)

# Families of interest and clone info tables, converted to NumPy arrays right after loading
data_families_interest_LSK2 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LSK_D2_exp2.RData')['fois_1']
)
data_families_info_LSK2 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LSK_D2_exp2.RData')['WORK_clones']
)

# NumPy copy of the full expression table
LSK2_scran = np.array(LSK2_scran_df)

# Cast the first column of the info array to int
data_families_info_LSK2[:, 0] = data_families_info_LSK2[:, 0].astype(int)

In [None]:
# Restrict the normalized data to the cells belonging to a family of interest.
# NOTE: LSK2_scran_df is rebound to the filtered frame, so this cell is not idempotent on re-run.
LSK2_scran_df = select_family_interest_norm_data(data_families_info_LSK2, data_families_interest_LSK2, LSK2_scran_df)
# Element [1] is the label array: keep its first column as 32-bit integers
y_LSK2 = LSK2_scran_df[1][:, 0].astype(np.int32)
# Element [0] is the filtered expression frame
LSK2_scran_df = LSK2_scran_df[0]

LSK2 = np.array(LSK2_scran_df)
print(LSK2.shape)

In [None]:
# Fuse the two LSK experiments by placing their cells (columns) side by side
LSK_df = pd.concat([LSK_scran_df, LSK2_scran_df], axis=1)

# Keep only the genes (rows) that have at least one non-zero value.
# The filter must run on the fused frame LSK_df, which the following cells consume;
# applying it to LSK_scran_df (as before) left LSK_df unfiltered.
nan_genes = LSK_df.any(axis=1)
LSK_df = LSK_df[nan_genes]

# Offset the second experiment's family indices by the largest index of the first,
# so that the two sets of indices cannot overlap.
# NOTE: in-place update - re-running this cell shifts y_LSK2 again.
y_LSK2 += max(y_LSK)
y_LSK_fuse = np.hstack((y_LSK, y_LSK2))

In [None]:
# Filter the genes by how widely they are expressed across the LSK cells (threshold passed: 0.05).
# NOTE(review): the previous comment mentioned a 50% default - confirm how filter_norm_data
# interprets this value.
gene_expressed = filter_norm_data(LSK_df, 0.05)
LSK_df = LSK_df[gene_expressed]

# Save the names of the retained genes (the row labels)
genes_interest = pd.DataFrame(LSK_df.index.values)
genes_interest.to_csv('../data/processed_data/LSKgenes_interest.csv', index=False)

# Expression values as a plain array
LSK = np.array(LSK_df)

# Processed table: one row per cell, with the family label appended as the last column
LSKcsv = pd.DataFrame(np.column_stack((LSK.T, y_LSK_fuse)))
LSKcsv.to_csv('../data/processed_data/LSK.csv', index=False)

In [None]:
# Sanity check: dimensions of the table written to LSK.csv
print(np.shape(LSKcsv))

# LK_LSKmix cells:

In [35]:
# LK/LSK mix (experiment 3): the 'Unnamed: 0' column becomes the row index, axis name cleared
LSKmix_scran_df = (
    pd.read_csv('../data/family_datasets/data_norm/Weinreb_LK_LSK_D2_exp3_norm_lifted.csv')
    .set_index('Unnamed: 0')
    .rename_axis(None)
)

# Keep only the genes (rows) that have at least one non-zero value
nan_genes = LSKmix_scran_df.any(axis=1)
LSKmix_scran_df = LSKmix_scran_df.loc[nan_genes]

# Families of interest and clone info tables, converted to NumPy arrays right after loading
data_families_interest_LSKmix = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LK_LSK_D2_exp3.RData')['fois_1']
)
data_families_info_LSKmix = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LK_LSK_D2_exp3.RData')['WORK_clones']
)

# NumPy copy of the filtered expression table
LSKmix_scran = np.array(LSKmix_scran_df)

# Cast the first column of the info array to int
data_families_info_LSKmix[:, 0] = data_families_info_LSKmix[:, 0].astype(int)

In [36]:
# Restrict the normalized data to the cells belonging to a family of interest.
# NOTE: LSKmix_scran_df is rebound to the filtered frame, so this cell is not idempotent on re-run.
LSKmix_scran_df = select_family_interest_norm_data(data_families_info_LSKmix, data_families_interest_LSKmix, LSKmix_scran_df)
# Element [1] is the label array: keep its first column as 32-bit integers
y_LSKmix = LSKmix_scran_df[1][:, 0].astype(np.int32)
# Element [0] is the filtered expression frame
LSKmix_scran_df = LSKmix_scran_df[0]

LSKmix = np.array(LSKmix_scran_df)
print(LSKmix.shape)

(23461, 1023)


In [37]:
# Filter the genes by how widely they are expressed across the LSKmix cells (threshold passed: 0.05).
# NOTE(review): the previous comment mentioned a 50% default - confirm how filter_norm_data
# interprets this value.
gene_expressed = filter_norm_data(LSKmix_scran_df, 0.05)
LSKmix_scran_df = LSKmix_scran_df[gene_expressed]

# Save the names of the retained genes (the row labels)
genes_interest = pd.DataFrame(LSKmix_scran_df.index.values)
genes_interest.to_csv('../data/processed_data/LSKmixgenes_interest.csv', index=False)

# Expression values as a plain array
LSKmix = np.array(LSKmix_scran_df)

# Processed table: one row per cell, with the family label appended as the last column
LSKmixcsv = pd.DataFrame(np.column_stack((LSKmix.T, y_LSKmix)))
LSKmixcsv.to_csv('../data/processed_data/LSKmix.csv', index=False)