In [1]:
import numpy as np
import matplotlib.pyplot as plt

import io 
import pandas as pd 
import pyreadr

from load_data import *

# mESC cells (AE3, AE4)
We first load the data for the mESC cells (the AE3 and AE4 datasets). For each dataset we import the normalized expression data, the families of interest, and the family info.

In [65]:
#AE3
# Normalized expression data: the object stored in the .rds file is retrieved with the key None
AE3_scran_df = pyreadr.read_r('../data/family_datasets/data_norm/AE3_scran_norm.rds')[None]

# Families of interest (object 'fois_1' of the RData file), converted to an array
data_families_interest_AE3 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_AE3_nocellcyclesplit.RData')['fois_1']
)

# Family info (object 'family_info_1' of the RData file), converted to an array
data_families_info_AE3 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_AE3_nocellcyclesplit.RData')['family_info_1']
)

# Array version of the normalized data (the DataFrame itself is kept for the next steps)
AE3_scran = np.array(AE3_scran_df)

In [66]:
#AE4
# Normalized expression data: the object stored in the .rds file is retrieved with the key None
AE4_scran_df = pyreadr.read_r('../data/family_datasets/data_norm/AE4_scran_norm.rds')[None]

# Families of interest (object 'fois_1' of the RData file), converted to an array
data_families_interest_AE4 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_AE4_nocellcyclesplit.RData')['fois_1']
)

# Family info (object 'family_info_1' of the RData file), converted to an array
data_families_info_AE4 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_AE4_nocellcyclesplit.RData')['family_info_1']
)

# Array version of the normalized data (the DataFrame itself is kept for the next steps)
AE4_scran = np.array(AE4_scran_df)

All cells that don't belong to a family of interest are removed independently in each data set.

In [67]:
#Keep only the AE3 cells that belong to a family of interest
# The helper's result is indexed below: element 0 is the filtered data, element 1 the family labels
AE3_scran_df = select_family_interest_norm_data(data_families_info_AE3, data_families_interest_AE3, AE3_scran_df)

# Family index of each kept cell: first column of the labels, cast to 32-bit integers
y_AE3 = AE3_scran_df[1][:, 0].astype(np.int32)

# Filtered normalized data, plus its array version
AE3_scran_df = AE3_scran_df[0]
AE3 = np.array(AE3_scran_df)
print(AE3.shape)

(32285, 333)


In [68]:
#Keep only the AE4 cells that belong to a family of interest
# The helper's result is indexed below: element 0 is the filtered data, element 1 the family labels
AE4_scran_df = select_family_interest_norm_data(data_families_info_AE4, data_families_interest_AE4, AE4_scran_df)

# Family index of each kept cell: first column of the labels, cast to 32-bit integers
y_AE4 = AE4_scran_df[1][:, 0].astype(np.int32)

# Filtered normalized data, plus its array version
AE4_scran_df = AE4_scran_df[0]
AE4 = np.array(AE4_scran_df)
print(AE4.shape)

(32285, 1473)


The data sets are now fused into one.

In [69]:
#Fuse mESC normalized data (datasets are placed side by side, column-wise)
mESC_df = pd.concat([AE3_scran_df, AE4_scran_df], axis=1)

#Fuse families info
# The AE4 family indices are shifted by the largest AE3 index so that the two
# datasets do not share family indices. The shift is applied to a new array
# rather than in place (y_AE4 is left untouched), so re-running this cell
# always gives the same y_mESC.
# NOTE(review): this offset only avoids overlap if AE4 indices start at 1 or
# above; with 0-based indices the first AE4 family would collide with the last
# AE3 family -- confirm against select_family_interest_norm_data.
y_mESC = np.hstack((y_AE3, y_AE4 + y_AE3.max()))

In [72]:
#Keep only the genes expressed in at least a given percentage (default 50%) of the mESC cells
gene_expressed = filter_norm_data(mESC_df)
mESC_df = mESC_df[gene_expressed]

#Save the names of the retained genes (row labels of the filtered frame)
genes_interest = pd.DataFrame(mESC_df.index.values)
genes_interest.to_csv('../data/processed_data/mESCgenes_interest.csv', index=False)

#Array version of the filtered data
mESC = np.array(mESC_df)

#Write the preprocessed data: transposed expression matrix with the family labels as last column
mESCcsv = pd.DataFrame(np.c_[mESC.T, y_mESC])
mESCcsv.to_csv('../data/processed_data/mESC.csv', index=False)

# MEF cells (BIDDY D0, D0_2, D6, D6_2, D15, D15_2)
We first load the data for the MEF cells (the BIDDY D0, D0_2, D6, D6_2, D15, and D15_2 datasets; the variables below spell day 0 as `DO`). For each dataset we import the normalized expression data, the families of interest, and the family info.

In [76]:
#DO
# Normalized expression data: the object stored in the .rds file is retrieved with the key None
DO_scran_df = pyreadr.read_r('../data/family_datasets/data_norm/BIDDY_D0_RPM_norm.rds')[None]

# Families of interest (object 'fois_1' of the RData file), converted to an array
data_families_interest_DO = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_BIDDY_D0.RData')['fois_1']
)

# Family info (object 'family_info_1' of the RData file), converted to an array
data_families_info_DO = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_BIDDY_D0.RData')['family_info_1']
)

# Array version of the normalized data (the DataFrame itself is kept for the next steps)
DO_scran = np.array(DO_scran_df)

In [77]:
#DO_2
# Normalized expression data: the object stored in the .rds file is retrieved with the key None
DO2_scran_df = pyreadr.read_r('../data/family_datasets/data_norm/BIDDY_D0_2_RPM_norm.rds')[None]

# Families of interest (object 'fois_1' of the RData file), converted to an array
data_families_interest_DO2 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_BIDDY_D0_2.RData')['fois_1']
)

# Family info (object 'family_info_1' of the RData file), converted to an array
data_families_info_DO2 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_BIDDY_D0_2.RData')['family_info_1']
)

# Array version of the normalized data (the DataFrame itself is kept for the next steps)
DO2_scran = np.array(DO2_scran_df)

In [78]:
#D6
# Normalized expression data: the object stored in the .rds file is retrieved with the key None
D6_scran_df = pyreadr.read_r('../data/family_datasets/data_norm/BIDDY_D6_RPM_norm.rds')[None]

# Families of interest (object 'fois_1' of the RData file), converted to an array
data_families_interest_D6 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_BIDDY_D6_V2.RData')['fois_1']
)

# Family info (object 'family_info_1' of the RData file), converted to an array
data_families_info_D6 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_BIDDY_D6_V2.RData')['family_info_1']
)

# Array version of the normalized data (the DataFrame itself is kept for the next steps)
D6_scran = np.array(D6_scran_df)

In [79]:
#D6_2
# Normalized expression data: the object stored in the .rds file is retrieved with the key None
D62_scran_df = pyreadr.read_r('../data/family_datasets/data_norm/BIDDY_D6_2_RPM_norm.rds')[None]

# Families of interest (object 'fois_1' of the RData file), converted to an array
data_families_interest_D62 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_BIDDY_D6_2.RData')['fois_1']
)

# Family info (object 'family_info_1' of the RData file), converted to an array
data_families_info_D62 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_BIDDY_D6_2.RData')['family_info_1']
)

# Array version of the normalized data (the DataFrame itself is kept for the next steps)
D62_scran = np.array(D62_scran_df)

In [80]:
#D15
# Normalized expression data: the object stored in the .rds file is retrieved with the key None
D15_scran_df = pyreadr.read_r('../data/family_datasets/data_norm/BIDDY_D15_RPM_norm.rds')[None]

# Families of interest (object 'fois_1' of the RData file), converted to an array
data_families_interest_D15 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_BIDDY_D15_V3.RData')['fois_1']
)

# Family info (object 'family_info_1' of the RData file), converted to an array
data_families_info_D15 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_BIDDY_D15_V3.RData')['family_info_1']
)

# Array version of the normalized data (the DataFrame itself is kept for the next steps)
D15_scran = np.array(D15_scran_df)

In [81]:
#D15_2
# Normalized expression data: the object stored in the .rds file is retrieved with the key None
D152_scran_df = pyreadr.read_r('../data/family_datasets/data_norm/BIDDY_D15_2_RPM_norm.rds')[None]

# Families of interest (object 'fois_1' of the RData file), converted to an array
data_families_interest_D152 = np.array(
    pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_BIDDY_D15_2.RData')['fois_1']
)

# Family info (object 'family_info_1' of the RData file), converted to an array
data_families_info_D152 = np.array(
    pyreadr.read_r('../data/family_datasets/family_info/family_info_BIDDY_D15_2.RData')['family_info_1']
)

# Array version of the normalized data (the DataFrame itself is kept for the next steps)
D152_scran = np.array(D152_scran_df)

All cells that don't belong to a family of interest are removed independently in each data set.

In [82]:
#Keep only the DO cells that belong to a family of interest
# The helper's result is indexed below: element 0 is the filtered data, element 1 the family labels
DO_scran_df = select_family_interest_norm_data(data_families_info_DO, data_families_interest_DO, DO_scran_df)

# Family index of each kept cell: first column of the labels, cast to 32-bit integers
y_DO = DO_scran_df[1][:, 0].astype(np.int32)

# Filtered normalized data, plus its array version
DO_scran_df = DO_scran_df[0]
DO = np.array(DO_scran_df)
print(DO.shape)

(32285, 112)


In [83]:
#Keep only the DO_2 cells that belong to a family of interest
# The helper's result is indexed below: element 0 is the filtered data, element 1 the family labels
DO2_scran_df = select_family_interest_norm_data(data_families_info_DO2, data_families_interest_DO2, DO2_scran_df)

# Family index of each kept cell: first column of the labels, cast to 32-bit integers
y_DO2 = DO2_scran_df[1][:, 0].astype(np.int32)

# Filtered normalized data, plus its array version
DO2_scran_df = DO2_scran_df[0]
DO2 = np.array(DO2_scran_df)
print(DO2.shape)

(32285, 349)


In [84]:
#Keep only the D6 cells that belong to a family of interest
# The helper's result is indexed below: element 0 is the filtered data, element 1 the family labels
D6_scran_df = select_family_interest_norm_data(data_families_info_D6, data_families_interest_D6, D6_scran_df)

# Family index of each kept cell: first column of the labels, cast to 32-bit integers
y_D6 = D6_scran_df[1][:, 0].astype(np.int32)

# Filtered normalized data, plus its array version
D6_scran_df = D6_scran_df[0]
D6 = np.array(D6_scran_df)
print(D6.shape)

(32285, 116)


In [85]:
#Keep only the D6_2 cells that belong to a family of interest
# The helper's result is indexed below: element 0 is the filtered data, element 1 the family labels
D62_scran_df = select_family_interest_norm_data(data_families_info_D62, data_families_interest_D62, D62_scran_df)

# Family index of each kept cell: first column of the labels, cast to 32-bit integers
y_D62 = D62_scran_df[1][:, 0].astype(np.int32)

# Filtered normalized data, plus its array version
D62_scran_df = D62_scran_df[0]
D62 = np.array(D62_scran_df)
print(D62.shape)

(32285, 38)


In [86]:
#Keep only the D15 cells that belong to a family of interest
# The helper's result is indexed below: element 0 is the filtered data, element 1 the family labels
D15_scran_df = select_family_interest_norm_data(data_families_info_D15, data_families_interest_D15, D15_scran_df)

# Family index of each kept cell: first column of the labels, cast to 32-bit integers
y_D15 = D15_scran_df[1][:, 0].astype(np.int32)

# Filtered normalized data, plus its array version
D15_scran_df = D15_scran_df[0]
D15 = np.array(D15_scran_df)
print(D15.shape)

(32285, 88)


In [87]:
#Keep only the D15_2 cells that belong to a family of interest
# The helper's result is indexed below: element 0 is the filtered data, element 1 the family labels
D152_scran_df = select_family_interest_norm_data(data_families_info_D152, data_families_interest_D152, D152_scran_df)

# Family index of each kept cell: first column of the labels, cast to 32-bit integers
y_D152 = D152_scran_df[1][:, 0].astype(np.int32)

# Filtered normalized data, plus its array version
D152_scran_df = D152_scran_df[0]
D152 = np.array(D152_scran_df)
print(D152.shape)

(32285, 90)


The data sets are now fused into one.

In [88]:
#Fuse MEF normalized data (datasets are placed side by side, column-wise)
MEF_df = pd.concat([DO_scran_df, DO2_scran_df, D6_scran_df, D62_scran_df, D15_scran_df, D152_scran_df], axis=1)

#Fuse families info
# Each dataset's family indices are shifted by the largest index fused so far,
# so that datasets do not share family indices. The shift is applied to new
# arrays rather than in place (the per-dataset y_* arrays are left untouched),
# so re-running this cell always gives the same y_MEF.
# The order below must match the column order used in pd.concat above.
# NOTE(review): this offset only avoids overlap if each dataset's indices start
# at 1 or above; with 0-based indices the first family of a dataset would
# collide with the last family of the previous one -- confirm against
# select_family_interest_norm_data.
mef_label_parts = [y_DO]
for mef_labels in (y_DO2, y_D6, y_D62, y_D15, y_D152):
    mef_offset = np.hstack(mef_label_parts).max()
    mef_label_parts.append(mef_labels + mef_offset)
y_MEF = np.hstack(mef_label_parts)

In [89]:
#Keep only the genes expressed in at least a given percentage (default 50%) of the MEF cells
gene_expressed = filter_norm_data(MEF_df)
MEF_df = MEF_df[gene_expressed]

#Save the names of the retained genes (row labels of the filtered frame)
genes_interest = pd.DataFrame(MEF_df.index.values)
genes_interest.to_csv('../data/processed_data/MEFgenes_interest.csv', index=False)

#Array version of the filtered data
MEF = np.array(MEF_df)

#Write the preprocessed data: transposed expression matrix with the family labels as last column
MEFcsv = pd.DataFrame(np.c_[MEF.T, y_MEF])
MEFcsv.to_csv('../data/processed_data/MEF.csv', index=False)

# T CD8+ cells: KIMMERLING CD8

In [22]:
# Load the normalized data, the families of interest and the family info of the Kimmerling CD8 dataset

# The 'Unnamed: 0' column of the CSV becomes the (unnamed) row index
df_CD8_norm = (
    pd.read_csv('../data/family_datasets/data_norm/Kimmerling_CD8_norm.csv')
    .set_index('Unnamed: 0')
    .rename_axis(None)
)

# Keep only the genes (rows) with at least one truthy value, i.e. drop rows that
# contain nothing but zeros (DataFrame.any skips missing values by default)
nan_genes = df_CD8_norm.any(axis=1)
df_CD8_norm = df_CD8_norm[nan_genes]

# Family info and families of interest, read from their RData objects
df_CD8_fam_info = pyreadr.read_r('../data/family_datasets/family_info/family_info_Kimmerling_CD8.RData')['family_info_CD8']
df_CD8_fam_interest = pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Kimmerling_CD8.RData')['fois_CD8']

# Array versions of the three tables
CD8_norm, CD8_fam_interest, CD8_fam_info = (
    np.array(df_CD8_norm),
    np.array(df_CD8_fam_interest),
    np.array(df_CD8_fam_info),
)

In [23]:
#Remove all genes that are not expressed in at least a given percentage (default 50%) of the cells
gene_expressed = filter_norm_data(CD8_norm)
df_CD8_norm = df_CD8_norm[gene_expressed]

#Norm data with only the cells belonging to a family of interest
# The helper's result is indexed below: element 0 is the filtered data, element 1 the family labels
df_CD8_norm = select_family_interest_norm_data(CD8_fam_info, CD8_fam_interest, df_CD8_norm)
y = df_CD8_norm[1][:, 0].astype(np.int32)
df_CD8_norm = df_CD8_norm[0]

#Convert into array
CD8 = np.array(df_CD8_norm)

#Store the name of genes of interest
genes_interest = pd.DataFrame(df_CD8_norm.index.values)
genes_interest.to_csv('../data/processed_data/CD8genes_interest.csv', index=False)

#Remove cells not expressing anything (only 0 for every remaining gene)
# The empty cells (columns) are detected from the data instead of being
# hard-coded (the notebook previously deleted columns 36 and 39), so the cell
# stays correct if the input data or the filtering above changes.
empty_cells = np.flatnonzero((CD8 == 0).all(axis=0))
CD8 = np.delete(CD8, empty_cells, 1)
y = np.delete(y, empty_cells, 0)

#Create preprocessed data for GitHub: transposed expression matrix with the family labels as last column
CD8csv = pd.DataFrame(np.c_[CD8.T, y])
CD8csv.to_csv('../data/processed_data/CD8.csv', index=False)

# Leukemia cell line: KIMMERLING L1210

In [24]:
# Load the normalized data, the families of interest and the family info of the Kimmerling L1210 dataset
# (the variables keep the CD8 names of the previous section but hold L1210 data from here on)

# The 'Unnamed: 0' column of the CSV becomes the (unnamed) row index
df_CD8_norm = (
    pd.read_csv('../data/family_datasets/data_norm/Kimmerling_L1210_norm.csv')
    .set_index('Unnamed: 0')
    .rename_axis(None)
)

# Keep only the genes (rows) with at least one truthy value, i.e. drop rows that
# contain nothing but zeros (DataFrame.any skips missing values by default)
nan_genes = df_CD8_norm.any(axis=1)
df_CD8_norm = df_CD8_norm[nan_genes]

# Family info and families of interest, read from their RData objects
df_CD8_fam_info = pyreadr.read_r('../data/family_datasets/family_info/family_info_Kimmerling_L1210.RData')['family_info_L1210']
df_CD8_fam_interest = pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Kimmerling_L1210.RData')['fois_L1210']

# Array versions of the three tables
CD8_norm, CD8_fam_interest, CD8_fam_info = (
    np.array(df_CD8_norm),
    np.array(df_CD8_fam_interest),
    np.array(df_CD8_fam_info),
)

In [25]:
#Remove all genes that are not expressed in at least a given percentage (default 50%) of the cells
# (the CD8-named variables hold the L1210 data loaded in the previous cell)
gene_expressed = filter_norm_data(CD8_norm)
df_CD8_norm = df_CD8_norm[gene_expressed]

#Norm data with only the cells belonging to a family of interest
# The helper's result is indexed below: element 0 is the filtered data, element 1 the family labels
df_CD8_norm = select_family_interest_norm_data(CD8_fam_info, CD8_fam_interest, df_CD8_norm)
y = df_CD8_norm[1][:, 0].astype(np.int32)
df_CD8_norm = df_CD8_norm[0]

#Convert into array
CD8 = np.array(df_CD8_norm)

#Store the name of genes of interest
genes_interest = pd.DataFrame(df_CD8_norm.index.values)
genes_interest.to_csv('../data/processed_data/L120genes_interest.csv', index=False)

#Remove cells not expressing anything (only 0 for every remaining gene)
# The empty cells (columns) are detected from the data. The previous version
# deleted the hard-coded columns 36 and 39, which were copied from the CD8
# dataset and have no reason to be the empty cells of the L1210 dataset.
empty_cells = np.flatnonzero((CD8 == 0).all(axis=0))
CD8 = np.delete(CD8, empty_cells, 1)
y = np.delete(y, empty_cells, 0)

#Create preprocessed data for GitHub: transposed expression matrix with the family labels as last column
CD8csv = pd.DataFrame(np.c_[CD8.T, y])
CD8csv.to_csv('../data/processed_data/L120.csv', index=False)

# LK and LSK cells: Weinreb data

In [33]:
#LK
# The normalized data is a CSV file, so it is read with pandas: pyreadr.read_r
# is meant for R data files and returns a dict, which has no set_index method.
LK_scran_df = pd.read_csv('../data/family_datasets/data_norm/Weinreb_LK_D2_exp1_norm_lifted.csv')
LK_scran_df = LK_scran_df.set_index('Unnamed: 0')     #transform 'Unnamed: 0' column into row label
LK_scran_df = LK_scran_df.rename_axis(None)#change 'Unnamed: 0' row label into none

#Keep only the genes (rows) with at least one truthy value, i.e. drop rows containing only zeros
nan_genes = LK_scran_df.any(axis=1)
LK_scran_df = LK_scran_df[nan_genes]

# NOTE(review): every other dataset in this notebook reads the families of
# interest from a 'fois_*' object and the family info from a 'family_info_*'
# object; here the keys are 'WORK_clones' and 'fois_1' -- confirm that they
# are the right objects and are not swapped.
data_families_interest_LK = pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LK_D2_exp1.RData')
data_families_interest_LK = data_families_interest_LK['WORK_clones']

data_families_info_LK = pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LK_D2_exp1.RData')
data_families_info_LK = data_families_info_LK['fois_1']

# conversion into array
LK_scran = np.array(LK_scran_df)
data_families_interest_LK = np.array(data_families_interest_LK)
data_families_info_LK = np.array(data_families_info_LK)

In [None]:
#LK2
# The normalized data is a CSV file, so it is read with pandas: pyreadr.read_r
# is meant for R data files and returns a dict, which has no set_index method.
LK2_scran_df = pd.read_csv('../data/family_datasets/data_norm/Weinreb_LK_D2_exp2_norm_lifted.csv')
LK2_scran_df = LK2_scran_df.set_index('Unnamed: 0')     #transform 'Unnamed: 0' column into row label
LK2_scran_df = LK2_scran_df.rename_axis(None)#change 'Unnamed: 0' row label into none

#Keep only the genes (rows) with at least one truthy value, i.e. drop rows containing only zeros
nan_genes = LK2_scran_df.any(axis=1)
LK2_scran_df = LK2_scran_df[nan_genes]

# NOTE(review): every other dataset in this notebook reads the families of
# interest from a 'fois_*' object and the family info from a 'family_info_*'
# object; here the keys are 'WORK_clones' and 'fois_1' -- confirm that they
# are the right objects and are not swapped.
data_families_interest_LK2 = pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LK_D2_exp2.RData')
data_families_interest_LK2 = data_families_interest_LK2['WORK_clones']

data_families_info_LK2 = pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LK_D2_exp2.RData')
data_families_info_LK2 = data_families_info_LK2['fois_1']

# conversion into array
LK2_scran = np.array(LK2_scran_df)
data_families_interest_LK2 = np.array(data_families_interest_LK2)
data_families_info_LK2 = np.array(data_families_info_LK2)

In [39]:
#LSK
# The normalized data is a CSV file, so it is read with pandas: pyreadr.read_r
# is meant for R data files and returns a dict, which has no set_index method.
LSK_scran_df = pd.read_csv('../data/family_datasets/data_norm/Weinreb_LSK_D2_exp1_norm_lifted.csv')
LSK_scran_df = LSK_scran_df.set_index('Unnamed: 0')     #transform 'Unnamed: 0' column into row label
LSK_scran_df = LSK_scran_df.rename_axis(None)#change 'Unnamed: 0' row label into none

#Keep only the genes (rows) with at least one truthy value, i.e. drop rows containing only zeros
nan_genes = LSK_scran_df.any(axis=1)
LSK_scran_df = LSK_scran_df[nan_genes]

# NOTE(review): every other dataset in this notebook reads the families of
# interest from a 'fois_*' object and the family info from a 'family_info_*'
# object; here the keys are 'WORK_clones' and 'fois_1' -- confirm that they
# are the right objects and are not swapped.
data_families_interest_LSK = pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LSK_D2_exp1.RData')
data_families_interest_LSK = data_families_interest_LSK['WORK_clones']

data_families_info_LSK = pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LSK_D2_exp1.RData')
data_families_info_LSK = data_families_info_LSK['fois_1']

# conversion into array
LSK_scran = np.array(LSK_scran_df)
data_families_interest_LSK = np.array(data_families_interest_LSK)
data_families_info_LSK = np.array(data_families_info_LSK)

In [39]:
#LSK2
# The normalized data is a CSV file, so it is read with pandas: pyreadr.read_r
# is meant for R data files and returns a dict, which has no set_index method.
LSK2_scran_df = pd.read_csv('../data/family_datasets/data_norm/Weinreb_LSK_D2_exp2_norm_lifted.csv')
LSK2_scran_df = LSK2_scran_df.set_index('Unnamed: 0')     #transform 'Unnamed: 0' column into row label
LSK2_scran_df = LSK2_scran_df.rename_axis(None)#change 'Unnamed: 0' row label into none

#Keep only the genes (rows) with at least one truthy value, i.e. drop rows containing only zeros
nan_genes = LSK2_scran_df.any(axis=1)
LSK2_scran_df = LSK2_scran_df[nan_genes]

# NOTE(review): every other dataset in this notebook reads the families of
# interest from a 'fois_*' object and the family info from a 'family_info_*'
# object; here the keys are 'WORK_clones' and 'fois_1' -- confirm that they
# are the right objects and are not swapped.
data_families_interest_LSK2 = pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LSK_D2_exp2.RData')
data_families_interest_LSK2 = data_families_interest_LSK2['WORK_clones']

data_families_info_LSK2 = pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LSK_D2_exp2.RData')
data_families_info_LSK2 = data_families_info_LSK2['fois_1']

# conversion into array
LSK2_scran = np.array(LSK2_scran_df)
data_families_interest_LSK2 = np.array(data_families_interest_LSK2)
data_families_info_LSK2 = np.array(data_families_info_LSK2)

In [39]:
#LSK-LK mix
# The normalized data is a CSV file, so it is read with pandas: pyreadr.read_r
# is meant for R data files and returns a dict, which has no set_index method.
LSKmix_scran_df = pd.read_csv('../data/family_datasets/data_norm/Weinreb_LK_LSK_D2_exp3_norm_lifted.csv')
LSKmix_scran_df = LSKmix_scran_df.set_index('Unnamed: 0')     #transform 'Unnamed: 0' column into row label
LSKmix_scran_df = LSKmix_scran_df.rename_axis(None)#change 'Unnamed: 0' row label into none

#Keep only the genes (rows) with at least one truthy value, i.e. drop rows containing only zeros
nan_genes = LSKmix_scran_df.any(axis=1)
LSKmix_scran_df = LSKmix_scran_df[nan_genes]

# NOTE(review): every other dataset in this notebook reads the families of
# interest from a 'fois_*' object and the family info from a 'family_info_*'
# object; here the keys are 'WORK_clones' and 'fois_1' -- confirm that they
# are the right objects and are not swapped.
data_families_interest_LSKmix = pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LK_LSK_D2_exp3.RData')
data_families_interest_LSKmix = data_families_interest_LSKmix['WORK_clones']

data_families_info_LSKmix = pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LK_LSK_D2_exp3.RData')
data_families_info_LSKmix = data_families_info_LSKmix['fois_1']

# conversion into array
LSKmix_scran = np.array(LSKmix_scran_df)
data_families_interest_LSKmix = np.array(data_families_interest_LSKmix)
data_families_info_LSKmix = np.array(data_families_info_LSKmix)