In [2]:
%cd "../"
%pwd

import numpy as np
import matplotlib.pyplot as plt
import io 
import pandas as pd 
import pyreadr

from load_data import *
from pred_score import *

/Users/dormann/Documents/GitHub/src


# LK and LSK cells: Weinreb data

In [33]:
def predict_onlibrary(norm_path:str, family_info_path:str, optimized_genes_path:str):
    
    #Load data
    norm_path = '../data/family_datasets/Weinreb_libraries_norm_lifted/' + norm_path
    norm  = pyreadr.read_r(norm_path)
    norm = norm[None]

    family_info_path = '../data/family_datasets/Family_info/' + family_info_path
    family_info = pyreadr.read_r(family_info_path)
    family_info = np.array(family_info['WORK_clones'])
    
    families, count = np.unique(family_info[:,0], return_counts=True)
    family_interest = families[np.logical_and(count > 1, count < 6)]
    
    #Norm data with only the cells belonging to the family of interest
    norm = select_family_interest_norm_data(family_info, family_interest, norm)
    y  = norm[1][:,0].astype(np.int32)
    norm = norm[0]
    
    #Predict and evaluate
    gene_optimized = np.squeeze(pd.read_csv (optimized_genes_path))

    #Only keep the optimized genes
    print(norm.shape)
    norm = norm[gene_optimized]
    print(norm.shape)

    subset = np.ones((len(gene_optimized),))
    subsets = subsampling_genes(subset, 101, 0.25)
    
    #Predicting once 
    model = FamiliesClusters(np.unique(family),compute_precision,True)
    pred = model.fit_predict(np.array(AE3),np.array(family))
    print(model.score, model.recovery)
    
    #Predicting 101 times and majority vote (cutoff = 0.5)
    model = EnsemblingHierarchical(np.unique(family),compute_precision,True,subsets = subsets, ensembling='voting', threshold_voting = 0.5)
    result  = model.fit_predict(X = np.array(AE3), y= np.array(family))
    print(model.score, model.recovery)
    

In [34]:
libraries_LK = ['Weinreb_LK_D2_exp1_library_d2_1_norm.rds', 'Weinreb_LK_D2_exp1_library_d2_2_norm.rds', 'Weinreb_LK_D2_exp1_library_d2_3_norm.rds', 'Weinreb_LK_D2_exp1_library_LK_d2_norm.rds', 'Weinreb_LK_D2_exp2_library_d2_1_IV5_norm.rds', 'Weinreb_LK_D2_exp2_library_d2_2_IV5_norm.rds',
'Weinreb_LK_D2_exp2_library_d2_3_IV5_norm.rds', 'Weinreb_LK_D2_exp2_library_d2_4_IV5_norm.rds', 'Weinreb_LK_D2_exp2_library_d2_5_IV5_norm.rds']
family_info_LK = ['family_info_Weinreb_LK_D2_exp1_library_d2_1.RData', 'family_info_Weinreb_LK_D2_exp1_library_d2_2.RData', 'family_info_Weinreb_LK_D2_exp1_library_d2_3.RData', 'family_info_Weinreb_LK_D2_exp1_library_LK_d2.RData', 'family_info_Weinreb_LK_D2_exp2_library_d2_1_IV5.RData', 'family_info_Weinreb_LK_D2_exp2_library_d2_2_IV5.RData',
'family_info_Weinreb_LK_D2_exp2_library_d2_3_IV5.RData','family_info_Weinreb_LK_D2_exp2_library_d2_4_IV5.RData','family_info_Weinreb_LK_D2_exp2_library_d2_5_IV5.RData','']

In [35]:
predict_onlibrary(libraries_LK[0], family_info_LK[0], '../data/optimized_subsets/LKgenes_bestMIM.csv')

[['14' 'CAGGTTGC_TCATCCTT_19478']
 ['20' 'AGGTATAT_AAGATTGT_18765']
 ['21' 'TCAGCCTC_CACAACAG_18341']
 ['21' 'CTCACATC_TCACCGAG_18408']
 ['27' 'CTTTATCC_GTCGTCGT_19621']
 ['77' 'AGGGACTG_CGACTCCT_18541']
 ['77' 'TAGCCACA_ATGTTGGC_18864']
 ['77' 'TAGGTACG_GCTTACCT_19027']
 ['81' 'GGGATTAC_CGGGCTTT_20084']
 ['117' 'TACCCTGC_ACCCATAT_19407']
 ['126' 'AGGCCGAA_GGGAAGGT_19866']
 ['127' 'GCGTATTC_CTTGGTGT_18520']
 ['164' 'AAGGATGA_ACGAAACG_19756']
 ['166' 'CCTCATGA_TGATAACA_19625']
 ['177' 'GTCAGACC_AGATGGCT_19331']
 ['177' 'AGAAAGTG_GTTGTCAT_20047']
 ['187' 'GATGCAGA_GGGAACCT_19389']
 ['187' 'CGAAGAAG_TTAGGACC_19984']
 ['191' 'AAAGCCCG_AGAGACTA_19800']
 ['231' 'CAAGGAAT_TAGTCTAG_18344']
 ['254' 'CCTATTTA_GGGTTGGT_19335']
 ['258' 'GCTAAGTT_CGTGCTAG_19462']
 ['275' 'CTCCTCCA_TGACCAGT_19061']
 ['281' 'TGCACCAG_ATATCTTC_19030']
 ['281' 'CTCGGTGA_ACCACGCT_19883']
 ['284' 'AGAGAGAC_CTTCAGGT_19920']
 ['319' 'CATTTCTA_GCGTTGCT_19202']
 ['337' 'AACGATTT_TGTAGTTT_19385']
 ['339' 'ACGACGAC_GGGAAGGT_19

KeyError: "None of [Index(['ENSMUSG00000098912', 'ENSMUSG00000054676', 'ENSMUSG00000035085',\n       'ENSMUSG00000010277', 'ENSMUSG00000021807', 'ENSMUSG00000076437',\n       'ENSMUSG00000040204', 'ENSMUSG00000052760', 'ENSMUSG00000057191',\n       'ENSMUSG00000058355',\n       ...\n       'ENSMUSG00000064357', 'ENSMUSG00000064351', 'ENSMUSG00000064354',\n       'ENSMUSG00000064358', 'ENSMUSG00000064370', 'ENSMUSG00000064341',\n       'ENSMUSG00000064345', 'ENSMUSG00000064360', 'ENSMUSG00000064363',\n       'ENSMUSG00000064367'],\n      dtype='object', length=770)] are in the [columns]"

# LSK cells:

In [3]:
#LSK
LSK_scran_df = pd.read_csv('../data/family_datasets/data_norm/Weinreb_LSK_D2_exp1_norm_lifted.csv')
LSK_scran_df = LSK_scran_df.set_index('Unnamed: 0')     #transform 'Unnamed: 0' column into row label
LSK_scran_df = LSK_scran_df.rename_axis(None)#change 'Unnamed: 0' row label into none


data_families_interest_LSK = pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LSK_D2_exp1.RData')
data_families_interest_LSK = data_families_interest_LSK['fois_1']

data_families_info_LSK = pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LSK_D2_exp1.RData')
data_families_info_LSK = data_families_info_LSK['WORK_clones']

# conversion into array 
LSK_scran = np.array(LSK_scran_df)
data_families_interest_LSK = np.array(data_families_interest_LSK)
data_families_info_LSK = np.array(data_families_info_LSK)  
data_families_info_LSK[:,0] = data_families_info_LSK[:,0].astype(int)

In [4]:
#Norm data with only the cells belonging to the family of interest
LSK_scran_df = select_family_interest_norm_data(data_families_info_LSK, data_families_interest_LSK, LSK_scran_df)
y_LSK = LSK_scran_df[1]
y_LSK = y_LSK[:,0]
y_LSK = y_LSK.astype(np.int32)
LSK_scran_df = LSK_scran_df[0]

LSK = np.array(LSK_scran_df)
print(np.shape(LSK))

(25289, 781)


In [None]:
#LSK2
LSK2_scran_df = pd.read_csv('../data/family_datasets/data_norm/Weinreb_LSK_D2_exp2_norm_lifted.csv')
LSK2_scran_df = LSK2_scran_df.set_index('Unnamed: 0')     #transform 'Unnamed: 0' column into row label
LSK2_scran_df = LSK2_scran_df.rename_axis(None)#change 'Unnamed: 0' row label into none

data_families_interest_LSK2 = pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LSK_D2_exp2.RData')
data_families_interest_LSK2= data_families_interest_LSK2['fois_1']

data_families_info_LSK2 = pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LSK_D2_exp2.RData')
data_families_info_LSK2 = data_families_info_LSK2['WORK_clones']

# conversion into array 
LSK2_scran = np.array(LSK2_scran_df)
data_families_interest_LSK2 = np.array(data_families_interest_LSK2)
data_families_info_LSK2 = np.array(data_families_info_LSK2)  
data_families_info_LSK2[:,0] = data_families_info_LSK2[:,0].astype(int)

In [None]:
#Norm data with only the cells belonging to the family of interest
LSK2_scran_df = select_family_interest_norm_data(data_families_info_LSK2, data_families_interest_LSK2, LSK2_scran_df)
y_LSK2 = LSK2_scran_df[1]
y_LSK2 = y_LSK2[:,0]
y_LSK2 = y_LSK2.astype(np.int32)
LSK2_scran_df = LSK2_scran_df[0]

LSK2 = np.array(LSK2_scran_df)
print(np.shape(LSK2))

In [None]:
#Fuse LK normalized data
LSK_df = pd.concat([LSK_scran_df, LSK2_scran_df], axis=1)

#Remove genes with non-defined values and genes with only 0
nan_genes = LSK_scran_df.any(axis=1)
LSK_scran_df = LSK_scran_df[nan_genes]

#Fuse families info
y_LSK2 += max(y_LSK) #Add the max indice of families in first dataset so that there is no overlap between families indices.
y_LSK_fuse = np.hstack((y_LSK, y_LSK2))

In [None]:
#Remove all genes that are not expressed in at least percentage (default 50%) of the LSK cells
gene_expressed = filter_norm_data(LSK_df,0.05)
LSK_df = LSK_df[gene_expressed]

#Store the name of genes of interest
genes_interest = LSK_df.index.values #get name of genes
genes_interest = pd.DataFrame(genes_interest)
genes_interest.to_csv('../data/processed_data/LSKgenes_interest.csv', index=False)

#Convert into array
LSK = np.array(LSK_df)

#Create preprocess data 
LSKcsv = np.c_[LSK.T,y_LSK_fuse]
LSKcsv = pd.DataFrame(LSKcsv)
LSKcsv.to_csv('../data/processed_data/LSK.csv', index=False)

In [None]:
print(LSKcsv.shape)

# LK_LSKmix cells:

In [35]:
#LSK-LK mix
LSKmix_scran_df = pd.read_csv('../data/family_datasets/data_norm/Weinreb_LK_LSK_D2_exp3_norm_lifted.csv')
LSKmix_scran_df = LSKmix_scran_df.set_index('Unnamed: 0')     #transform 'Unnamed: 0' column into row label
LSKmix_scran_df = LSKmix_scran_df.rename_axis(None)#change 'Unnamed: 0' row label into none

#Remove genes with non-defined values and genes with only 0
nan_genes = LSKmix_scran_df.any(axis=1)
LSKmix_scran_df = LSKmix_scran_df[nan_genes]

data_families_interest_LSKmix = pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_Weinreb_LK_LSK_D2_exp3.RData')
data_families_interest_LSKmix = data_families_interest_LSKmix['fois_1']

data_families_info_LSKmix = pyreadr.read_r('../data/family_datasets/family_info/family_info_Weinreb_LK_LSK_D2_exp3.RData')
data_families_info_LSKmix = data_families_info_LSKmix['WORK_clones']

# conversion into array 
LSKmix_scran = np.array(LSKmix_scran_df)
data_families_interest_LSKmix = np.array(data_families_interest_LSKmix)
data_families_info_LSKmix = np.array(data_families_info_LSKmix)  
data_families_info_LSKmix[:,0] = data_families_info_LSKmix[:,0].astype(int)

In [36]:
#Norm data with only the cells belonging to the family of interest
LSKmix_scran_df = select_family_interest_norm_data(data_families_info_LSKmix, data_families_interest_LSKmix, LSKmix_scran_df)
y_LSKmix = LSKmix_scran_df[1]
y_LSKmix = y_LSKmix[:,0]
y_LSKmix = y_LSKmix.astype(np.int32)
LSKmix_scran_df = LSKmix_scran_df[0]

LSKmix = np.array(LSKmix_scran_df)
print(np.shape(LSKmix))

(23461, 1023)


In [37]:
#Remove all genes that are not expressed in at least percentage (default 50%) of the LSKmix cells
gene_expressed = filter_norm_data(LSKmix_scran_df,0.05)
LSKmix_scran_df = LSKmix_scran_df[gene_expressed]

#Store the name of genes of interest
genes_interest = LSKmix_scran_df.index.values #get name of genes
genes_interest = pd.DataFrame(genes_interest)
genes_interest.to_csv('../data/processed_data/LSKmixgenes_interest.csv', index=False)

#Convert into array
LSKmix = np.array(LSKmix_scran_df)

#Create preprocess data 
LSKmixcsv = np.c_[LSKmix.T,y_LSKmix]
LSKmixcsv = pd.DataFrame(LSKmixcsv)
LSKmixcsv.to_csv('../data/processed_data/LSKmix.csv', index=False)