In [1]:
%cd "../"
%pwd

import numpy as np
import matplotlib.pyplot as plt
import io 
import pandas as pd 
import pyreadr

from load_data import *
from pred_score import *

/Users/dormann/Documents/GitHub/src


# LK and LSK cells: Weinreb data

In [2]:
def predict_onlibrary(norm_path:str, family_info_path:str, optimized_genes_path:str):

    #Load data
    norm_path = '../data/family_datasets/Weinreb_libraries_norm_lifted/' + norm_path
    norm  = pyreadr.read_r(norm_path)
    norm = norm[None]

    family_info_path = '../data/family_datasets/Family_info/' + family_info_path
    family_info = pyreadr.read_r(family_info_path)
    family_info = np.array(family_info['WORK_clones'])
    
    families, count = np.unique(family_info[:,0], return_counts=True)
    family_interest = families[np.logical_and(count > 1, count < 6)]
    
    #Norm data with only the cells belonging to the family of interest
    cells_interest = []
    for fam in family_interest:
        cell = family_info[fam == family_info[:,0]][:,1]
        cells_interest.append(cell)
    cells_interest = [item for sublist in cells_interest for item in sublist]
        
    norm = norm.loc[:,cells_interest]
    y = pd.DataFrame(np.zeros((norm.shape[1],)), index= norm.columns)
    family_info = pd.DataFrame(family_info[:,0], index = family_info[:,1])
    y.loc[cells_interest] = family_info.loc[cells_interest]
    
    #Predict and evaluate
    gene_optimized = np.squeeze(pd.read_csv (optimized_genes_path))

    #Only keep the optimized genes
    norm = norm.loc[norm.index.intersection(gene_optimized),:].T

    subset = np.ones((norm.shape[1],))
    subsets = subsampling_genes(subset, 101, 0.25)
    
    #Predicting once 
    model = FamiliesClusters(np.unique(y),compute_precision,True)
    pred = model.fit_predict(np.array(norm),np.array(y))
    print('precision: ', model.score_, ' recovery: ', model.recovery)
    
    #Predicting 101 times and majority vote (cutoff = 0.5)
    model = EnsemblingHierarchical(np.unique(y),compute_precision,True,subsets = subsets, ensembling='voting', threshold_voting = 0.5)
    result  = model.fit_predict(X = np.array(norm), y= np.array(y))
    print('precision: ', model.score_, ' recovery: ', model.recovery)
    

In [16]:
libraries_LK = ['Weinreb_LK_D2_exp1_library_d2_1_norm.rds', 'Weinreb_LK_D2_exp1_library_d2_2_norm.rds', 'Weinreb_LK_D2_exp1_library_d2_3_norm.rds', 'Weinreb_LK_D2_exp1_library_LK_d2_norm.rds']
family_info_LK = ['family_info_Weinreb_LK_D2_exp1_library_d2_1.RData', 'family_info_Weinreb_LK_D2_exp1_library_d2_2.RData', 'family_info_Weinreb_LK_D2_exp1_library_d2_3.RData', 'family_info_Weinreb_LK_D2_exp1_library_LK_d2.RData']

In [17]:
for i in range(0,len(libraries_LK)):
    print('-----------------------------------------')
    print(libraries_LK[i])
    predict_onlibrary(libraries_LK[i], family_info_LK[i], '../data/optimized_subsets/LKgenes_bestMIM.csv')

-----------------------------------------
Weinreb_LK_D2_exp1_library_d2_1_norm.rds
precision:  0.5161290322580645  recovery:  0.5299145299145299
precision:  0.6086956521739131  recovery:  0.39316239316239315
-----------------------------------------
Weinreb_LK_D2_exp1_library_d2_2_norm.rds
precision:  0.6  recovery:  0.6185567010309279
precision:  0.7560975609756098  recovery:  0.422680412371134
-----------------------------------------
Weinreb_LK_D2_exp1_library_d2_3_norm.rds
precision:  0.2833333333333333  recovery:  0.6091370558375635
precision:  0.4444444444444444  recovery:  0.27411167512690354
-----------------------------------------
Weinreb_LK_D2_exp1_library_LK_d2_norm.rds
precision:  0.4492753623188406  recovery:  0.5655737704918032
precision:  0.5897435897435898  recovery:  0.319672131147541


# LSK cells:

In [18]:
libraries_LSK = ['Weinreb_LSK_D2_exp1_library_LSK_d2_1_norm.rds','Weinreb_LSK_D2_exp1_library_LSK_d2_2_norm.rds','Weinreb_LSK_D2_exp1_library_LSK_d2_3_norm.rds','Weinreb_LSK_D2_exp2_library_d2A_1_norm.rds','Weinreb_LSK_D2_exp2_library_d2A_2_norm.rds',
                 'Weinreb_LSK_D2_exp2_library_d2A_3_norm.rds','Weinreb_LSK_D2_exp2_library_d2A_4_norm.rds','Weinreb_LSK_D2_exp2_library_d2A_5_norm.rds','Weinreb_LSK_D2_exp2_library_d2B_1_norm.rds','Weinreb_LSK_D2_exp2_library_d2B_2_norm.rds','Weinreb_LSK_D2_exp2_library_d2B_3_norm.rds',
                 'Weinreb_LSK_D2_exp2_library_d2B_4_norm.rds', 'Weinreb_LSK_D2_exp2_library_d2B_5_norm.rds']
family_info_LSK = ['family_info_Weinreb_LSK_D2_exp1_library_LSK_d2_1.RData','family_info_Weinreb_LSK_D2_exp1_library_LSK_d2_2.RData','family_info_Weinreb_LSK_D2_exp1_library_LSK_d2_3.RData','family_info_Weinreb_LSK_D2_exp2_library_d2A_1.RData',
                   'family_info_Weinreb_LSK_D2_exp2_library_d2A_2.RData','family_info_Weinreb_LSK_D2_exp2_library_d2A_3.RData','family_info_Weinreb_LSK_D2_exp2_library_d2A_4.RData','family_info_Weinreb_LSK_D2_exp2_library_d2A_5.RData','family_info_Weinreb_LSK_D2_exp2_library_d2B_1.RData',
                   'family_info_Weinreb_LSK_D2_exp2_library_d2B_2.RData','family_info_Weinreb_LSK_D2_exp2_library_d2B_3.RData','family_info_Weinreb_LSK_D2_exp2_library_d2B_4.RData','family_info_Weinreb_LSK_D2_exp2_library_d2B_5.RData']

In [19]:
for i in range(0,len(libraries_LSK)):
    print('-----------------------------------------')
    print(libraries_LSK[i])
    predict_onlibrary(libraries_LSK[i], family_info_LSK[i], '../data/optimized_subsets/LSKgenes_bestANOVA.csv')

-----------------------------------------
Weinreb_LSK_D2_exp1_library_LSK_d2_1_norm.rds
precision:  0.10317460317460317  recovery:  0.6961325966850829
precision:  0.25  recovery:  0.15469613259668508
-----------------------------------------
Weinreb_LSK_D2_exp1_library_LSK_d2_2_norm.rds
precision:  0.12121212121212122  recovery:  0.72
precision:  0.34  recovery:  0.18181818181818182
-----------------------------------------
Weinreb_LSK_D2_exp1_library_LSK_d2_3_norm.rds
precision:  0.1440677966101695  recovery:  0.7564102564102564
precision:  0.37037037037037035  recovery:  0.17307692307692307
-----------------------------------------
Weinreb_LSK_D2_exp2_library_d2A_1_norm.rds


ValueError: The number of observations cannot be determined on an empty distance matrix.

# LK/LSK mix

In [20]:
libraries_LSKmix = ['Weinreb_LK_LSK_D2_exp3_library_d2_1_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_2_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_3_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_4_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_5_norm.rds', 
                    'Weinreb_LK_LSK_D2_exp3_library_d2_6_norm.rds']
family_info_LSKmix = ['family_info_Weinreb_LK_LSK_D2_exp3_library_d2_1.RData','family_info_Weinreb_LK_LSK_D2_exp3_library_d2_2.RData','family_info_Weinreb_LK_LSK_D2_exp3_library_d2_3.RData','family_info_Weinreb_LK_LSK_D2_exp3_library_d2_4.RData',
                      'family_info_Weinreb_LK_LSK_D2_exp3_library_d2_5.RData', 'family_info_Weinreb_LK_LSK_D2_exp3_library_d2_6.RData']

In [22]:
for i in range(0,len(libraries_LSK)):
    print('-----------------------------------------')
    print(libraries_LSKmix[i])
    predict_onlibrary(libraries_LSKmix[1], family_info_LSKmix[1], '../data/optimized_subsets/LSKmixgenes_bestANOVA.csv')

-----------------------------------------
Weinreb_LK_LSK_D2_exp3_library_d2_1_norm.rds


ValueError: The number of observations cannot be determined on an empty distance matrix.