In [1]:
%cd "../"
%pwd

import numpy as np
import matplotlib.pyplot as plt

import io 
import pandas as pd 
import pyreadr

from load_data import *

C:\Users\alexi\Desktop\Memory_genes\src


In [43]:
def process_norm(name:str, norm_file:str, family_info_file:str, family_interest_file:str, Weinreb:bool=False, Flip:bool=False):

    #Load data
    norm_path = '../data/family_datasets/data_norm/' + norm_file
    norm  = pyreadr.read_r(norm_path)
    norm = norm[None]

    family_info_path = '../data/family_datasets/Family_info/' + family_info_file + '.RData'
    family_info = pyreadr.read_r(family_info_path)
    
    if Weinreb:
        family_info = np.array(family_info['WORK_clones'])
        if Flip: 
            family_info[:, [1, 0]] = family_info[:, [0, 1]]
            
        families, count = np.unique(family_info[:,0], return_counts=True)
        family_interest = families[np.logical_and(count > 1, count < 6)]
    
        #Norm data with only the cells belonging to the family of interest
        cells_interest = []
        for fam in family_interest:
            cell = family_info[fam == family_info[:,0]][:,1]
            cells_interest.append(cell)
        cells_interest = [item for sublist in cells_interest for item in sublist]

        norm = norm.loc[:,cells_interest]
        y = pd.DataFrame(np.zeros((norm.shape[1],)), index= norm.columns)
        family_info = pd.DataFrame(family_info[:,0], index = family_info[:,1])
        y.loc[cells_interest] = family_info.loc[cells_interest]
        
    
    if Weinreb == False:
        family_info = family_info['family_info_1']
        
        families_interest = pyreadr.read_r('../data/family_datasets/family_interest/families_of_interest_AE3_nocellcyclesplit.RData')
        families_interest = families_interest['fois_1']
      
        norm = select_family_interest_norm_data(np.array(family_info), np.array(families_interest), norm)
        y = norm[1][:,0].astype(np.int32)
        norm = norm[0]
    
    #Create preprocess data 
    norm.to_csv('../data/merged_data/' + name + '.csv', index=True)
    pd.DataFrame(y).to_csv('../data/merged_data/y_' + name + '.csv', index=False)

In [44]:
names = ['AE3', 'AE4', 'AE7', 'BIDDY_D0', 'BIDDY_D0_2', 'BIDDY_D6', 'BIDDY_D6_2', 'BIDDY_D15', 'BIDDY_D15_2',
        'LK_D2_exp1_library_d2_1', 'LK_D2_exp1_library_d2_2', 'LK_D2_exp1_library_d2_3', 'LK_LSK_D2_exp3_library_d2_1', 
        'LK_LSK_D2_exp3_library_d2_2', 'LK_LSK_D2_exp3_library_d2_3', 'LK_LSK_D2_exp3_library_d2_4', 
        'LK_LSK_D2_exp3_library_d2_5', 'LK_LSK_D2_exp3_library_d2_6', 'LSK_D2_exp1_library_LSK_d2_1', 'LSK_D2_exp1_library_LSK_d2_2', 'LSK_D2_exp1_library_LSK_d2_3',
       'LSK_D2_exp2_library_d2A_1', 'LSK_D2_exp2_library_d2A_2', 'LSK_D2_exp2_library_d2A_3' , 'LSK_D2_exp2_library_d2A_4', 'LSK_D2_exp2_library_d2A_5', 
       'LSK_D2_exp2_library_d2B_1','LSK_D2_exp2_library_d2B_2', 'LSK_D2_exp2_library_d2B_3', 'LSK_D2_exp2_library_d2B_4', 'LSK_D2_exp2_library_d2B_5', 'LSKmix_exp1_d2_1', 'LSKmix_exp1_d2_2', 'LSKmix_exp1_d2_3', 'LSKmix_exp2_d2_6', 'LSKmix_exp2_d2_4', 'LSKmix_exp2_d2_5']

norm_files = ['AE3_scran_norm.rds', 'AE4_scran_norm.rds', 'AE7_RPM_norm.rds', 'BIDDY_D0_RPM_norm.rds', 'BIDDY_D0_2_RPM_norm.rds', 'BIDDY_D6_RPM_norm.rds', 'BIDDY_D6_2_RPM_norm.rds', 'BIDDY_D15_RPM_norm.rds', 'BIDDY_D15_2_RPM_norm.rds', 
             'Weinreb_LK_D2_exp1_library_d2_1_norm.rds', 'Weinreb_LK_D2_exp1_library_d2_2_norm.rds', 'Weinreb_LK_D2_exp1_library_d2_3_norm.rds', 
             'Weinreb_LK_LSK_D2_exp3_library_d2_1_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_2_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_3_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_4_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_5_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_6_norm.rds', 
             'Weinreb_LSK_D2_exp1_library_LSK_d2_1_norm.rds', 'Weinreb_LSK_D2_exp1_library_LSK_d2_2_norm.rds', 'Weinreb_LSK_D2_exp1_library_LSK_d2_3_norm.rds', 
             'Weinreb_LSK_D2_exp2_library_d2A_1_norm.rds', 'Weinreb_LSK_D2_exp2_library_d2A_2_norm.rds', 'Weinreb_LSK_D2_exp2_library_d2A_3_norm.rds', 'Weinreb_LSK_D2_exp2_library_d2A_4_norm.rds', 'Weinreb_LSK_D2_exp2_library_d2A_5_norm.rds', 
             'Weinreb_LSK_D2_exp2_library_d2B_1_norm.rds', 'Weinreb_LSK_D2_exp2_library_d2B_2_norm.rds', 'Weinreb_LSK_D2_exp2_library_d2B_3_norm.rds', 'Weinreb_LSK_D2_exp2_library_d2B_4_norm.rds', 'Weinreb_LSK_D2_exp2_library_d2B_5_norm.rds', 
              'Weinreb_LK_LSK_D2_exp3_library_d2_1_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_2_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_3_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_4_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_5_norm.rds', 'Weinreb_LK_LSK_D2_exp3_library_d2_6_norm.rds']

family_info_files = ['family_info_AE3_nocellcyclesplit','family_info_AE4_nocellcyclesplit','family_info_AE7_nocellcyclesplit',
                     'family_info_BIDDY_D0','family_info_BIDDY_D0_2','family_info_BIDDY_D6_V2','family_info_BIDDY_D6_2','family_info_BIDDY_D15_V3','family_info_BIDDY_D15_2',
                     'family_info_Weinreb_LK_D2_exp1_library_d2_1','family_info_Weinreb_LK_D2_exp1_library_d2_2','family_info_Weinreb_LK_D2_exp1_library_d2_3',
                     'family_info_Weinreb_LK_LSK_D2_exp3_library_d2_1','family_info_Weinreb_LK_LSK_D2_exp3_library_d2_2','family_info_Weinreb_LK_LSK_D2_exp3_library_d2_3','family_info_Weinreb_LK_LSK_D2_exp3_library_d2_4','family_info_Weinreb_LK_LSK_D2_exp3_library_d2_5','family_info_Weinreb_LK_LSK_D2_exp3_library_d2_6',
                     'family_info_Weinreb_LSK_D2_exp1_library_LSK_d2_1','family_info_Weinreb_LSK_D2_exp1_library_LSK_d2_2','family_info_Weinreb_LSK_D2_exp1_library_LSK_d2_3',
                     'family_info_Weinreb_LSK_D2_exp2_library_d2A_1','family_info_Weinreb_LSK_D2_exp2_library_d2A_2','family_info_Weinreb_LSK_D2_exp2_library_d2A_3','family_info_Weinreb_LSK_D2_exp2_library_d2A_4','family_info_Weinreb_LSK_D2_exp2_library_d2A_5',
                     'family_info_Weinreb_LSK_D2_exp2_library_d2B_1','family_info_Weinreb_LSK_D2_exp2_library_d2B_2','family_info_Weinreb_LSK_D2_exp2_library_d2B_3','family_info_Weinreb_LSK_D2_exp2_library_d2B_4','family_info_Weinreb_LSK_D2_exp2_library_d2B_5'
                    'family_info_Weinreb_LK_LSK_D2_exp3_library_d2_1', 'family_info_Weinreb_LK_LSK_D2_exp3_library_d2_2', 'family_info_Weinreb_LK_LSK_D2_exp3_library_d2_3', 'family_info_Weinreb_LK_LSK_D2_exp3_library_d2_4','family_info_Weinreb_LK_LSK_D2_exp3_library_d2_5', 'family_info_Weinreb_LK_LSK_D2_exp3_library_d2_6']

family_interest_files = ['families_of_interest_AE3_nocellcyclesplit','families_of_interest_AE4_nocellcyclesplit','families_of_interest_AE7_nocellcyclesplit',
                         'families_of_interest_BIDDY_D0','families_of_interest_BIDDY_D0_2','families_of_interest_BIDDY_D6_V2','families_of_interest_BIDDY_D6_2','families_of_interest_BIDDY_D15_V3','families_of_interest_BIDDY_D15_2','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','','']

Weinreb = [0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
Flip = [0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0]

In [45]:
for i in range (len(names)-6,len(names)):
    print(names[i])
    process_norm(names[i], norm_files[i], family_info_files[i], family_interest_files[i], Weinreb[i], Flip[i])

LSKmix_exp1_d2_1


KeyError: "None of [Index(['TACCGCTC_AGGCAGTT_53946', 'GTGCGAAG_TCCTTATT_54634',\n       'CTGAGCGT_CCTGACAC_52743', 'AGCTCCAC_CTTCTTCG_53575',\n       'ATATGCAA_GCTTACCT_52095', 'GGTTAGGG_GGGAGGTA_53137',\n       'GTAATCTG_GGTTAGTG_52628', 'AGCTTCGA_ATATACCT_53107',\n       'CAGTTTAA_AAATCGTT_51988', 'TCCAGAAG_TGCGTATC_54034',\n       ...\n       'TCATTTCA_TTCCGCTC_52377', 'AAACAAAC_GAAGGCTT_53283',\n       'GCGTATTC_AAAGTCGG_52400', 'CTGTTTCC_GCAAGGAC_52824',\n       'GGGCATCA_TCGACGGT_53038', 'ACAGGCCA_AGGAGCTT_53796',\n       'CCTATTTA_CTCGATGC_54183', 'AAAGCCCG_AGCTACGG_54509',\n       'ACAATCTT_GTGTAACC_53425', 'GTACGCTT_TTGTGACT_54388'],\n      dtype='object', length=117)] are in the [columns]"

In [48]:
cells_interest

NameError: name 'cells_interest' is not defined