In [1]:
import os
import sys
import pandas as pd
import numpy as np
from functools import reduce


In [2]:
# Extend the module search path with project directories (given as relative
# paths) so the project-local modules imported below can be resolved.
# NOTE(review): presumably data_factory / global_utils live under one of these
# directories — confirm which entries are actually required.
import sys
sys.path.append('../../../')
sys.path.append('../../infras/cellMix/')
sys.path.append('../../infras/cytof_data/')
sys.path.append('../../infras/')
sys.path.append('../../infras/dashboards/')
sys.path.append('../../experiments/')
sys.path.append('../../experiments/pipeline/')
sys.path.append('../../preprocess/intra_variance/')
sys.path.append('../../models/cell_proportions/')
sys.path.append('../../measures/cell_proportions_measures/')
sys.path.append('../../preprocess/cell_specifics/')

# Project-local imports (not standard library / PyPI packages).
from data_factory import DataFactory
from global_utils import GlobalUtils

In [3]:
# File-name prefix of the differential-expression CSV that
# get_per_cell_results reads ("<intensity_type>_DEP_DE.csv").
intensity_type = "INTENSITY"

In [4]:
def _comma_decimal_to_float(series):
    """Convert a column of decimal-comma strings (e.g. ``"0,05"``) to floats.

    Columns that pandas already parsed as numbers are simply cast to float;
    missing values stay NaN instead of raising.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series.astype(float)
    return series.astype(str).str.replace(",", ".", regex=False).astype(float)


def get_per_cell_results(intansity_type, is_sumed_cell_types=""):
    """Load pairwise p-values and regroup them per cell.

    Reads ``"<intansity_type>_DEP_DE.csv"`` (``;``-separated, indexed by the
    ``name`` and ``ID`` columns) and keeps only the columns whose name contains
    ``"_p.val"``. Each such column is expected to be named
    ``"<first>_vs_<second>_p.val..."``; its values are registered symmetrically,
    i.e. under ``first`` (column ``second``) and under ``second`` (column
    ``first``).

    Parameters
    ----------
    intansity_type : str
        Prefix (optionally including a directory) of the CSV file to read.
    is_sumed_cell_types : str, optional
        Unused; kept so existing callers keep working.

    Returns
    -------
    dict[str, pandas.DataFrame]
        For every cell name, a float DataFrame indexed by ``(name, ID)`` with
        one column per cell it was compared against. Empty when the file has no
        ``"_p.val"`` columns.

    Raises
    ------
    ValueError
        If a ``"_p.val"`` column name does not split into exactly two parts
        around ``"_vs_"``.
    """
    de_df = pd.read_csv(f"{intansity_type}_DEP_DE.csv", sep=";")
    de_df = de_df.set_index(["name", "ID"])

    pval_columns = [col for col in de_df.columns if "_p.val" in col]

    # cell -> {other cell -> p-value Series}; insertion order follows the
    # column order of the file (first cell of a pair before the second).
    per_cell_columns = {}
    for col in pval_columns:
        pvals = _comma_decimal_to_float(de_df[col])
        first, second = col.split("_p.val")[0].split("_vs_")

        # The comparison is symmetric, so register it for both members.
        for cell, other in ((first, second), (second, first)):
            per_cell_columns.setdefault(cell, {})[other] = pvals

    # Built after the loop so the result exists even without p-value columns.
    return {cell: pd.DataFrame(columns) for cell, columns in per_cell_columns.items()}

def get_sig_genes(per_cell_de, list_of_cells, n_of_genes=100):
    """Collect the union of per-cell signature genes.

    For every cell in ``list_of_cells`` the p-values against the *other* listed
    cells are scanned with increasingly loose thresholds (20 evenly spaced
    values from 0.001 to 0.1). A gene passes a threshold when all of its
    p-values are below it. The scan stops at the first threshold that yields at
    least ``n_of_genes`` genes:

    * more than ``2 * n_of_genes`` genes: keep the ``2 * n_of_genes`` with the
      smallest mean p-value;
    * otherwise keep all of them.

    If no threshold yields enough genes, the genes passing the loosest
    threshold are kept; if there are none, the ``n_of_genes // 2`` genes with
    the smallest mean p-value (unfiltered) are used instead.

    Parameters
    ----------
    per_cell_de : dict[str, pandas.DataFrame]
        Per-cell p-value tables (rows: genes, columns: other cells).
    list_of_cells : sequence of str
        Cells to consider; each must be a key of ``per_cell_de`` and a column
        of every other listed cell's table.
    n_of_genes : int, optional
        Target number of genes per cell.

    Returns
    -------
    pandas.Index
        Union of the selected gene labels; an empty index when
        ``list_of_cells`` is empty.
    """
    # reduce() without an initial value raises on an empty sequence.
    if len(list_of_cells) == 0:
        return pd.Index([])

    pval_thresholds = np.linspace(0.001, 0.1, 20)
    selected_per_cell = []

    for cell in list_of_cells:
        # Only comparisons against the other requested cells are relevant.
        other_cells = [c for c in list_of_cells if c != cell]
        pvals = per_cell_de[cell][other_cells]

        for threshold in pval_thresholds:
            # Keep genes whose p-value is below the threshold for every
            # comparison (NaN compares as False, so such rows are dropped).
            significant = pvals[(pvals < threshold).all(axis=1)]
            if significant.shape[0] >= n_of_genes:
                if significant.shape[0] > 2 * n_of_genes:
                    chosen = significant.mean(axis=1).nsmallest(2 * n_of_genes).index
                else:
                    chosen = significant.index
                break
        else:
            # No threshold produced enough genes; `significant` now holds the
            # result for the loosest threshold.
            if significant.empty:
                chosen = pvals.mean(axis=1).nsmallest(int(n_of_genes / 2)).index
            else:
                chosen = significant.index

        selected_per_cell.append(chosen)

    return reduce(lambda x, y: x.union(y), selected_per_cell)

# Load the per-cell p-value tables, then gather signature genes for at most
# the first 30 cells (in the order the tables were built).
per_cell_de = get_per_cell_results(intensity_type)

list_of_cells = list(per_cell_de.keys())[:30]
genes = get_sig_genes(per_cell_de, list_of_cells)

In [5]:
# Load the IBD "all vs" data through the project DataFactory, selecting
# "Intensity" with log2_transformation enabled.
# NOTE(review): the meaning of the two returned objects (A_all_vs, B_all_vs) is
# not visible here — confirm against DataFactory.load_IBD_all_vs.
data_factory = DataFactory()
A_all_vs,B_all_vs = data_factory.load_IBD_all_vs("Intensity",log2_transformation=True)

  from ipykernel import kernelapp as app


In [6]:
# Row index of A_all_vs, kept under a shorter name.
a_genes = A_all_vs.index

In [7]:
def _build_mapping_between_genes_idxs(genes,a_genes):
    poss = {}
    for gene in genes :
        for a_gene in a_genes : 
            if (a_gene[1] is np.nan) or (gene[1] is np.nan):
                continue
            if  (a_gene[0] in gene[0] ) or (gene[0] in a_gene[0]) or  (gene[0] in a_gene[0] ) or (a_gene[0] in gene[0]):
                if gene not in poss.keys():
                    poss[gene] = []
                poss[gene].append(a_gene)

    genes_to_change = [] 
    for (k,vs) in filter(lambda x: len(x[1]) >1,poss.items()) :
        for v in vs : 
            if (v[0] == k[0]) and (v[1] == k[1]):
                genes_to_change.append((k,v))

    for g in genes_to_change :
        poss[g]=[g]

    final  = {o:n[0] for o,n in poss.items()}
    return final

In [8]:
# Map each selected signature gene key to a matching key of A_all_vs.index.
mapp = _build_mapping_between_genes_idxs(genes,A_all_vs.index)

In [9]:
# Display the mapping (signature gene key -> matched key in A_all_vs.index).
mapp

{('A0A087WVM2;Q8N6Q3;Q8N6Q3-2', 'CD177'): ('A0A087WVM2', 'CD177'),
 ('A1L0T0;M0R026', 'ILVBL'): ('A1L0T0', 'ILVBL'),
 ('A4D1P6;C9J1X0;A4D1P6-2;A4D1P6-3', 'WDR91'): ('A4D1P6', 'WDR91'),
 ('A6NHR9;A6NHR9-2;J3KTL8', 'SMCHD1'): ('A6NHR9', 'SMCHD1'),
 ('A7E2V4;S4R3H3;S4R393;S4R410;A7E2V4-5;A7E2V4-3;A7E2V4-4;A7E2V4-2;S4R3B3;H7C453',
  'ZSWIM8'): ('A7E2V4', 'ZSWIM8'),
 ('A9Z1X7;Q8IYB3;Q8IYB3-2', 'SRRM1'): ('A9Z1X7', 'SRRM1'),
 ('B1ALD9;Q15063-3;Q15063-4;Q15063-7', 'POSTN'): ('B1ALD9', 'POSTN'),
 ('B4DL54;P49356-2', 'FNTB'): ('P49356', 'FNTB'),
 ('D6RFG8;P27707;D6RCP9', 'DCK'): ('D6RFG8', 'DCK'),
 ('E7ES19;P35443', 'THBS4'): ('E7ES19', 'THBS4'),
 ('E7EVJ5;Q96F07-2;H7C229;E7EW33', 'CYFIP2'): ('Q96F07', 'CYFIP2'),
 ('E9PJ55', 'TCP11L1'): ('E9PJ55', 'TCP11L1'),
 ('F5H5P2;P12694;P12694-2;F5GXU9', 'BCKDHA'): ('F5H5P2', 'BCKDHA'),
 ('G5E9W9', 'GIMAP4'): ('G5E9W9', 'GIMAP4'),
 ('J3KPF0;F8VWT9;Q9Y4D8;Q9Y4D8-4;Q9Y4D8-2', 'HECTD4'): ('J3KPF0', 'HECTD4'),
 ('K7ERI9;P02654;K7EJI9;K7ELM9;K7EPF9', 'APOC1'):

In [10]:
# Keys of the mapping, i.e. the signature gene keys that found a match.
t = mapp.keys()
