In [None]:
# Author: Victor Curean and Aaron Wenteler

In [1]:
import pandas as pd
import scanpy as sc
import anndata as ad 
import pickle as pkl

In [2]:
def load_significant_perts(file_path):
    """Return the labels of the rows marked significant in a CSV results table.

    Parameters
    ----------
    file_path : str or path-like
        Comma-separated file containing a ``significant`` column and a first
        column that pandas reads as ``Unnamed: 0``.

    Returns
    -------
    list
        Values of the ``Unnamed: 0`` column for every row whose
        ``significant`` entry equals ``True``, in file order.
    """
    results = pd.read_csv(file_path, sep=",")
    is_significant = results['significant'] == True
    return results.loc[is_significant, 'Unnamed: 0'].tolist()

In [18]:
def format_norman(adata):
    """Add a ``condition`` column to ``adata.obs`` derived from ``guide_ids``.

    Empty guide ids become ``'ctrl'`` and the ``','`` separator inside a guide
    id string is rewritten to ``'+'`` (e.g. ``'AHR,KLF1'`` -> ``'AHR+KLF1'``).

    ``adata.obs`` is modified in place; nothing is returned.

    Parameters
    ----------
    adata : object exposing an ``obs`` DataFrame with a ``guide_ids`` column
    """
    # Work on plain Python objects: calling ``replace`` directly on a
    # categorical column is deprecated in pandas, and the result becomes an
    # object column after the string replacement anyway.
    guide_ids = adata.obs['guide_ids'].astype(object)
    condition = guide_ids.replace('', 'ctrl')
    # regex=False: ',' is a literal separator, independent of the pandas
    # version's default for ``regex``.
    adata.obs['condition'] = condition.str.replace(',', '+', regex=False)

In [4]:
def preprocess_adata(adata, min_gene_counts=None, min_cell_counts=None, sg=False, no_highly_var=2000):
    """
    Filter, normalise and log-transform a copy of ``adata`` and keep only its
    highly variable genes.

    The input is expected to have an ``obs['condition']`` column holding
    ``"ctrl"`` for negative controls or the perturbed gene symbol(s) otherwise.
    The caller's object is left untouched: all work is done on a copy.

    Parameters
    ----------
    adata : AnnData
        Object with an ``obs['condition']`` column.
    min_gene_counts : int or None
        Forwarded as ``min_counts`` to ``sc.pp.filter_genes``; skipped if None.
    min_cell_counts : int or None
        Forwarded as ``min_counts`` to ``sc.pp.filter_cells``; skipped if None.
    sg : bool
        When True, keep only cells whose condition names a single entry, i.e.
        contains neither ``","`` nor ``"+"``.
    no_highly_var : int
        Forwarded as ``n_top_genes`` to ``sc.pp.highly_variable_genes``.

    Returns
    -------
    AnnData
        A standalone object (not a view) restricted to the highly variable genes.
    """

    adata = adata.copy()

    # filter genes
    if min_gene_counts is not None:
        sc.pp.filter_genes(adata, min_counts=min_gene_counts)

    # filter cells
    if min_cell_counts is not None:
        sc.pp.filter_cells(adata, min_counts=min_cell_counts)

    # filter only single gene perturbations and controls
    if sg:
        # Combinations appear as "A,B" in the raw guide ids and as "A+B" once
        # format_norman has been applied, so both separators mark a combination.
        is_combination = adata.obs['condition'].astype(str).str.contains(r"[,+]", regex=True)
        # Copy so the in-place transforms below operate on real data, not a view.
        adata = adata[~is_combination.to_numpy(), :].copy()

    # apply preprocessing transformation
    sc.pp.normalize_total(adata, inplace=True)
    sc.pp.log1p(adata)
    sc.pp.highly_variable_genes(adata, n_top_genes=no_highly_var)
    highly_variable_genes = adata.var_names[adata.var['highly_variable']]

    # Return a copy rather than a view: downstream tools that write into the
    # object would otherwise trigger implicit view-to-actual conversions.
    return adata[:, highly_variable_genes].copy()

In [13]:
def get_degs_for_adata(adata, covariates, group_by_col='condition', reference='ctrl', save_to_file=None):
    """Run a Wilcoxon differential-expression test and return the result table.

    ``sc.tl.rank_genes_groups`` is called on ``adata`` with the groups listed
    in ``covariates`` (taken from ``obs[group_by_col]``) compared against
    ``reference``, using ``use_raw=False`` and ``pts=True``. The table is then
    collected for all groups with ``sc.get.rank_genes_groups_df``.

    Parameters
    ----------
    adata : AnnData
        Object the test is run on.
    covariates : list
        Group labels to test against the reference group.
    group_by_col : str
        Name of the ``obs`` column holding the group labels.
    reference : str
        Label of the reference group.
    save_to_file : str or None
        If given, the table is also written there as a tab-separated file so
        the computation can be skipped next time.

    Returns
    -------
    pandas.DataFrame
        The unfiltered table returned by ``sc.get.rank_genes_groups_df``.
    """
    sc.tl.rank_genes_groups(
        adata,
        groupby=group_by_col,
        groups=covariates,
        reference=reference,
        method='wilcoxon',
        use_raw=False,
        pts=True,
    )
    deg_table = sc.get.rank_genes_groups_df(adata, group=None)

    if save_to_file is None:
        return deg_table

    # cache the table on disk
    deg_table.to_csv(save_to_file, sep="\t")
    return deg_table
    

In [6]:
def get_pert_gene_dict(deg_df, ntop_genes, adj_pval_cutoff=0.05):
    """Map each group in a DEG table to its top genes by absolute score.

    Parameters
    ----------
    deg_df : pandas.DataFrame
        Table with ``group``, ``names``, ``scores`` and ``pvals_adj`` columns.
        It is not modified.
    ntop_genes : int
        Maximum number of genes kept per group.
    adj_pval_cutoff : float or None
        Rows with ``pvals_adj`` above this value are dropped before ranking;
        pass None to skip the filter.

    Returns
    -------
    dict
        ``{group: [names...]}`` with names ordered by decreasing ``|scores|``.
        Groups with no rows left after the p-value filter are absent.
    """
    # Rank on a derived frame so the caller's DataFrame does not silently gain
    # an 'abs_scores' column.
    ranked = deg_df.assign(abs_scores=deg_df['scores'].abs())

    if adj_pval_cutoff is not None:
        ranked = ranked[ranked['pvals_adj'] <= adj_pval_cutoff]

    pert_gene_dict = dict()

    # unique() keeps first-appearance order, which fixes the dict's key order
    for group in ranked['group'].unique():
        group_rows = ranked[ranked['group'] == group]
        top_rows = group_rows.nlargest(ntop_genes, 'abs_scores')
        pert_gene_dict[group] = list(top_rows['names'])

    return pert_gene_dict
        

In [19]:
# Read the AnnData object from the h5ad file (path is relative to the working directory).
adata = ad.read_h5ad("../data/norman_2019_raw.h5ad")

In [20]:
# Labels of the rows flagged as significant in etest_results_double_perts.csv.
perturbations_of_interest = load_significant_perts("etest_results_double_perts.csv")
# Show how many perturbations were selected.
len(perturbations_of_interest)

150

In [21]:
# Derive obs['condition'] from the guide ids (modifies `adata` in place).
format_norman(adata)

# Preprocess, test every selected perturbation against 'ctrl', then keep the
# 20 highest-|score| genes per perturbation.
adata_pp = preprocess_adata(
    adata,
    min_gene_counts=5,
    min_cell_counts=None,
    no_highly_var=2000,
)
deg_df = get_degs_for_adata(
    adata_pp,
    covariates=perturbations_of_interest,
    group_by_col='condition',
    reference='ctrl',
)
res = get_pert_gene_dict(deg_df, ntop_genes=20)

  adata.obs['condition'] = adata.obs['guide_ids'].replace('', 'ctrl')
  view_to_actual(adata)
  df[key] = c
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfo

In [23]:
# Display the perturbation -> gene-name list dictionary built by get_pert_gene_dict.
res

{'AHR': ['ENSG00000088992',
  'ENSG00000198886',
  'ENSG00000198899',
  'ENSG00000142669',
  'ENSG00000134321',
  'ENSG00000100097',
  'ENSG00000198938',
  'ENSG00000130203',
  'ENSG00000065978',
  'ENSG00000250361',
  'ENSG00000251562',
  'ENSG00000069482',
  'ENSG00000130429',
  'ENSG00000198763',
  'ENSG00000147454',
  'ENSG00000106546',
  'ENSG00000157404',
  'ENSG00000077063',
  'ENSG00000160789',
  'ENSG00000268621'],
 'AHR+KLF1': ['ENSG00000034510',
  'ENSG00000196565',
  'ENSG00000141744',
  'ENSG00000130656',
  'ENSG00000105610',
  'ENSG00000100097',
  'ENSG00000142227',
  'ENSG00000206172',
  'ENSG00000213934',
  'ENSG00000163191',
  'ENSG00000142669',
  'ENSG00000090013',
  'ENSG00000147454',
  'ENSG00000102575',
  'ENSG00000160789',
  'ENSG00000158869',
  'ENSG00000069482',
  'ENSG00000198886',
  'ENSG00000167815',
  'ENSG00000088992'],
 'ARRDC3': ['ENSG00000279669',
  'ENSG00000167996',
  'ENSG00000268621',
  'ENSG00000187837',
  'ENSG00000245532',
  'ENSG00000008018',
  '

In [26]:
# Pickle `res` to ../../data/splits/perturb/norman_2/de_test/deg_pert_dict.pkl.
# NOTE(review): this comment previously named ../../data/split/perturb/norman/de_test/;
# the path in the code below is the one actually written - confirm it is the intended destination.
with open("../../data/splits/perturb/norman_2/de_test/deg_pert_dict.pkl", "wb") as f:
    pkl.dump(res, f)