# baseGRN evaluation
**Authorship:** Adam Klie (last updated: 08/15/2023)<br>
***
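**Description:** This notebook loads the base GRN produced by the `base_grn_construction` analysis and evaluates it, starting with a ranking of the TF columns by how many peak–gene entries each one is linked to.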

# Set-up

In [1]:
# Global imports
import os
import sys
import yaml
import datetime
import logging
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns

# Local imports
sys.path.append("/cellar/users/aklie/data/igvf/bin")
from utils import make_dirs

# Celloracle
import celloracle as co

  def twobit_to_dna(twobit: int, size: int) -> str:
  def dna_to_twobit(dna: str) -> int:
  def twobit_1hamming(twobit: int, size: int) -> List[int]:
which: no R in (/cm/local/apps/environment-modules/4.4.0//bin:/cellar/users/aklie/opt/google-cloud-sdk/bin:/cellar/users/aklie/opt/miniconda3/bin:/cellar/users/aklie/opt/miniconda3/condabin:/cellar/users/aklie/opt/meme/bin:/cellar/users/aklie/opt/meme/libexec/meme-5.5.0:/cellar/users/aklie/opt/apache-ant-1.10.12/bin:/cellar/users/aklie/opt/deltasvm_script/deltasvm.pl:/cellar/users/aklie/opt/lsgkm-svr/bin:/cellar/users/aklie/opt/gatk-4.2.6.1:/cellar/users/mpagadal/Programs/PRSICE/PRSice.R:/cellar/users/aklie/opt/plink:/cellar/users/aklie/opt/plink2:/cellar/users/aklie/opt/confusion_matrix:/cellar/users/aklie/bin/motif_finding.sh:/cellar/users/aklie/opt/edirect:/cellar/users/aklie/opt/ucsc:/cellar/users/mpagadal/Programs/bcftools-1.11:/cellar/users/aklie/opt/homer/bin:/cellar/users/aklie/opt/Gene2vec/src:/cellar/users/aklie/opt:/cellar/user

0.14.0


In [2]:
# visualization settings
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

plt.rcParams['figure.figsize'] = [6, 4.5]
plt.rcParams["savefig.dpi"] = 300

In [3]:
# Dataset identifier and the root directory that holds all CellOracle results
dataset_name = "igvf_sc-islet_10X-Multiome"
results_dir = "/cellar/users/aklie/projects/igvf/beta_cell_networks/infer_grns/celloracle/results"

# Run identifiers; directories follow <results_dir>/<dataset_name>/<date>/<analysis>
in_date, in_analysis = "10Aug23", "base_grn_construction"
out_date, out_analysis = "10Aug23", "base_grn_evaluation"

# Inputs: where the base GRN construction step wrote its results
in_dir = os.path.join(results_dir, dataset_name, in_date, in_analysis)

# Outputs: where this evaluation writes its results
out_dir = os.path.join(results_dir, dataset_name, out_date, out_analysis)

print(in_dir)

# Kept as the last expression so the cell still displays make_dirs' return value
# (presumably it also creates out_dir if missing — confirm in utils.make_dirs)
make_dirs(out_dir)

/cellar/users/aklie/projects/igvf/beta_cell_networks/infer_grns/celloracle/results/igvf_sc-islet_10X-Multiome/10Aug23/base_grn_construction


'/cellar/users/aklie/projects/igvf/beta_cell_networks/infer_grns/celloracle/results/igvf_sc-islet_10X-Multiome/10Aug23/base_grn_evaluation'

In [4]:
# Record the run configuration and package versions in a YAML file inside
# out_dir so the analysis can be traced back to its inputs and environment.
data_params = {
    "dataset_name": dataset_name,
    "results_dir": results_dir,
    "in_analysis": in_analysis,
    "in_date": in_date,
    "in_dir": in_dir,
    "out_analysis": out_analysis,
    "out_date": out_date,
    "out_dir": out_dir,
}
version_params = {
    # sys.version looks like "3.10.12 (main, ...) [GCC ...]"; the first
    # whitespace-delimited token is the full version. A fixed-width slice
    # ([:5]) truncates two-digit minor/patch numbers (e.g. "3.10.").
    "Python": sys.version.split()[0],
    "pandas": pd.__version__,
    "numpy": np.__version__,
    "scanpy": sc.__version__,
    "seaborn": sns.__version__,
    "celloracle": co.__version__,
}
params = {"data": data_params, "versions": version_params}

# Build the output path once so the existence check and the write cannot diverge
params_path = os.path.join(out_dir, "base_grn_evaluation_params.yaml")
if not os.path.exists(params_path):
    with open(params_path, "w") as f:
        yaml.dump(params, f)
else:
    # An existing parameter file is deliberately left untouched
    print(f"{params_path} already exists, will not overwrite")

# Load data

In [5]:
# Read the base GRN table stored as parquet in the input directory
base_grn_path = os.path.join(in_dir, "base_GRN_dataframe.parquet")
base_GRN = pd.read_parquet(base_grn_path)

In [6]:
# Display the loaded frame; the rendered view below is truncated (note the "..." row and column)
base_GRN

Unnamed: 0,peak_id,gene_short_name,9430076C15RIK,AC002126.6,AC012531.1,AC226150.2,AFP,AHR,AHRR,AIRE,...,ZNF784,ZNF8,ZNF816,ZNF85,ZSCAN10,ZSCAN16,ZSCAN22,ZSCAN26,ZSCAN31,ZSCAN4
0,chr10_100009436_100010350,DNMBP,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,chr10_100081010_100082022,CPN1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,chr10_100185574_100186492,ERLIN1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,chr10_100229112_100229961,CHUK,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,chr10_100267214_100268148,CWF19L1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23429,chrY_3001977_3004457,ZFY-AS1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23430,chrY_4999812_5000682,PCDH11Y,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
23431,chrY_6910276_6911213,TBL1Y,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
23432,chrY_7272200_7273051,PRKY,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [9]:
# Skip the first two columns (peak_id, gene_short_name in the preview above);
# the remaining columns are summed one by one.
tf_columns = base_GRN.iloc[:, 2:]

# Column totals — assuming 0/1 entries as in the preview, this counts the
# rows flagged for each column (TODO confirm the values are strictly binary)
tf_totals = tf_columns.sum(axis=0)

# Show the 20 columns with the largest totals
tf_totals.sort_values(ascending=False).head(20)

EGR1      20338
SP2       19687
SP5       19636
E2F1      19509
SP1       19473
E2F4      19405
PAX5      19256
SP3       19249
NRF1      19141
SP9       19044
THAP1     19017
SP8       19003
ELF1      18935
YY1       18667
ATF3      18607
EP300     18569
SP7       18489
SP6       18438
TAF1      18027
ZBTB7A    17813
dtype: int64