# pycisTopic analysis

Full dataset, using SCREEN regions.

In [1]:
import pycisTopic

pycisTopic.__version__

'1.0.1.dev21+g8aa75d8.d20220628'

In [2]:
import warnings

warnings.filterwarnings("ignore")
warnings.simplefilter("ignore")

In [3]:
import pickle
import pandas as pd

In [4]:
import glob
from collections import OrderedDict
import numpy as np

%load_ext lab_black

In [5]:
!pwd

/lustre1/project/stg_00090/scatac_benchmark/fixedcells_downsample_series


In [6]:
import os

# Absolute path of the project directory. The relative paths used in later
# cells (globs, model directories, generated job scripts) are resolved
# against it once the working directory has been switched.
wdir = "/lustre1/project/stg_00090/scatac_benchmark/fixedcells_downsample_series"
# Make the project directory the current working directory of the kernel.
os.chdir(wdir)

In [7]:
# Collect the cisTopic object pickles stored in the "cistopic_objects"
# folder of every directory whose name ends in "k", in sorted path order.
cto_paths = glob.glob("*k/cistopic_objects/*cto.pkl")
cto_paths.sort()

# Sample ID (file name up to the first "__") -> path of the cisTopic object.
cistopic_obj_path_dict = {
    path.rsplit("/", 1)[-1].partition("__")[0]: path for path in cto_paths
}
cistopic_obj_path_dict

{'BIO_ddseq_1.10k': '10k/cistopic_objects/BIO_ddseq_1.10k__cto.pkl',
 'BIO_ddseq_2.10k': '10k/cistopic_objects/BIO_ddseq_2.10k__cto.pkl',
 'BIO_ddseq_3.10k': '10k/cistopic_objects/BIO_ddseq_3.10k__cto.pkl',
 'BIO_ddseq_4.10k': '10k/cistopic_objects/BIO_ddseq_4.10k__cto.pkl',
 'BRO_mtscatac_1.10k': '10k/cistopic_objects/BRO_mtscatac_1.10k__cto.pkl',
 'BRO_mtscatac_2.10k': '10k/cistopic_objects/BRO_mtscatac_2.10k__cto.pkl',
 'CNA_10xmultiome_1.10k': '10k/cistopic_objects/CNA_10xmultiome_1.10k__cto.pkl',
 'CNA_10xmultiome_2.10k': '10k/cistopic_objects/CNA_10xmultiome_2.10k__cto.pkl',
 'CNA_10xv11_1.10k': '10k/cistopic_objects/CNA_10xv11_1.10k__cto.pkl',
 'CNA_10xv11_2.10k': '10k/cistopic_objects/CNA_10xv11_2.10k__cto.pkl',
 'CNA_10xv11_3.10k': '10k/cistopic_objects/CNA_10xv11_3.10k__cto.pkl',
 'CNA_10xv11_4.10k': '10k/cistopic_objects/CNA_10xv11_4.10k__cto.pkl',
 'CNA_10xv11_5.10k': '10k/cistopic_objects/CNA_10xv11_5.10k__cto.pkl',
 'CNA_10xv2_1.10k': '10k/cistopic_objects/CNA_10xv2_1.10k

In [8]:
# Largest number of topics to model.
n_topics_final = 20
# Topic counts to fit, as the strings "2" .. "20" (n_topics_final - 1 values).
# Strings are used because they are compared with the <N> part of the
# "Topic<N>.pkl" intermediate file names.
topics_final = {str(k) for k in range(2, n_topics_final + 1)}
# Value passed as the core count (-c) to the model-fitting script.
n_cores = 36
# Number of iterations, kept as a string for file names and the command line.
n_iter = "400"

In [9]:
# Container set-up: bind mounts, Singularity image and model-fitting script.
mounts = "/lustre1,/staging,/data,/vsc-hard-mounts,/scratch"
sif = "../0_resources/cistopic_image/20220815_pycistopic.sif"
script = "../0_resources/scripts/runModels_lda_mallet.py"

# Model outputs, logs and generated job scripts all live under this directory.
f_mod_dir = "models__screen__mallet"
os.makedirs(os.path.join(wdir, f_mod_dir), exist_ok=True)

# Suffix used in the log and job-script file names.
ver = str(n_topics_final)

# For every sample, write a shell script that fits the topic models which do
# not yet have a "Topic<N>.pkl" file in the sample's intermediate directory.
for sample, infile in cistopic_obj_path_dict.items():
    outfile = os.path.join(f_mod_dir, f"{sample}__models_{n_iter}_iter.pkl")

    intermediate_dir = f"{f_mod_dir}/model_intermediates/{sample}_model_intermediates/"
    intermediate_abs = os.path.join(wdir, intermediate_dir)
    if os.path.exists(intermediate_abs):
        print(f"{intermediate_dir} already exists!")
    else:
        os.makedirs(intermediate_abs)

    # Decide what is left to run by comparing topic IDs, not by counting
    # files: topics_final holds n_topics_final - 1 entries, so a file count
    # equal to n_topics_final never signals a complete sample.
    topics_detected = {
        name.replace("Topic", "").replace(".pkl", "")
        for name in os.listdir(intermediate_abs)
    }
    # Comma-separated topic counts in numeric order, e.g. "2,3,10".
    topics_todo = ",".join(sorted(topics_final - topics_detected, key=int))

    if topics_todo:
        # The temporary directory is only needed when models will be fitted.
        model_tmp_dir = f"{f_mod_dir}/model_tmp/{sample}_model_tmp/"
        os.makedirs(os.path.join(wdir, model_tmp_dir), exist_ok=True)

        # Only stdout of the command is redirected into the log file.
        out_log = f"{f_mod_dir}/{sample}.models_out_log{ver}.txt"
        cmd = f"echo {sample} && cd {wdir} && singularity exec -B {mounts} {sif} python {script} -i {infile} -o {outfile} -nt {topics_todo} -c {n_cores} -it {n_iter} -a 50 -abt True -e 0.1 -ebt False -sp {intermediate_dir} -s 555 -td {model_tmp_dir} > {out_log}"

        print(cmd)
        sh_path = f"{f_mod_dir}/{sample}.runmodels{ver}.sh"
        with open(sh_path, "w") as file:
            file.write("#!/bin/sh\n")
            # Terminate the command line so the script ends with a newline.
            file.write(cmd + "\n")
    else:
        print("All topics done.")

    print("\n")

models__screen__mallet/model_intermediates/BIO_ddseq_1.10k_model_intermediates/ already exists!
All topics done.


models__screen__mallet/model_intermediates/BIO_ddseq_2.10k_model_intermediates/ already exists!
All topics done.


models__screen__mallet/model_intermediates/BIO_ddseq_3.10k_model_intermediates/ already exists!
All topics done.


models__screen__mallet/model_intermediates/BIO_ddseq_4.10k_model_intermediates/ already exists!
All topics done.


models__screen__mallet/model_intermediates/BRO_mtscatac_1.10k_model_intermediates/ already exists!
All topics done.


models__screen__mallet/model_intermediates/BRO_mtscatac_2.10k_model_intermediates/ already exists!
All topics done.


models__screen__mallet/model_intermediates/CNA_10xmultiome_1.10k_model_intermediates/ already exists!
All topics done.


models__screen__mallet/model_intermediates/CNA_10xmultiome_2.10k_model_intermediates/ already exists!
All topics done.


models__screen__mallet/model_intermediates/CNA_10xv11_1.10k_mo

In [10]:
!cat ../0_resources/scripts/runModels_lda_mallet.py

import pickle
import sys
import argparse
import os
from pycisTopic.cistopic_class import *
from pycisTopic.lda_models import *

def make_argument_parser():
    """
    Creates an ArgumentParser to read the options for this script from
    sys.argv
    """
    parser = argparse.ArgumentParser(
        description="Run topic models.",)
    parser.add_argument('--inputcisTopic_obj', '-i', type=str, required=True,
                        help='Path to cisTopic object pickle file.')
    parser.add_argument('--output', '-o', type=str, required=True,
                        help='Path to save final model list.')
    parser.add_argument('--n_topics', '-nt', type=str, required=True, nargs='+',
                        help='Txt file containing selected topic id.')
    parser.add_argument('--n_cpu', '-c', type=int, required=True,
                        help = 'Number of cores')
    parser.add_argument('--n_iter', '-it', type=int, required=False, default=150,
                        help = 'Numbe

Then, from a shell, submit the generated job scripts:

```
# Submit every generated job script whose name ends in "20.sh".
# -p is the short form of --partition and takes an argument, so a bare "-p"
# in front of --job-name would consume the job name; the partition is given
# once via --partition and the task count once via --ntasks.
# NOTE(review): the generated scripts pass "-c 36" to the model runner while
# this request allocates 24 CPUs per task -- confirm which value is intended.
for script in models__screen__mallet/*20.sh
do
    echo "$script"
    sbatch --job-name=models --cluster=wice --partition=dedicated_big_bigmem -A lp_big_wice_cpu --time=6:00:00 --ntasks=1 --cpus-per-task=24 --mem=300GB "$script"
done
```

Then, combine the models:

In [11]:
# Sample ID -> directory holding that sample's per-topic model pickles.
# The ID is the part of the directory name before "_model_intermediates".
intermediate_dirs = sorted(glob.glob(f"{f_mod_dir}/model_intermediates/*"))
intermediate_dict = {
    d.split("/")[-1].split("_model_intermediates")[0]: d for d in intermediate_dirs
}
intermediate_dict

{'BIO_ddseq_1.10k': 'models__screen__mallet/model_intermediates/BIO_ddseq_1.10k_model_intermediates',
 'BIO_ddseq_1.15k': 'models__screen__mallet/model_intermediates/BIO_ddseq_1.15k_model_intermediates',
 'BIO_ddseq_1.20k': 'models__screen__mallet/model_intermediates/BIO_ddseq_1.20k_model_intermediates',
 'BIO_ddseq_1.25k': 'models__screen__mallet/model_intermediates/BIO_ddseq_1.25k_model_intermediates',
 'BIO_ddseq_1.30k': 'models__screen__mallet/model_intermediates/BIO_ddseq_1.30k_model_intermediates',
 'BIO_ddseq_1.35k': 'models__screen__mallet/model_intermediates/BIO_ddseq_1.35k_model_intermediates',
 'BIO_ddseq_1.5k': 'models__screen__mallet/model_intermediates/BIO_ddseq_1.5k_model_intermediates',
 'BIO_ddseq_2.10k': 'models__screen__mallet/model_intermediates/BIO_ddseq_2.10k_model_intermediates',
 'BIO_ddseq_2.15k': 'models__screen__mallet/model_intermediates/BIO_ddseq_2.15k_model_intermediates',
 'BIO_ddseq_2.20k': 'models__screen__mallet/model_intermediates/BIO_ddseq_2.20k_mode

In [21]:
# Merge the per-topic model pickles of each sample into a single list,
# ordered by number of topics, and save it under f_mod_dir.
# One model per topic count in 2..n_topics_final is expected.
n_models_expected = n_topics_final - 1

for sample, directory in intermediate_dict.items():
    # List the directory once so the count, the check and the load agree.
    file_list = os.listdir(directory)
    nmodels = len(file_list)
    print(f"{nmodels} models in {directory}")

    save_path = f"{f_mod_dir}/{sample}__models_{n_iter}_iter.pkl"
    if os.path.exists(save_path):
        print(f"\t{save_path} exists, skipping")
        continue

    if nmodels != n_models_expected:
        print(f"\texpected {n_models_expected} models, found {nmodels}; not merging")
        continue

    # Sort numerically on <N> in "Topic<N>.pkl" so that, for example,
    # "Topic10.pkl" comes after "Topic2.pkl".
    file_list_sorted = sorted(
        file_list, key=lambda name: int(name.split("Topic")[-1].split(".pkl")[0])
    )

    print(f"\tsaving at {save_path}")

    models_merged = []
    for file in file_list_sorted:
        # The context manager closes each file handle right after loading.
        with open(f"{directory}/{file}", "rb") as f:
            models_merged.append(pickle.load(f))

    with open(save_path, "wb") as f:
        pickle.dump(models_merged, f, protocol=4)

19 models in models__screen__mallet/model_intermediates/BIO_ddseq_1.10k_model_intermediates
	saving at models__screen__mallet/BIO_ddseq_1.10k__models_400_iter.pkl
19 models in models__screen__mallet/model_intermediates/BIO_ddseq_1.15k_model_intermediates
	saving at models__screen__mallet/BIO_ddseq_1.15k__models_400_iter.pkl
19 models in models__screen__mallet/model_intermediates/BIO_ddseq_1.20k_model_intermediates
	saving at models__screen__mallet/BIO_ddseq_1.20k__models_400_iter.pkl
19 models in models__screen__mallet/model_intermediates/BIO_ddseq_1.25k_model_intermediates
	saving at models__screen__mallet/BIO_ddseq_1.25k__models_400_iter.pkl
19 models in models__screen__mallet/model_intermediates/BIO_ddseq_1.30k_model_intermediates
	saving at models__screen__mallet/BIO_ddseq_1.30k__models_400_iter.pkl
19 models in models__screen__mallet/model_intermediates/BIO_ddseq_1.35k_model_intermediates
	saving at models__screen__mallet/BIO_ddseq_1.35k__models_400_iter.pkl
19 models in models__s

In [22]:
from pycisTopic.lda_models import evaluate_models
import matplotlib.pyplot as plt

In [23]:
# Locate the cisTopic object pickles named "*k__cto.pkl" in the
# "cistopic_objects" folder of every directory ending in "k", sorted by path.
cto_paths = glob.glob("*k/cistopic_objects/*k__cto.pkl")
cto_paths.sort()

# Sample ID (file name up to the first "__") -> path of the cisTopic object.
cto_singlets_path_dict = {
    path.rsplit("/", 1)[-1].partition("__")[0]: path for path in cto_paths
}
cto_singlets_path_dict

{'BIO_ddseq_1.10k': '10k/cistopic_objects/BIO_ddseq_1.10k__cto.pkl',
 'BIO_ddseq_2.10k': '10k/cistopic_objects/BIO_ddseq_2.10k__cto.pkl',
 'BIO_ddseq_3.10k': '10k/cistopic_objects/BIO_ddseq_3.10k__cto.pkl',
 'BIO_ddseq_4.10k': '10k/cistopic_objects/BIO_ddseq_4.10k__cto.pkl',
 'BRO_mtscatac_1.10k': '10k/cistopic_objects/BRO_mtscatac_1.10k__cto.pkl',
 'BRO_mtscatac_2.10k': '10k/cistopic_objects/BRO_mtscatac_2.10k__cto.pkl',
 'CNA_10xmultiome_1.10k': '10k/cistopic_objects/CNA_10xmultiome_1.10k__cto.pkl',
 'CNA_10xmultiome_2.10k': '10k/cistopic_objects/CNA_10xmultiome_2.10k__cto.pkl',
 'CNA_10xv11_1.10k': '10k/cistopic_objects/CNA_10xv11_1.10k__cto.pkl',
 'CNA_10xv11_2.10k': '10k/cistopic_objects/CNA_10xv11_2.10k__cto.pkl',
 'CNA_10xv11_3.10k': '10k/cistopic_objects/CNA_10xv11_3.10k__cto.pkl',
 'CNA_10xv11_4.10k': '10k/cistopic_objects/CNA_10xv11_4.10k__cto.pkl',
 'CNA_10xv11_5.10k': '10k/cistopic_objects/CNA_10xv11_5.10k__cto.pkl',
 'CNA_10xv2_1.10k': '10k/cistopic_objects/CNA_10xv2_1.10k

In [24]:
# Rebinds n_iter as an int (it was set to the string "400" earlier in the
# notebook); both render identically in the f-string file names that use it.
n_iter = 400

In [25]:
# Sample ID -> merged models pickle found under f_mod_dir. The ID is the
# part of the file name before "__models_<n_iter>_iter.pkl".
merged_model_paths = sorted(glob.glob(f"{f_mod_dir}/*{n_iter}_iter.pkl"))
models_path_dict = {
    p.split("/")[-1].split(f"__models_{n_iter}_iter.pkl")[0]: p
    for p in merged_model_paths
}
models_path_dict

{'BIO_ddseq_1.10k': 'models__screen__mallet/BIO_ddseq_1.10k__models_400_iter.pkl',
 'BIO_ddseq_1.15k': 'models__screen__mallet/BIO_ddseq_1.15k__models_400_iter.pkl',
 'BIO_ddseq_1.20k': 'models__screen__mallet/BIO_ddseq_1.20k__models_400_iter.pkl',
 'BIO_ddseq_1.25k': 'models__screen__mallet/BIO_ddseq_1.25k__models_400_iter.pkl',
 'BIO_ddseq_1.30k': 'models__screen__mallet/BIO_ddseq_1.30k__models_400_iter.pkl',
 'BIO_ddseq_1.35k': 'models__screen__mallet/BIO_ddseq_1.35k__models_400_iter.pkl',
 'BIO_ddseq_1.5k': 'models__screen__mallet/BIO_ddseq_1.5k__models_400_iter.pkl',
 'BIO_ddseq_2.10k': 'models__screen__mallet/BIO_ddseq_2.10k__models_400_iter.pkl',
 'BIO_ddseq_2.15k': 'models__screen__mallet/BIO_ddseq_2.15k__models_400_iter.pkl',
 'BIO_ddseq_2.20k': 'models__screen__mallet/BIO_ddseq_2.20k__models_400_iter.pkl',
 'BIO_ddseq_2.25k': 'models__screen__mallet/BIO_ddseq_2.25k__models_400_iter.pkl',
 'BIO_ddseq_2.30k': 'models__screen__mallet/BIO_ddseq_2.30k__models_400_iter.pkl',
 'BIO_

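To choose the number of topics for each sample, we look for a model that minimizes the Arun (2010) and Cao & Juan (2009) metrics while maximizing the Mimno (2011) metric and the log-likelihood.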

In [31]:
# Number of topics selected for each sample, keyed by sample ID (for example
# "BIO_ddseq_1.10k"). The values are hard-coded; presumably they were picked
# from the model-evaluation metrics -- TODO confirm. The entries from
# "CNA_10xv2_2.20k" onwards are listed after the sorted part of the dict.
optimal_topics_dict = {
    "BIO_ddseq_1.10k": 11,
    "BIO_ddseq_1.15k": 16,
    "BIO_ddseq_1.20k": 18,
    "BIO_ddseq_1.25k": 18,
    "BIO_ddseq_1.30k": 16,
    "BIO_ddseq_1.35k": 17,
    "BIO_ddseq_1.5k": 17,
    "BIO_ddseq_2.10k": 18,
    "BIO_ddseq_2.15k": 18,
    "BIO_ddseq_2.20k": 17,
    "BIO_ddseq_2.25k": 17,
    "BIO_ddseq_2.30k": 17,
    "BIO_ddseq_2.35k": 18,
    "BIO_ddseq_2.5k": 14,
    "BIO_ddseq_3.10k": 11,
    "BIO_ddseq_3.15k": 13,
    "BIO_ddseq_3.20k": 8,
    "BIO_ddseq_3.25k": 10,
    "BIO_ddseq_3.30k": 14,
    "BIO_ddseq_3.35k": 10,
    "BIO_ddseq_3.5k": 17,
    "BIO_ddseq_4.10k": 9,
    "BIO_ddseq_4.15k": 15,
    "BIO_ddseq_4.20k": 12,
    "BIO_ddseq_4.25k": 17,
    "BIO_ddseq_4.30k": 12,
    "BIO_ddseq_4.35k": 11,
    "BIO_ddseq_4.5k": 14,
    "BRO_mtscatac_1.10k": 11,
    "BRO_mtscatac_1.15k": 12,
    "BRO_mtscatac_1.20k": 10,
    "BRO_mtscatac_1.25k": 11,
    "BRO_mtscatac_1.30k": 9,
    "BRO_mtscatac_1.35k": 11,
    "BRO_mtscatac_1.5k": 12,
    "BRO_mtscatac_2.10k": 12,
    "BRO_mtscatac_2.15k": 10,
    "BRO_mtscatac_2.20k": 13,
    "BRO_mtscatac_2.25k": 11,
    "BRO_mtscatac_2.30k": 12,
    "BRO_mtscatac_2.35k": 12,
    "BRO_mtscatac_2.5k": 10,
    "CNA_10xmultiome_1.10k": 9,
    "CNA_10xmultiome_1.15k": 12,
    "CNA_10xmultiome_1.20k": 10,
    "CNA_10xmultiome_1.25k": 13,
    "CNA_10xmultiome_1.30k": 9,
    "CNA_10xmultiome_1.35k": 11,
    "CNA_10xmultiome_1.5k": 12,
    "CNA_10xmultiome_2.10k": 15,
    "CNA_10xmultiome_2.15k": 15,
    "CNA_10xmultiome_2.20k": 14,
    "CNA_10xmultiome_2.25k": 12,
    "CNA_10xmultiome_2.30k": 14,
    "CNA_10xmultiome_2.35k": 13,
    "CNA_10xmultiome_2.5k": 15,
    "CNA_10xv11_1.10k": 10,
    "CNA_10xv11_1.15k": 10,
    "CNA_10xv11_1.20k": 10,
    "CNA_10xv11_1.25k": 10,
    "CNA_10xv11_1.30k": 13,
    "CNA_10xv11_1.35k": 12,
    "CNA_10xv11_1.5k": 8,
    "CNA_10xv11_2.10k": 17,
    "CNA_10xv11_2.15k": 13,
    "CNA_10xv11_2.20k": 11,
    "CNA_10xv11_2.25k": 11,
    "CNA_10xv11_2.30k": 15,
    "CNA_10xv11_2.35k": 17,
    "CNA_10xv11_2.5k": 12,
    "CNA_10xv11_3.10k": 11,
    "CNA_10xv11_3.15k": 15,
    "CNA_10xv11_3.20k": 13,
    "CNA_10xv11_3.25k": 14,
    "CNA_10xv11_3.30k": 11,
    "CNA_10xv11_3.35k": 13,
    "CNA_10xv11_3.5k": 12,
    "CNA_10xv11_4.10k": 12,
    "CNA_10xv11_4.15k": 11,
    "CNA_10xv11_4.20k": 7,
    "CNA_10xv11_4.25k": 16,
    "CNA_10xv11_4.30k": 16,
    "CNA_10xv11_4.35k": 12,
    "CNA_10xv11_4.5k": 7,
    "CNA_10xv11_5.10k": 12,
    "CNA_10xv11_5.15k": 11,
    "CNA_10xv11_5.20k": 8,
    "CNA_10xv11_5.25k": 9,
    "CNA_10xv11_5.30k": 11,
    "CNA_10xv11_5.35k": 12,
    "CNA_10xv11_5.5k": 11,
    "CNA_10xv2_1.10k": 11,
    "CNA_10xv2_1.15k": 9,
    "CNA_10xv2_1.20k": 12,
    "CNA_10xv2_1.25k": 11,
    "CNA_10xv2_1.30k": 12,
    "CNA_10xv2_1.35k": 15,
    "CNA_10xv2_1.5k": 14,
    "CNA_10xv2_2.10k": 11,
    "CNA_10xv2_2.15k": 8,
    "CNA_10xv2_2.5k": 12,
    "CNA_hydrop_1.10k": 20,
    "CNA_hydrop_1.15k": 9,
    "CNA_hydrop_1.20k": 10,
    "CNA_hydrop_1.25k": 15,
    "CNA_hydrop_1.30k": 18,
    "CNA_hydrop_1.35k": 7,
    "CNA_hydrop_1.5k": 16,
    "CNA_hydrop_2.10k": 7,
    "CNA_hydrop_2.15k": 10,
    "CNA_hydrop_2.20k": 10,
    "CNA_hydrop_2.25k": 8,
    "CNA_hydrop_2.30k": 7,
    "CNA_hydrop_2.35k": 6,
    "CNA_hydrop_2.5k": 9,
    "CNA_hydrop_3.10k": 17,
    "CNA_hydrop_3.15k": 13,
    "CNA_hydrop_3.20k": 18,
    "CNA_hydrop_3.25k": 16,
    "CNA_hydrop_3.30k": 14,
    "CNA_hydrop_3.35k": 17,
    "CNA_hydrop_3.5k": 10,
    "CNA_mtscatac_1.10k": 8,
    "CNA_mtscatac_1.15k": 8,
    "CNA_mtscatac_1.20k": 10,
    "CNA_mtscatac_1.25k": 11,
    "CNA_mtscatac_1.30k": 11,
    "CNA_mtscatac_1.35k": 16,
    "CNA_mtscatac_1.5k": 15,
    "CNA_mtscatac_2.10k": 8,
    "CNA_mtscatac_2.15k": 13,
    "CNA_mtscatac_2.20k": 8,
    "CNA_mtscatac_2.25k": 7,
    "CNA_mtscatac_2.30k": 9,
    "CNA_mtscatac_2.35k": 7,
    "CNA_mtscatac_2.5k": 10,
    "EPF_hydrop_1.10k": 13,
    "EPF_hydrop_1.15k": 15,
    "EPF_hydrop_1.20k": 16,
    "EPF_hydrop_1.25k": 14,
    "EPF_hydrop_1.30k": 10,
    "EPF_hydrop_1.35k": 12,
    "EPF_hydrop_1.5k": 9,
    "EPF_hydrop_2.10k": 10,
    "EPF_hydrop_2.15k": 19,
    "EPF_hydrop_2.20k": 10,
    "EPF_hydrop_2.25k": 8,
    "EPF_hydrop_2.30k": 16,
    "EPF_hydrop_2.35k": 14,
    "EPF_hydrop_2.5k": 8,
    "EPF_hydrop_3.10k": 10,
    "EPF_hydrop_3.15k": 9,
    "EPF_hydrop_3.20k": 6,
    "EPF_hydrop_3.25k": 9,
    "EPF_hydrop_3.30k": 11,
    "EPF_hydrop_3.35k": 10,
    "EPF_hydrop_3.5k": 7,
    "EPF_hydrop_4.10k": 12,
    "EPF_hydrop_4.15k": 16,
    "EPF_hydrop_4.20k": 14,
    "EPF_hydrop_4.25k": 12,
    "EPF_hydrop_4.30k": 11,
    "EPF_hydrop_4.35k": 18,
    "EPF_hydrop_4.5k": 10,
    "HAR_ddseq_1.10k": 14,
    "HAR_ddseq_1.15k": 12,
    "HAR_ddseq_1.20k": 12,
    "HAR_ddseq_1.25k": 19,
    "HAR_ddseq_1.30k": 19,
    "HAR_ddseq_1.35k": 14,
    "HAR_ddseq_1.5k": 9,
    "HAR_ddseq_2.10k": 19,
    "HAR_ddseq_2.15k": 19,
    "HAR_ddseq_2.20k": 16,
    "HAR_ddseq_2.25k": 11,
    "HAR_ddseq_2.30k": 16,
    "HAR_ddseq_2.35k": 15,
    "HAR_ddseq_2.5k": 13,
    "MDC_mtscatac_1.10k": 14,
    "MDC_mtscatac_1.15k": 14,
    "MDC_mtscatac_1.20k": 16,
    "MDC_mtscatac_1.5k": 17,
    "MDC_mtscatac_2.10k": 11,
    "MDC_mtscatac_2.15k": 12,
    "MDC_mtscatac_2.20k": 17,
    "MDC_mtscatac_2.25k": 18,
    "MDC_mtscatac_2.30k": 18,
    "MDC_mtscatac_2.35k": 13,
    "MDC_mtscatac_2.5k": 8,
    "OHS_s3atac_1.10k": 17,
    "OHS_s3atac_1.15k": 13,
    "OHS_s3atac_1.20k": 13,
    "OHS_s3atac_1.25k": 12,
    "OHS_s3atac_1.30k": 11,
    "OHS_s3atac_1.35k": 17,
    "OHS_s3atac_1.5k": 9,
    "OHS_s3atac_2.10k": 16,
    "OHS_s3atac_2.15k": 13,
    "OHS_s3atac_2.20k": 19,
    "OHS_s3atac_2.25k": 19,
    "OHS_s3atac_2.30k": 18,
    "OHS_s3atac_2.35k": 16,
    "OHS_s3atac_2.5k": 18,
    "SAN_10xmultiome_1.10k": 17,
    "SAN_10xmultiome_1.15k": 14,
    "SAN_10xmultiome_1.20k": 10,
    "SAN_10xmultiome_1.25k": 9,
    "SAN_10xmultiome_1.30k": 11,
    "SAN_10xmultiome_1.35k": 19,
    "SAN_10xmultiome_1.5k": 12,
    "SAN_10xmultiome_2.10k": 11,
    "SAN_10xmultiome_2.15k": 13,
    "SAN_10xmultiome_2.20k": 11,
    "SAN_10xmultiome_2.25k": 10,
    "SAN_10xmultiome_2.30k": 12,
    "SAN_10xmultiome_2.35k": 12,
    "SAN_10xmultiome_2.5k": 13,
    "STA_10xv11_1.10k": 6,
    "STA_10xv11_1.15k": 11,
    "STA_10xv11_1.20k": 13,
    "STA_10xv11_1.25k": 13,
    "STA_10xv11_1.30k": 8,
    "STA_10xv11_1.35k": 12,
    "STA_10xv11_1.5k": 11,
    "STA_10xv11_2.10k": 17,
    "STA_10xv11_2.15k": 11,
    "STA_10xv11_2.20k": 11,
    "STA_10xv11_2.25k": 13,
    "STA_10xv11_2.30k": 8,
    "STA_10xv11_2.35k": 14,
    "STA_10xv11_2.5k": 7,
    "TXG_10xv11_1.10k": 13,
    "TXG_10xv11_1.5k": 11,
    "TXG_10xv2_1.5k": 15,
    "TXG_10xv2_2.5k": 17,
    "UCS_ddseq_1.10k": 11,
    "UCS_ddseq_1.15k": 15,
    "UCS_ddseq_1.20k": 16,
    "UCS_ddseq_1.25k": 12,
    "UCS_ddseq_1.30k": 18,
    "UCS_ddseq_1.35k": 13,
    "UCS_ddseq_1.5k": 15,
    "UCS_ddseq_2.10k": 15,
    "UCS_ddseq_2.15k": 18,
    "UCS_ddseq_2.20k": 13,
    "UCS_ddseq_2.25k": 15,
    "UCS_ddseq_2.30k": 16,
    "UCS_ddseq_2.35k": 17,
    "UCS_ddseq_2.5k": 15,
    "VIB_10xmultiome_1.10k": 10,
    "VIB_10xmultiome_1.15k": 9,
    "VIB_10xmultiome_1.20k": 10,
    "VIB_10xmultiome_1.25k": 9,
    "VIB_10xmultiome_1.30k": 13,
    "VIB_10xmultiome_1.35k": 10,
    "VIB_10xmultiome_1.5k": 9,
    "VIB_10xmultiome_2.10k": 8,
    "VIB_10xmultiome_2.15k": 14,
    "VIB_10xmultiome_2.20k": 10,
    "VIB_10xmultiome_2.25k": 15,
    "VIB_10xmultiome_2.30k": 13,
    "VIB_10xmultiome_2.35k": 17,
    "VIB_10xmultiome_2.5k": 11,
    "VIB_10xv1_1.10k": 11,
    "VIB_10xv1_1.15k": 10,
    "VIB_10xv1_1.20k": 14,
    "VIB_10xv1_1.25k": 11,
    "VIB_10xv1_1.30k": 9,
    "VIB_10xv1_1.35k": 13,
    "VIB_10xv1_1.5k": 8,
    "VIB_10xv1_2.10k": 12,
    "VIB_10xv1_2.15k": 14,
    "VIB_10xv1_2.20k": 16,
    "VIB_10xv1_2.25k": 15,
    "VIB_10xv1_2.30k": 18,
    "VIB_10xv1_2.5k": 15,
    "VIB_10xv2_1.10k": 11,
    "VIB_10xv2_1.15k": 10,
    "VIB_10xv2_1.20k": 10,
    "VIB_10xv2_1.25k": 8,
    "VIB_10xv2_1.30k": 10,
    "VIB_10xv2_1.35k": 16,
    "VIB_10xv2_1.5k": 10,
    "VIB_10xv2_2.10k": 12,
    "VIB_10xv2_2.15k": 13,
    "VIB_10xv2_2.20k": 13,
    "VIB_10xv2_2.25k": 17,
    "VIB_10xv2_2.30k": 9,
    "VIB_10xv2_2.35k": 17,
    "VIB_10xv2_2.5k": 13,
    "VIB_hydrop_1.10k": 10,
    "VIB_hydrop_1.15k": 14,
    "VIB_hydrop_1.20k": 10,
    "VIB_hydrop_1.25k": 10,
    "VIB_hydrop_1.30k": 17,
    "VIB_hydrop_1.35k": 17,
    "VIB_hydrop_1.5k": 19,
    "VIB_hydrop_2.10k": 11,
    "VIB_hydrop_2.15k": 15,
    "VIB_hydrop_2.20k": 10,
    "VIB_hydrop_2.25k": 10,
    "VIB_hydrop_2.30k": 11,
    "VIB_hydrop_2.35k": 10,
    "VIB_hydrop_2.5k": 10,
    "CNA_10xv2_2.20k": 10,
    "CNA_10xv2_2.25k": 16,
    "CNA_10xv2_2.30k": 15,
    "CNA_10xv2_2.35k": 9,
    "MDC_mtscatac_1.25k": 12,
    "MDC_mtscatac_1.30k": 12,
    "MDC_mtscatac_1.35k": 12,
    "TXG_10xv11_1.15k": 18,
    "TXG_10xv11_1.20k": 20,
    "TXG_10xv11_1.25k": 18,
    "TXG_10xv11_1.30k": 15,
    "TXG_10xv11_1.35k": 11,
    "TXG_10xv2_1.10k": 18,
    "TXG_10xv2_1.15k": 20,
    "TXG_10xv2_1.20k": 19,
    "TXG_10xv2_1.25k": 19,
    "TXG_10xv2_1.30k": 14,
    "TXG_10xv2_1.35k": 17,
    "TXG_10xv2_2.10k": 16,
    "TXG_10xv2_2.15k": 19,
    "TXG_10xv2_2.20k": 13,
    "TXG_10xv2_2.25k": 13,
    "TXG_10xv2_2.30k": 14,
    "TXG_10xv2_2.35k": 20,
    "VIB_10xv1_2.35k": 20,
}

In [32]:
# Every sample with merged models needs a selected topic count; samples that
# are missing from optimal_topics_dict are given 10 topics.
for sample in models_path_dict:
    if sample in optimal_topics_dict:
        print(f"{sample} in optimal_topics_dict")
        continue
    print(f"adding {sample}")
    optimal_topics_dict[sample] = 10

optimal_topics_dict

BIO_ddseq_1.10k in optimal_topics_dict
BIO_ddseq_1.15k in optimal_topics_dict
BIO_ddseq_1.20k in optimal_topics_dict
BIO_ddseq_1.25k in optimal_topics_dict
BIO_ddseq_1.30k in optimal_topics_dict
BIO_ddseq_1.35k in optimal_topics_dict
BIO_ddseq_1.5k in optimal_topics_dict
BIO_ddseq_2.10k in optimal_topics_dict
BIO_ddseq_2.15k in optimal_topics_dict
BIO_ddseq_2.20k in optimal_topics_dict
BIO_ddseq_2.25k in optimal_topics_dict
BIO_ddseq_2.30k in optimal_topics_dict
BIO_ddseq_2.35k in optimal_topics_dict
BIO_ddseq_2.5k in optimal_topics_dict
BIO_ddseq_3.10k in optimal_topics_dict
BIO_ddseq_3.15k in optimal_topics_dict
BIO_ddseq_3.20k in optimal_topics_dict
BIO_ddseq_3.25k in optimal_topics_dict
BIO_ddseq_3.30k in optimal_topics_dict
BIO_ddseq_3.35k in optimal_topics_dict
BIO_ddseq_3.5k in optimal_topics_dict
BIO_ddseq_4.10k in optimal_topics_dict
BIO_ddseq_4.15k in optimal_topics_dict
BIO_ddseq_4.20k in optimal_topics_dict
BIO_ddseq_4.25k in optimal_topics_dict
BIO_ddseq_4.30k in optimal_t

{'BIO_ddseq_1.10k': 11,
 'BIO_ddseq_1.15k': 16,
 'BIO_ddseq_1.20k': 18,
 'BIO_ddseq_1.25k': 18,
 'BIO_ddseq_1.30k': 16,
 'BIO_ddseq_1.35k': 17,
 'BIO_ddseq_1.5k': 17,
 'BIO_ddseq_2.10k': 18,
 'BIO_ddseq_2.15k': 18,
 'BIO_ddseq_2.20k': 17,
 'BIO_ddseq_2.25k': 17,
 'BIO_ddseq_2.30k': 17,
 'BIO_ddseq_2.35k': 18,
 'BIO_ddseq_2.5k': 14,
 'BIO_ddseq_3.10k': 11,
 'BIO_ddseq_3.15k': 13,
 'BIO_ddseq_3.20k': 8,
 'BIO_ddseq_3.25k': 10,
 'BIO_ddseq_3.30k': 14,
 'BIO_ddseq_3.35k': 10,
 'BIO_ddseq_3.5k': 17,
 'BIO_ddseq_4.10k': 9,
 'BIO_ddseq_4.15k': 15,
 'BIO_ddseq_4.20k': 12,
 'BIO_ddseq_4.25k': 17,
 'BIO_ddseq_4.30k': 12,
 'BIO_ddseq_4.35k': 11,
 'BIO_ddseq_4.5k': 14,
 'BRO_mtscatac_1.10k': 11,
 'BRO_mtscatac_1.15k': 12,
 'BRO_mtscatac_1.20k': 10,
 'BRO_mtscatac_1.25k': 11,
 'BRO_mtscatac_1.30k': 9,
 'BRO_mtscatac_1.35k': 11,
 'BRO_mtscatac_1.5k': 12,
 'BRO_mtscatac_2.10k': 12,
 'BRO_mtscatac_2.15k': 10,
 'BRO_mtscatac_2.20k': 13,
 'BRO_mtscatac_2.25k': 11,
 'BRO_mtscatac_2.30k': 12,
 'BRO_mtscat

In [33]:
from pycisTopic.lda_models import evaluate_models
import matplotlib.pyplot as plt

In [35]:
# For every sample: select the model with the chosen number of topics, save
# the evaluation figure and, when `write` is set, store a copy of the
# cisTopic object with that model attached.
write = True

# Create the figure directory up front so saving the evaluation figure
# cannot fail on a missing folder.
os.makedirs("plots_qc", exist_ok=True)

for sample, ntopics in optimal_topics_dict.items():
    print(sample)

    # optimal_topics_dict is a hard-coded list and may name samples that
    # have no cisTopic object; report those instead of raising KeyError.
    cto_path = cto_singlets_path_dict.get(sample)
    if cto_path is None:
        print(f"\t{sample} cisTopic object does not exist!")
        continue

    # Output file: the input path with the selected topic count inserted.
    cto_path_new = cto_path.replace(".pkl", f".model_{ntopics}topics.pkl")
    print(cto_path_new)
    if os.path.isfile(cto_path_new):
        print(f"\t{cto_path_new} already exists! Skipping...")
        continue

    if sample not in models_path_dict:
        print(f"\t{sample} models does not exist!")
        continue

    models_path = models_path_dict[sample]
    # pickle executes code on load: only open files written by this pipeline.
    with open(models_path, "rb") as f:
        models = pickle.load(f)

    print(f"\tLoaded {models_path}, evaluating...")
    model = evaluate_models(
        models,
        select_model=ntopics,
        return_model=True,
        # "Minmo_2011" is kept exactly as given; presumably it is the
        # identifier pycisTopic expects -- verify before changing the spelling.
        metrics=["Arun_2010", "Cao_Juan_2009", "Minmo_2011", "loglikelihood"],
        plot=True,
        # presumably turns off the separate per-metric plots -- TODO confirm
        plot_metrics=False,
        save=f"plots_qc/{sample}__model_evaluation.png",
    )

    if write:
        with open(cto_path, "rb") as f:
            cto = pickle.load(f)

        cto.add_LDA_model(model)

        with open(cto_path_new, "wb") as f:
            pickle.dump(cto, f, protocol=4)

BIO_ddseq_1.10k
10k/cistopic_objects/BIO_ddseq_1.10k__cto.model_11topics.pkl
	10k/cistopic_objects/BIO_ddseq_1.10k__cto.model_11topics.pkl already exists! Skipping...
BIO_ddseq_1.15k
15k/cistopic_objects/BIO_ddseq_1.15k__cto.model_16topics.pkl
	15k/cistopic_objects/BIO_ddseq_1.15k__cto.model_16topics.pkl already exists! Skipping...
BIO_ddseq_1.20k
20k/cistopic_objects/BIO_ddseq_1.20k__cto.model_18topics.pkl
	20k/cistopic_objects/BIO_ddseq_1.20k__cto.model_18topics.pkl already exists! Skipping...
BIO_ddseq_1.25k
25k/cistopic_objects/BIO_ddseq_1.25k__cto.model_18topics.pkl
	25k/cistopic_objects/BIO_ddseq_1.25k__cto.model_18topics.pkl already exists! Skipping...
BIO_ddseq_1.30k
30k/cistopic_objects/BIO_ddseq_1.30k__cto.model_16topics.pkl
	30k/cistopic_objects/BIO_ddseq_1.30k__cto.model_16topics.pkl already exists! Skipping...
BIO_ddseq_1.35k
35k/cistopic_objects/BIO_ddseq_1.35k__cto.model_17topics.pkl
	35k/cistopic_objects/BIO_ddseq_1.35k__cto.model_17topics.pkl already exists! Skipping..