# pycisTopic analysis

Full dataset, using SCREEN regions.

In [1]:
import pycisTopic
pycisTopic.__version__
from IPython.display import Image, display

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [3]:
import pickle
import pandas as pd

In [4]:
import glob
from collections import OrderedDict
import numpy as np

In [5]:
!pwd

/lustre1/project/stg_00090/scatac_benchmark/fixedcells_2_cistopic


In [6]:
import os
wdir = '/lustre1/project/stg_00090/scatac_benchmark/fixedcells_2_cistopic/'
os.chdir( wdir )

In [7]:
scrub_name_suffix = "0-4"
cto_paths = sorted(glob.glob('cistopic_objects/*.singlets.pkl'))
cistopic_obj_path_dict = {x.split('/')[-1].split(f'__cto.scrublet{scrub_name_suffix}.fmx.singlets.pkl')[0]:x for x in cto_paths}
cistopic_obj_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'CNA_10xmultiome_2.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'CNA_10xv11_1.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'CNA_10xv11_2.FIXEDCELLS'

In [8]:
mounts = "/lustre1,/staging,/data,/vsc-hard-mounts,/scratch"
sif = "../0_resources/cistopic_image/20220815_pycistopic.sif"
script = "../0_resources/scripts/runModels_lda_mallet.py"

f_mod_dir = 'models__screen__mallet'
if not os.path.exists(os.path.join(wdir, f_mod_dir)):
    os.makedirs(os.path.join(wdir, f_mod_dir))

n_topics_str = "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40"
n_topics = 40

ver=n_topics_str[-2] + n_topics_str[-1]

n_cores = 36
n_iter = "400"
for sample in cistopic_obj_path_dict.keys():
    infile = cistopic_obj_path_dict[sample]
    outfile = os.path.join(f_mod_dir, sample + '__models_' + str(n_iter) + '_iter.pkl')
    
    intermediate_dir = f"{f_mod_dir}/model_intermediates/{sample}_model_intermediates/"
        
    if not os.path.exists(outfile):
        if not os.path.exists(os.path.join(wdir, intermediate_dir)):
            os.makedirs(os.path.join(wdir, intermediate_dir))
        else:
            print(f'{intermediate_dir} already exists!')
            
        n_topics_detected = len(os.listdir(os.path.join(wdir, intermediate_dir)))
        if not n_topics_detected == n_topics:
            model_tmp_dir = f"{f_mod_dir}/model_tmp/{sample}_model_tmp/"
            if not os.path.exists(os.path.join(wdir, model_tmp_dir)):
                os.makedirs(os.path.join(wdir, model_tmp_dir))

            n_topics_todo = str(list(range(n_topics_detected,n_topics +1))).replace(' ','').replace('[','').replace(']','')

            out_log = f"{f_mod_dir}/{sample}.models_out_log{ver}.txt"
            cmd = f"echo {sample} && cd {wdir} && singularity exec -B {mounts} {sif} python {script} -i {infile} -o {outfile} -nt {n_topics_todo} -c {n_cores} -it {n_iter} -a 50 -abt True -e 0.1 -ebt False -sp {intermediate_dir} -s 555 -td {model_tmp_dir} > {out_log}"

            print(cmd)
            sh_path = f"{f_mod_dir}/{sample}.runmodels{ver}.sh"
            with open(sh_path, 'w') as file:
                file.write(cmd)

    else:
        print(f'{outfile} already exists!')
    
    print('\n')

models__screen__mallet/BIO_ddseq_1.FIXEDCELLS__models_400_iter.pkl already exists!


models__screen__mallet/BIO_ddseq_2.FIXEDCELLS__models_400_iter.pkl already exists!


models__screen__mallet/BIO_ddseq_3.FIXEDCELLS__models_400_iter.pkl already exists!


models__screen__mallet/BIO_ddseq_4.FIXEDCELLS__models_400_iter.pkl already exists!


models__screen__mallet/BRO_mtscatac_1.FIXEDCELLS__models_400_iter.pkl already exists!


models__screen__mallet/BRO_mtscatac_2.FIXEDCELLS__models_400_iter.pkl already exists!


models__screen__mallet/CNA_10xmultiome_1.FIXEDCELLS__models_400_iter.pkl already exists!


models__screen__mallet/CNA_10xmultiome_2.FIXEDCELLS__models_400_iter.pkl already exists!


models__screen__mallet/CNA_10xv11_1.FIXEDCELLS__models_400_iter.pkl already exists!


models__screen__mallet/CNA_10xv11_2.FIXEDCELLS__models_400_iter.pkl already exists!


models__screen__mallet/CNA_10xv11_3.FIXEDCELLS__models_400_iter.pkl already exists!


models__screen__mallet/CNA_10xv11_4.FIXEDCEL

In [9]:
!cat ../0_resources/scripts/runModels_lda_mallet.py

import pickle
import sys
import argparse
import os
from pycisTopic.cistopic_class import *
from pycisTopic.lda_models import *

def make_argument_parser():
    """
    Creates an ArgumentParser to read the options for this script from
    sys.argv
    """
    parser = argparse.ArgumentParser(
        description="Run topic models.",)
    parser.add_argument('--inputcisTopic_obj', '-i', type=str, required=True,
                        help='Path to cisTopic object pickle file.')
    parser.add_argument('--output', '-o', type=str, required=True,
                        help='Path to save final model list.')
    parser.add_argument('--n_topics', '-nt', type=str, required=True, nargs='+',
                        help='Txt file containing selected topic id.')
    parser.add_argument('--n_cpu', '-c', type=int, required=True,
                        help = 'Number of cores')
    parser.add_argument('--n_iter', '-it', type=int, required=False, default=150,
                        help = 'Numbe

In shell:

In [10]:
for script in models__screen__mallet/*40.sh
do
    echo $script
    qsub $script -A lp_symbiosys -l nodes=1:ppn=36 -l walltime=12:00:00 -l pmem=2gb
done

SyntaxError: invalid syntax (3556938612.py, line 1)

Then, combine the models:

In [11]:
intermediate_dict = {x.split('/')[-1].split('_model_intermediates')[0]: x for x in sorted(glob.glob(f'{f_mod_dir}/model_intermediates/*'))}
intermediate_dict

{'BIO_ddseq_1.FIXEDCELLS': 'models__screen__mallet/model_intermediates/BIO_ddseq_1.FIXEDCELLS_model_intermediates',
 'BIO_ddseq_2.FIXEDCELLS': 'models__screen__mallet/model_intermediates/BIO_ddseq_2.FIXEDCELLS_model_intermediates',
 'BIO_ddseq_3.FIXEDCELLS': 'models__screen__mallet/model_intermediates/BIO_ddseq_3.FIXEDCELLS_model_intermediates',
 'BIO_ddseq_4.FIXEDCELLS': 'models__screen__mallet/model_intermediates/BIO_ddseq_4.FIXEDCELLS_model_intermediates',
 'BRO_mtscatac_1.FIXEDCELLS': 'models__screen__mallet/model_intermediates/BRO_mtscatac_1.FIXEDCELLS_model_intermediates',
 'BRO_mtscatac_2.FIXEDCELLS': 'models__screen__mallet/model_intermediates/BRO_mtscatac_2.FIXEDCELLS_model_intermediates',
 'CNA_10xmultiome_1.FIXEDCELLS': 'models__screen__mallet/model_intermediates/CNA_10xmultiome_1.FIXEDCELLS_model_intermediates',
 'CNA_10xmultiome_2.FIXEDCELLS': 'models__screen__mallet/model_intermediates/CNA_10xmultiome_2.FIXEDCELLS_model_intermediates',
 'CNA_10xv11_1.FIXEDCELLS': 'models_

In [12]:
n_models = 40 
for sample, directory in intermediate_dict.items():
    nmodels = len(os.listdir(directory))
    print(f"{nmodels} models in {directory}")
    save_path = f"{f_mod_dir}/{sample}__models_{n_iter}_iter.pkl"
    if not os.path.exists(save_path):
        file_list = sorted(os.listdir(directory))
        n_topics = [int(x.split('Topic')[-1].split('.pkl')[0]) for x in file_list]
        file_list_sorted = [file_list[x] for x in np.argsort(n_topics)]
        
        if len(os.listdir(directory)) == n_models:
            print(f"\tsaving at {save_path}")

            models_merged = [pickle.load(open(f"{directory}/{file}", 'rb')) for file in file_list_sorted]

            with open(save_path, "wb") as f:
                    pickle.dump(models_merged, f, protocol=4)
                    
        else:
            print(f'\tn_models < {n_models}, {len(sorted(os.listdir(directory)))}')
    else:
        print(f"\t{save_path} exists, skipping")

0 models in models__screen__mallet/model_intermediates/BIO_ddseq_1.FIXEDCELLS_model_intermediates
	models__screen__mallet/BIO_ddseq_1.FIXEDCELLS__models_400_iter.pkl exists, skipping
0 models in models__screen__mallet/model_intermediates/BIO_ddseq_2.FIXEDCELLS_model_intermediates
	models__screen__mallet/BIO_ddseq_2.FIXEDCELLS__models_400_iter.pkl exists, skipping
0 models in models__screen__mallet/model_intermediates/BIO_ddseq_3.FIXEDCELLS_model_intermediates
	models__screen__mallet/BIO_ddseq_3.FIXEDCELLS__models_400_iter.pkl exists, skipping
0 models in models__screen__mallet/model_intermediates/BIO_ddseq_4.FIXEDCELLS_model_intermediates
	models__screen__mallet/BIO_ddseq_4.FIXEDCELLS__models_400_iter.pkl exists, skipping
0 models in models__screen__mallet/model_intermediates/BRO_mtscatac_1.FIXEDCELLS_model_intermediates
	models__screen__mallet/BRO_mtscatac_1.FIXEDCELLS__models_400_iter.pkl exists, skipping
0 models in models__screen__mallet/model_intermediates/BRO_mtscatac_2.FIXEDCELL

In [13]:
from pycisTopic.lda_models import evaluate_models
import matplotlib.pyplot as plt

In [14]:
cto_singlets_path_dict = {x.split('/')[-1].split(f'__cto.scrublet{scrub_name_suffix}.fmx.singlets.pkl')[0]: x for x in sorted(glob.glob("cistopic_objects/*singlets.pkl"))}
cto_singlets_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'cistopic_objects/BRO_mtscatac_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'CNA_10xmultiome_2.FIXEDCELLS': 'cistopic_objects/CNA_10xmultiome_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'CNA_10xv11_1.FIXEDCELLS': 'cistopic_objects/CNA_10xv11_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.pkl',
 'CNA_10xv11_2.FIXEDCELLS'

In [15]:
n_iter = 400

In [16]:
models_path_dict = {x.split('/')[-1].split(f'__models_{n_iter}_iter.pkl')[0]: x for x in sorted(glob.glob(f'{f_mod_dir}/*{n_iter}_iter.pkl'))}
models_path_dict

{'BIO_ddseq_1.FIXEDCELLS': 'models__screen__mallet/BIO_ddseq_1.FIXEDCELLS__models_400_iter.pkl',
 'BIO_ddseq_2.FIXEDCELLS': 'models__screen__mallet/BIO_ddseq_2.FIXEDCELLS__models_400_iter.pkl',
 'BIO_ddseq_3.FIXEDCELLS': 'models__screen__mallet/BIO_ddseq_3.FIXEDCELLS__models_400_iter.pkl',
 'BIO_ddseq_4.FIXEDCELLS': 'models__screen__mallet/BIO_ddseq_4.FIXEDCELLS__models_400_iter.pkl',
 'BRO_mtscatac_1.FIXEDCELLS': 'models__screen__mallet/BRO_mtscatac_1.FIXEDCELLS__models_400_iter.pkl',
 'BRO_mtscatac_2.FIXEDCELLS': 'models__screen__mallet/BRO_mtscatac_2.FIXEDCELLS__models_400_iter.pkl',
 'CNA_10xmultiome_1.FIXEDCELLS': 'models__screen__mallet/CNA_10xmultiome_1.FIXEDCELLS__models_400_iter.pkl',
 'CNA_10xmultiome_2.FIXEDCELLS': 'models__screen__mallet/CNA_10xmultiome_2.FIXEDCELLS__models_400_iter.pkl',
 'CNA_10xv11_1.FIXEDCELLS': 'models__screen__mallet/CNA_10xv11_1.FIXEDCELLS__models_400_iter.pkl',
 'CNA_10xv11_2.FIXEDCELLS': 'models__screen__mallet/CNA_10xv11_2.FIXEDCELLS__models_400_i

In [17]:
optimal_topics_dict = {x: 5 for x in models_path_dict.keys()}

In [22]:
optimal_topics_dict = {'BIO_ddseq_1.FIXEDCELLS': 9,
 'BIO_ddseq_2.FIXEDCELLS': 14,
 'BIO_ddseq_3.FIXEDCELLS': 11,
 'BIO_ddseq_4.FIXEDCELLS': 7,
 'BRO_mtscatac_1.FIXEDCELLS': 12,
 'BRO_mtscatac_2.FIXEDCELLS': 10,
 'CNA_10xmultiome_1.FIXEDCELLS': 12,
 'CNA_10xmultiome_2.FIXEDCELLS': 11,
 'CNA_10xv11_1.FIXEDCELLS': 11,
 'CNA_10xv11_2.FIXEDCELLS': 15,
 'CNA_10xv11_3.FIXEDCELLS': 9,
 'CNA_10xv11_4.FIXEDCELLS': 10,
 'CNA_10xv11_5.FIXEDCELLS': 12,
 'CNA_10xv2_1.FIXEDCELLS': 11,
 'CNA_10xv2_2.FIXEDCELLS': 8,
 'CNA_hydrop_1.FIXEDCELLS': 11,
 'CNA_hydrop_2.FIXEDCELLS': 10,
 'CNA_hydrop_3.FIXEDCELLS': 15,
 'CNA_mtscatac_1.FIXEDCELLS': 11,
 'CNA_mtscatac_2.FIXEDCELLS': 14,
 'EPF_hydrop_1.FIXEDCELLS': 9,
 'EPF_hydrop_2.FIXEDCELLS': 7,
 'EPF_hydrop_3.FIXEDCELLS': 9,
 'EPF_hydrop_4.FIXEDCELLS': 15,
 'HAR_ddseq_1.FIXEDCELLS': 17,
 'HAR_ddseq_2.FIXEDCELLS': 9,
 'MDC_mtscatac_1.FIXEDCELLS': 9,
 'MDC_mtscatac_2.FIXEDCELLS': 10,
 'OHS_s3atac_1.FIXEDCELLS': 8,
 'OHS_s3atac_2.FIXEDCELLS': 12,
 'SAN_10xmultiome_1.FIXEDCELLS': 9,
 'SAN_10xmultiome_2.FIXEDCELLS': 10,
 'STA_10xv11_1.FIXEDCELLS': 8,
 'STA_10xv11_2.FIXEDCELLS': 12,
 'TXG_10xv11_1.FIXEDCELLS': 15,
 'TXG_10xv2_1.FIXEDCELLS': 15,
 'TXG_10xv2_2.FIXEDCELLS': 19,
 'UCS_ddseq_1.FIXEDCELLS': 10,
 'UCS_ddseq_2.FIXEDCELLS': 10,
 'VIB_10xmultiome_1.FIXEDCELLS': 11,
 'VIB_10xmultiome_2.FIXEDCELLS': 12,
 'VIB_10xv1_1.FIXEDCELLS': 7,
 'VIB_10xv1_2.FIXEDCELLS': 9,
 'VIB_10xv2_1.FIXEDCELLS': 10,
 'VIB_10xv2_2.FIXEDCELLS': 11,
 'VIB_hydrop_1.FIXEDCELLS': 11,
 'VIB_hydrop_11.FIXEDCELLS': 10,
 'VIB_hydrop_12.FIXEDCELLS': 10,
 'VIB_hydrop_2.FIXEDCELLS': 8,
 'VIB_hydrop_21.FIXEDCELLS': 12,
 'VIB_hydrop_22.FIXEDCELLS': 9}

In [23]:
import pprint

In [24]:
for sample in models_path_dict.keys():
    if sample not in optimal_topics_dict.keys():
        print(f"adding {sample}")
        optimal_topics_dict[sample] = 10
    else:
        print(f"{sample} in optimal_topics_dict")

pprint.pprint(optimal_topics_dict)

BIO_ddseq_1.FIXEDCELLS in optimal_topics_dict
BIO_ddseq_2.FIXEDCELLS in optimal_topics_dict
BIO_ddseq_3.FIXEDCELLS in optimal_topics_dict
BIO_ddseq_4.FIXEDCELLS in optimal_topics_dict
BRO_mtscatac_1.FIXEDCELLS in optimal_topics_dict
BRO_mtscatac_2.FIXEDCELLS in optimal_topics_dict
CNA_10xmultiome_1.FIXEDCELLS in optimal_topics_dict
CNA_10xmultiome_2.FIXEDCELLS in optimal_topics_dict
CNA_10xv11_1.FIXEDCELLS in optimal_topics_dict
CNA_10xv11_2.FIXEDCELLS in optimal_topics_dict
CNA_10xv11_3.FIXEDCELLS in optimal_topics_dict
CNA_10xv11_4.FIXEDCELLS in optimal_topics_dict
CNA_10xv11_5.FIXEDCELLS in optimal_topics_dict
CNA_10xv2_1.FIXEDCELLS in optimal_topics_dict
CNA_10xv2_2.FIXEDCELLS in optimal_topics_dict
CNA_hydrop_1.FIXEDCELLS in optimal_topics_dict
CNA_hydrop_2.FIXEDCELLS in optimal_topics_dict
CNA_hydrop_3.FIXEDCELLS in optimal_topics_dict
CNA_mtscatac_1.FIXEDCELLS in optimal_topics_dict
CNA_mtscatac_2.FIXEDCELLS in optimal_topics_dict
EPF_hydrop_1.FIXEDCELLS in optimal_topics_dict
E

In [26]:
from IPython.display import IFrame
write = True
for sample in models_path_dict.keys():
    cto_path = cto_singlets_path_dict[sample]
    print(sample)
    ntopics = optimal_topics_dict[sample]
    cto_path_new = cto_path.replace('.pkl', f'.model_{ntopics}topics.pkl')
    print(cto_path_new)
    if not os.path.isfile(cto_path_new):
        if(sample in list(models_path_dict.keys())):
            models_path = models_path_dict[sample]
            with open(models_path, 'rb') as f:
                models = pickle.load(f)

            print(f"\tLoaded {models_path}, evaluating...")
            model = evaluate_models(models,
                         select_model=optimal_topics_dict[sample], 
                         return_model=True, 
                         metrics=['Arun_2010','Cao_Juan_2009', 'Minmo_2011', 'loglikelihood'],
                         plot=True, # disabled since we only test one model here
                         plot_metrics=False,
                         save=f"plots_qc/{sample}__model_evaluation.png"
                         )
                        
            if write == True:
                with open(cto_path, 'rb') as f:
                    cto = pickle.load(f)

                cto.add_LDA_model(model)

                with open(cto_path_new, "wb") as f:
                    pickle.dump(cto, f, protocol=4)

        else:
            print(f"\t{sample} models does not exist!")
    else:
        print(f"\t{cto_path_new} already exists! Skipping...")
        # display(IFrame(f"plots_qc/{sample}__model_evaluation.pdf", width=600, height=300))

BIO_ddseq_1.FIXEDCELLS
cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_9topics.pkl
	cistopic_objects/BIO_ddseq_1.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_9topics.pkl already exists! Skipping...
BIO_ddseq_2.FIXEDCELLS
cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_14topics.pkl
	cistopic_objects/BIO_ddseq_2.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_14topics.pkl already exists! Skipping...
BIO_ddseq_3.FIXEDCELLS
cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_11topics.pkl
	cistopic_objects/BIO_ddseq_3.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_11topics.pkl already exists! Skipping...
BIO_ddseq_4.FIXEDCELLS
cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_7topics.pkl
	cistopic_objects/BIO_ddseq_4.FIXEDCELLS__cto.scrublet0-4.fmx.singlets.model_7topics.pkl already exists! Skipping...
BRO_mtscatac_1.FIXEDCELLS
cistopic_objects/BRO_mtscatac_1.FIXEDCELLS__cto.scrublet0-