In [2]:
import pycisTopic
pycisTopic.__version__

'0.1.dev300+g7494158'

In [3]:
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore') 

In [4]:
import pickle
import pandas as pd

In [5]:
import os

# Project directory; the relative paths used further down are resolved against it.
wdir = '/lustre1/project/stg_00002/lcb/fderop/data/20211013_hca_benchmark_libds/'
os.chdir(wdir)

In [6]:
import glob
from collections import OrderedDict

# Collect the fragment files matching the pattern (relative to the working directory).
filenames = glob.glob('fragments_postbap/*Hydrop*.sinto.mm.fragments.tsv.gz')

# Sample name = file path with the fragments suffix and the directory prefix removed.
samples = [
    path.replace(".sinto.mm.fragments.tsv.gz", "").replace("fragments_postbap/", "")
    for path in filenames
]

# Map sample name -> fragments file, ordered alphabetically by sample name.
fragments_dict = OrderedDict(sorted(dict(zip(samples, filenames)).items()))
fragments_dict.keys()

odict_keys(['VIB_Hydrop_1', 'VIB_Hydrop_2'])

# Read cisTopic objects (CTOs)

In [7]:
f_cto_dir = 'pycistopic_consensus_peaks/cistopic_objs__consensus'

# Unpickle the cisTopic object of every sample, keyed by sample name.
# A sample whose pickle is missing is reported and left out of the dict.
cistopic_obj_dict = {}
for key in fragments_dict:
    f_cto = os.path.join(wdir, f_cto_dir, key + '__cistopic_obj_metadata_annotated.pkl')
    if not os.path.isfile(f_cto):
        print(f"file {f_cto} doesn't exist")
        continue
    with open(f_cto, 'rb') as f:
        cistopic_obj_dict[key] = pickle.load(f)
    print(f"Loaded filtered cistopic object {key}")

Loaded filtered cistopic object VIB_Hydrop_1
Loaded filtered cistopic object VIB_Hydrop_2


### Run models for quick visualization/clustering

In [10]:
from pycisTopic.lda_models import run_cgs_models_mallet

#### MALLET models

In [11]:
# MALLET's default memory limit (1GB) is not enough here, so raise it through
# the MALLET_MEMORY environment variable before any model is run.
os.environ["MALLET_MEMORY"] = "100G"

In [15]:
# Output directory for the fitted models (relative to wdir).
f_mod_dir = 'pycistopic_consensus_peaks/models__consensus__mallet'
# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(os.path.join(wdir, f_mod_dir), exist_ok=True)

# Scratch directory under which the model run places its temporary files.
f_mod_tmpdir = '/scratch/leuven/330/vsc33042/tmp/mallet_cellds'
os.makedirs(f_mod_tmpdir, exist_ok=True)

# Numbers of topics to fit for each sample.
n_topics = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# Topic grid for a 'merged' object. The model-fitting loop looks this name up
# when it meets a key called 'merged', so it must be defined to avoid a NameError.
n_topics_merged = [2, 5, 10, 20, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]
# Number of iterations per model; also part of the cached model file name.
n_iter = 500

In [16]:
# mallet implementation
# For every loaded cisTopic object, either reuse the cached models pickle or fit
# the models and cache them. Results are collected in models_dict by sample name.
models_dict = {}
for key in cistopic_obj_dict.keys():
    f_mod = os.path.join(wdir, f_mod_dir, key + '__models_' + str(n_iter) + '_iter.pkl')
    if os.path.isfile(f_mod):
        # A cache file exists for this sample and iteration count: reuse it.
        print(f"Loading {f_mod}")
        with open(f_mod, 'rb') as f:
            models_dict[key] = pickle.load(f)
    else:
        print(f"Running {key}")
        model = run_cgs_models_mallet(
                            'mallet',
                            cistopic_obj_dict[key],
                            # n_topics_merged is only looked up for a key named
                            # 'merged'; it must be defined in that case.
                            n_topics=n_topics_merged if key=='merged' else n_topics ,
                            n_cpu=32,
                            n_iter=n_iter,
                            random_state=555,
                            alpha=50,
                            alpha_by_topic=True,
                            eta=0.1,
                            eta_by_topic=False,
                            tmp_path=os.path.join(f_mod_tmpdir, key + '_')
                            )
        # Save atomically: write to a temporary file first and only then move it
        # into place. An interrupted dump therefore never leaves a truncated
        # pickle at f_mod that a later run would mistake for a valid cache.
        f_mod_partial = f_mod + '.partial'
        with open(f_mod_partial, 'wb') as f:
            pickle.dump(model, f)
        os.replace(f_mod_partial, f_mod)
        print(f"Finished {key}")
        models_dict[key] = model

Running VIB_Hydrop_1
2021-10-22 11:44:04,753 cisTopic     INFO     Formatting input to corpus
2021-10-22 11:44:04,829 gensim.corpora.dictionary INFO     adding document #0 to Dictionary(0 unique tokens: [])
2021-10-22 11:44:08,205 gensim.corpora.dictionary INFO     built Dictionary(89030 unique tokens: ['0', '1', '2', '3', '4']...) from 2208 documents (total 4200841 corpus positions)
2021-10-22 11:44:08,206 cisTopic     INFO     Running model with 2 topics
2021-10-22 11:44:08,210 LDAMalletWrapper INFO     Serializing temporary corpus to /scratch/leuven/330/vsc33042/tmp/mallet_cellds/VIB_Hydrop_1_corpus.txt
2021-10-22 11:44:17,759 LDAMalletWrapper INFO     Converting temporary corpus to MALLET format with mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /scratch/leuven/330/vsc33042/tmp/mallet_cellds/VIB_Hydrop_1_corpus.txt --output /scratch/leuven/330/vsc33042/tmp/mallet_cellds/VIB_Hydrop_1_corpus.mallet
2021-10-22 11:44:20,030 LDAMalletW