In [1]:
from pyscenic.rnkdb import FeatherRankingDatabase, InvertedRankingDatabase
from pyscenic.genesig import GeneSignature
from typing import Type, Tuple
import os
import numpy as np
import pandas as pd
from feather.api import write_dataframe, FeatherReader
from tqdm import tqdm
from random import shuffle

In [2]:
# Folder holding the ranking databases. It can be overridden through the SCENIC_DB_FOLDER
# environment variable so the notebook runs on other machines; the default is the original path.
DB_FOLDER = os.environ.get("SCENIC_DB_FOLDER", "/Users/bramvandesande/Projects/lcb/databases")
# Passed as the third argument to InvertedRankingDatabase.invert when building the inverted database.
TOP_N = 10000
# Nomenclature label handed to both ranking database constructors.
NOMENCLATURE = "regions"

In [3]:
# Open the full ("extracted") feather rankings database.
extracted_fname = os.path.join(DB_FOLDER, "hg19-regions-220330-9species.extracted.feather")
db = FeatherRankingDatabase(fname=extracted_fname, name="regions", nomenclature=NOMENCLATURE)

In [4]:
# Number of entries exposed by db.genes for the extracted database
# (presumably region identifiers, given NOMENCLATURE == "regions" -- TODO confirm).
len(db.genes)

220329

In [5]:
# Convert the full database into its inverted counterpart, using TOP_N as the cut-off argument.
# The recorded run of this cell took about nine minutes.
inverted_fname = os.path.join(DB_FOLDER, "hg19-regions-220330-9species.inverted.feather")
InvertedRankingDatabase.invert(db, inverted_fname, TOP_N)

9713it [09:02, 17.89it/s]


In [5]:
# List the extracted and inverted feather files to compare their sizes on disk.
!ls -lh {DB_FOLDER}/hg19-regions-220330-9species.*.feather

-rw-r--r--  1 bramvandesande  staff   8.0G Mar 28 15:27 /Users/bramvandesande/Projects/lcb/databases/hg19-regions-220330-9species.extracted.feather
-rw-r--r--  1 bramvandesande  staff   371M Apr  5 19:04 /Users/bramvandesande/Projects/lcb/databases/hg19-regions-220330-9species.inverted.feather


In [6]:
def create_dummy_signature(n, seed=None):
    """
    Build a random test signature holding at most `n` identifiers.

    The identifiers are read, one per line, from the file
    "hg19-regions-220330-9species.inverted.identifiers.txt" located in DB_FOLDER.

    :param n: Number of identifiers to keep after shuffling. When the file holds fewer than
        `n` identifiers, all of them are used.
    :param seed: Optional seed for a private random generator, which makes the selection
        reproducible. When None (the default) the shared `shuffle` function is used,
        exactly as before this parameter existed.
    :return: GeneSignature("test_regions", "regionIDs", <selected identifiers>).
    """
    fname = os.path.join(DB_FOLDER, "hg19-regions-220330-9species.inverted.identifiers.txt")
    with open(fname, 'r') as f:
        # Drop blank lines so that an empty string never ends up as an identifier.
        ids = [stripped for stripped in (line.strip() for line in f) if stripped]

    if seed is None:
        shuffle(ids)
    else:
        # Local import: only `shuffle` is imported at the top of the notebook.
        from random import Random
        Random(seed).shuffle(ids)

    return GeneSignature("test_regions", "regionIDs", ids[:n])

In [7]:
# Five independently shuffled dummy signatures, each requesting 2500 identifiers.
signatures = list(
    create_dummy_signature(2500)
    for _ in range(5)
)

In [8]:
# Sanity check: how many dummy signatures were created.
len(signatures)

5

In [9]:
# Open the inverted database written by InvertedRankingDatabase.invert.
inverted_fname = os.path.join(DB_FOLDER, "hg19-regions-220330-9species.inverted.feather")
inv_db = InvertedRankingDatabase(inverted_fname, "hg19-regions-220330-9species", NOMENCLATURE)

In [10]:
# Number of entries exposed by inv_db.genes for the inverted database.
# NOTE(review): the recorded output here is 220330, while len(db.genes) on the extracted
# database was recorded as 220329 -- confirm which count is correct.
len(inv_db.genes)

220330

In [20]:
%%timeit -r1 -n1
for gs in signatures:
    inv_db.load(gs)

2min 11s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [13]:
# Keep the rankings of the first dummy signature, loaded from the inverted database, for inspection.
first_signature = signatures[0]
rankings = inv_db.load(first_signature)

In [34]:
# Show the first five rows with the columns in sorted order, for comparison with the
# rankings loaded from the full database.
rankings.sort_index(axis="columns").head(5)

Unnamed: 0,chr1-reg100886,chr1-reg100956,chr1-reg101346,chr1-reg101355,chr1-reg10145,chr1-reg10221,chr1-reg102658,chr1-reg103856,chr1-reg104292,chr1-reg104299,...,chrX-reg57104,chrX-reg57589,chrX-reg6193,chrX-reg6216,chrX-reg645,chrX-reg9194,chrX-reg9293,chrY-reg261,chrY-reg367,chrY-reg677
elemento-AAAATGGCG,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4794,4294967295,4294967295,4294967295,...,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295
elemento-AAATCAAT,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,...,4294967295,4294967295,4294967295,9236,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295
elemento-AAATGCAAA,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,3882,4294967295,4294967295,4294967295,...,4294967295,4294967295,4294967295,9634,8562,4294967295,4294967295,4294967295,4294967295,4294967295
elemento-AAATTGCA,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,6033,4294967295,4294967295,...,4294967295,4294967295,4294967295,4467,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295
elemento-AACAGCTG,4294967295,4294967295,4294967295,4294967295,4294967295,6897,4294967295,4294967295,4294967295,4294967295,...,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295


In [36]:
# Reference rankings for the same signature, loaded from the full (extracted) database,
# with sorted columns and cast to uint32.
rankings_ = db.load(signatures[0]).sort_index(axis=1).astype(np.uint32)
# Ranks at or beyond the TOP_N cut-off used for the inversion show up as the uint32 maximum
# (4294967295) in the rankings loaded from the inverted database. Apply the same masking here,
# tied to TOP_N so the comparison stays valid if the cut-off is changed.
rankings_[rankings_ >= TOP_N] = np.iinfo(np.uint32).max

In [37]:
# First five rows of the masked reference rankings, for a visual comparison with the
# rankings from the inverted database.
rankings_.head(5)

Unnamed: 0_level_0,chr1-reg100886,chr1-reg100956,chr1-reg101346,chr1-reg101355,chr1-reg10145,chr1-reg10221,chr1-reg102658,chr1-reg103856,chr1-reg104292,chr1-reg104299,...,chrX-reg57104,chrX-reg57589,chrX-reg6193,chrX-reg6216,chrX-reg645,chrX-reg9194,chrX-reg9293,chrY-reg261,chrY-reg367,chrY-reg677
features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
elemento-AAAATGGCG,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4794,4294967295,4294967295,4294967295,...,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295
elemento-AAATCAAT,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,...,4294967295,4294967295,4294967295,9236,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295
elemento-AAATGCAAA,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,3882,4294967295,4294967295,4294967295,...,4294967295,4294967295,4294967295,9634,8562,4294967295,4294967295,4294967295,4294967295,4294967295
elemento-AAATTGCA,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,6033,4294967295,4294967295,...,4294967295,4294967295,4294967295,4467,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295
elemento-AACAGCTG,4294967295,4294967295,4294967295,4294967295,4294967295,6897,4294967295,4294967295,4294967295,4294967295,...,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295
