In [1]:
import numpy as np

import tfmindi as tm



In [2]:
# Resolve where the motif collection and the motif annotation table live, using
# tfmindi's fetch helpers with their default arguments.
# NOTE(review): presumably these download (or reuse a cached copy of) the
# resources — confirm, since that would make this cell network-dependent.
motif_collection_dir = tm.fetch_motif_collection()
motif_annotations_file = tm.fetch_motif_annotations()

In [3]:
# Read the fetched resources into memory: the motif collection from its directory
# and the annotation table from its file. Both objects are reused by later cells.
motif_collection = tm.load_motif_collection(motif_collection_dir)
motif_annotations = tm.load_motif_annotations(motif_annotations_file)

In [4]:
# Build the motif -> DBD lookup from the annotation table (DBD presumably stands
# for DNA-binding domain — confirm against the tfmindi docs).
# NOTE(review): the variable is spelled `motif_to_db` (no trailing "d"); the
# create_seqlet_adata call further down relies on this exact spelling.
motif_to_db = tm.load_motif_to_dbd(motif_annotations)  # dict of motif to dbd

In [5]:
# Sample inputs shipped with the test data. The paths are relative, so they are
# resolved against the kernel's current working directory.
oh_path = "tests/data/sample_oh.npz"
contrib_path = "tests/data/sample_contrib.npz"

# np.load on a .npz archive returns an NpzFile that keeps the underlying file
# open until it is closed. Using it as a context manager releases the handle as
# soon as the requested array has been read into memory.
with np.load(oh_path) as oh_archive:
    oh = oh_archive["oh"]  # array stored under key "oh"
with np.load(contrib_path) as contrib_archive:
    contrib = contrib_archive["contrib"]  # array stored under key "contrib"

In [6]:
# Extract seqlets from the contribution scores. The call yields a per-seqlet
# table plus a parallel collection of per-seqlet matrices.
seqlets_df, seqlets_matrix = tm.pp.extract_seqlets(
    contrib=contrib,
    oh=oh,
    threshold=0.05,
    additional_flanks=3,
)

Processing seqlets: 100%|██████████| 295/295 [00:00<00:00, 8398.80it/s]


In [7]:
# Bare last expression so the notebook renders the table with its rich repr.
seqlets_df.head()  # preview the first rows (pandas default: 5) of the seqlet table

Unnamed: 0,example_idx,start,end,attribution,p-value
0,24,59,76,-0.467904,1.887379e-15
1,31,178,198,1.294061,0.0002467105
2,28,299,318,-0.457312,0.0007206342
3,24,306,324,0.948738,0.0007408627
4,32,455,480,1.100183,0.001234263


In [8]:
# Sanity check: shape of the first seqlet matrix, then the number of seqlets,
# each printed on its own line.
print(seqlets_matrix[0].shape, len(seqlets_matrix), sep="\n")

(4, 17)
295


In [9]:
# Compare the extracted seqlet matrices against the loaded motif collection.
# NOTE(review): the result appears to hold one row per seqlet and one column per
# motif (its shape is inspected in the next cell) — confirm the similarity
# metric against the tfmindi docs.
sim_matrix = tm.pp.calculate_motif_similarity(seqlets_matrix, motif_collection)

In [10]:
# The first dimension should match len(seqlets_matrix) printed above.
sim_matrix.shape

(295, 17995)

In [11]:
# Bundle everything into a single AnnData object: the similarity matrix and the
# seqlet table go in positionally, while the per-seqlet matrices, the full
# one-hot / contribution arrays and the motif metadata are passed by keyword.
# NOTE(review): `motif_to_db` is the mapping returned by tm.load_motif_to_dbd
# in an earlier cell.
adata = tm.pp.create_seqlet_adata(
    sim_matrix,
    seqlets_df,
    seqlet_matrices=seqlets_matrix,
    oh_sequences=oh,
    contrib_scores=contrib,
    motif_collection=motif_collection,
    motif_annotations=motif_annotations,
    motif_to_dbd=motif_to_db,
)
adata  # bare last expression: display the AnnData summary (obs / var columns)

AnnData object with n_obs × n_vars = 295 × 17995
    obs: 'example_idx', 'start', 'end', 'attribution', 'p-value', 'seqlet_matrix', 'seqlet_oh', 'example_oh', 'example_contrib'
    var: 'motif_pwm', 'Direct_annot', 'Motif_similarity_annot', 'Orthology_annot', 'Motif_similarity_and_Orthology_annot', 'dbd'

In [12]:
# Cluster the seqlets. The return value is not captured; results are read back
# from `adata` afterwards, so `adata` is modified in place (new columns show up
# in adata.obs in the next cell). Re-running this cell therefore operates on an
# already-annotated object.
# NOTE(review): the log reports PCA, a neighborhood graph, t-SNE and Leiden
# clustering. t-SNE and Leiden are typically seed-dependent and no seed is set
# here — confirm whether cluster_seqlets exposes a random-state option so the
# cluster count is reproducible.
tm.tl.cluster_seqlets(adata, resolution=3.0)

Computing PCA...
Computing neighborhood graph...


  from .autonotebook import tqdm as notebook_tqdm


Computing t-SNE embedding...
Performing Leiden clustering with resolution 3.0...
Clustering complete. Found 21 clusters.
DBD annotation coverage: 234/295 seqlets


In [13]:
# Show the first three seqlets to inspect the per-seqlet annotations now present
# in adata.obs after clustering.
adata.obs.head(3)

Unnamed: 0,example_idx,start,end,attribution,p-value,seqlet_matrix,seqlet_oh,example_oh,example_contrib,leiden,mean_contrib,seqlet_dbd,cluster_dbd
0,24,59,76,-0.467904,1.887379e-15,"[[-0.0011337847152989144, 0.039029417622873375...","[[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0,...","[[0.00199191365391016, 0.00013025905354879797,...",0,0.201815,,
1,31,178,198,1.294061,0.0002467105,"[[0.22378262176327485, 0.02587442114950032, -0...","[[1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0,...","[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[-0.001131125376559794, -0.000327722518704831...",1,0.212963,,
2,28,299,318,-0.457312,0.0007206342,"[[0.09934498645434618, -0.09970337874610243, 0...","[[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0,...","[[-0.0005060430848971009, -0.00117746787145733...",2,0.170461,,C2H2 ZF; Homeodomain


In [14]:
# Derive patterns from the clustered AnnData object; the log below reports one
# pass over the clusters found by the clustering step.
# NOTE(review): `patterns` is not used again in the visible cells — confirm
# whether it should be inspected or persisted.
patterns = tm.tl.create_patterns(adata)

Creating patterns for 21 clusters...


In [15]:
# Persist the annotated AnnData object. The file name is relative, so the file
# is written to the kernel's current working directory.
tm.save_h5ad(adata, "sample_adata.h5ad")