# Set-up

In [37]:
import os
import logging
import json
import yaml
import pickle
import numpy as np
import pandas as pd
import xarray as xr
import seqdata as sd
import seqpro as sp

from eugene.prep_dataset.utils import (
    merge_parameters,
    infer_covariate_types,
    run_continuous_correlations,
    run_binary_correlations,
    run_categorical_correlations,
)
from eugene.prep_dataset.tracks import default_params

import polygraph.sequence
from tangermeme.match import extract_matching_loci
from tangermeme.tools.fimo import fimo
import scanpy as sc
from anndata import AnnData
from scipy.io import mmwrite

from sklearn.model_selection import KFold
from sklearn.decomposition import NMF

In [38]:
print("SeqData version: ", sd.__version__)
print("SeqPro version: ", sp.__version__)

SeqData version:  0.0.0
SeqPro version:  0.1.15


In [None]:
path_params = "/cellar/users/aklie/projects/ML4GLand/EUGENe/examples/prep_dataset/SC.delta/SC.delta.sample.yaml"
path_out = "/cellar/users/aklie/projects/ML4GLand/EUGENe/examples/prep_dataset/SC.delta/out"
overwrite = True

# Parameters

In [40]:
with open(path_params, "r") as f:
    params = yaml.safe_load(f)

params = merge_parameters(path_params, default_params)
params

{'name': 'SC.delta.sample',
 'threads': 4,
 'random_state': 1234,
 'seqdata': {'fasta': '/cellar/users/aklie/data/ref/genomes/hg38/hg38.fa',
  'seq_var': 'seq',
  'bws': ['/cellar/users/aklie/projects/ML4GLand/EUGENe/examples/prep_dataset/SC.delta/SC.delta_unstranded.bw'],
  'bw_names': ['SC.delta'],
  'cov_var': 'cov',
  'loci': '/cellar/users/aklie/projects/ML4GLand/EUGENe/examples/prep_dataset/SC.delta/SC.delta.sample.narrowPeak',
  'batch_size': 1000,
  'fixed_length': 2114,
  'target_length': 1000,
  'alphabet': 'DNA',
  'upper_case': False,
  'add_rev_comp': False,
  'max_jitter': 512,
  'overwrite': False},
 'negatives': {'gc_bin_width': 0.02,
  'max_n_perc': 0.1,
  'signal': '/cellar/users/aklie/projects/ML4GLand/EUGENe/examples/prep_dataset/SC.delta/SC.delta_unstranded.bw',
  'signal_beta': 0.5,
  'in_window': 2114,
  'out_window': 1000,
  'random_state': 1234},
 'splits': '/cellar/users/aklie/projects/ML4GLand/tutorials/data/splits/ENCODE_cross-val.json',
 'kmer_analysis': {'

In [41]:
# Infer seqpro alphabet
if params["seqdata"]["alphabet"] == "DNA":
    alphabet = sp.DNA
elif params["seqdata"]["alphabet"] == "RNA":
    alphabet = sp.RNA

In [42]:
# Grab params
name = params["name"]
threads = params["threads"]
random_state = params["random_state"]
loci = params["seqdata"]["loci"]
fasta = params["seqdata"]["fasta"]
seq_var = params["seqdata"]["seq_var"]
bws = params["seqdata"]["bws"]
bw_names = params["seqdata"]["bw_names"]
cov_var = params["seqdata"]["cov_var"]
batch_size = params["seqdata"]["batch_size"]
fixed_length = params["seqdata"]["fixed_length"]
max_jitter = params["seqdata"]["max_jitter"]

# Logging

In [43]:
# Set-up logging with date and time
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [44]:
# Log parameters
logging.info("Parameters:")
for key, value in params.items():
    logging.info(f"  {key}")
    if isinstance(value, dict):
        for key, value in value.items():
            logging.info(f"    {key}: {value}")

2024-11-17 13:32:20,664 - INFO - Parameters:
2024-11-17 13:32:20,665 - INFO -   name
2024-11-17 13:32:20,665 - INFO -   threads
2024-11-17 13:32:20,665 - INFO -   random_state
2024-11-17 13:32:20,666 - INFO -   seqdata
2024-11-17 13:32:20,666 - INFO -     fasta: /cellar/users/aklie/data/ref/genomes/hg38/hg38.fa
2024-11-17 13:32:20,666 - INFO -     seq_var: seq
2024-11-17 13:32:20,666 - INFO -     bws: ['/cellar/users/aklie/projects/ML4GLand/EUGENe/examples/prep_dataset/SC.delta/SC.delta_unstranded.bw']
2024-11-17 13:32:20,667 - INFO -     bw_names: ['SC.delta']
2024-11-17 13:32:20,667 - INFO -     cov_var: cov
2024-11-17 13:32:20,667 - INFO -     loci: /cellar/users/aklie/projects/ML4GLand/EUGENe/examples/prep_dataset/SC.delta/SC.delta.sample.narrowPeak
2024-11-17 13:32:20,668 - INFO -     batch_size: 1000
2024-11-17 13:32:20,668 - INFO -     fixed_length: 2114
2024-11-17 13:32:20,669 - INFO -     target_length: 1000
2024-11-17 13:32:20,669 - INFO -     alphabet: DNA
2024-11-17 13:32:2

# Load SeqData

In [45]:
out = os.path.join(path_out, f"{name}.seqdata")
logging.info(f"Writing to {out}")

2024-11-17 13:32:21,712 - INFO - Writing to /cellar/users/aklie/projects/ML4GLand/EUGENe/examples/prep_dataset/SC.delta/SC.delta/SC.delta.sample.seqdata


In [None]:
sdata_loci = sd.from_region_files(
    sd.GenomeFASTA(
        seq_var,
        fasta,
        batch_size=batch_size,
        n_threads=threads,
    ),
    sd.BigWig(
        cov_var,
        bws,
        bw_names,
        batch_size=batch_size,
        n_jobs=threads,
        threads_per_job=len(bws),
    ),
    path=out,
    fixed_length=fixed_length,
    bed=loci,
    overwrite=overwrite,
    max_jitter=max_jitter,
)
sdata_loci["type"] = xr.DataArray(["region"] * sdata_loci.dims["_sequence"], dims=["_sequence"])
sdata_loci.coords["_sequence"] = np.array([f"region_{i}" for i in range(sdata_loci.dims["_sequence"])])
sdata_loci

100%|██████████| 1000/1000 [00:00<00:00, 30067.99it/s]
100%|██████████| 1000/1000 [00:02<00:00, 342.26it/s]
  sdata_loci["type"] = xr.DataArray(["region"] * sdata_loci.dims["_sequence"], dims=["_sequence"])
  sdata_loci.coords["_sequence"] = np.array([f"region_{i}" for i in range(sdata_loci.dims["_sequence"])])


Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type int64 numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type int64 numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,11.97 MiB,11.97 MiB
Shape,"(1000, 1, 3138)","(1000, 1, 3138)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 11.97 MiB 11.97 MiB Shape (1000, 1, 3138) (1000, 1, 3138) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",3138  1  1000,

Unnamed: 0,Array,Chunk
Bytes,11.97 MiB,11.97 MiB
Shape,"(1000, 1, 3138)","(1000, 1, 3138)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type int64 numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.99 MiB,2.99 MiB
Shape,"(1000, 3138)","(1000, 3138)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 2.99 MiB 2.99 MiB Shape (1000, 3138) (1000, 3138) Dask graph 1 chunks in 2 graph layers Data type |S1 numpy.ndarray",3138  1000,

Unnamed: 0,Array,Chunk
Bytes,2.99 MiB,2.99 MiB
Shape,"(1000, 3138)","(1000, 3138)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray


# Negatives

In [47]:
negative_loci = extract_matching_loci(
	loci=loci,
	fasta=fasta,
	gc_bin_width=params["negatives"]["gc_bin_width"],
	max_n_perc=params["negatives"]["max_n_perc"],
	bigwig=params["negatives"]["signal"],
	signal_beta=params["negatives"]["signal_beta"],
	in_window=params["negatives"]["in_window"],
	out_window=params["negatives"]["out_window"],
	chroms=None,
	random_state=random_state,
	verbose=True
)
negative_loci.head()

Processing given loci.


Getting counts: 999it [00:00, 20925.65it/s]
Getting N percentages: 999it [00:00, 8282.98it/s]
Getting GC percentages: 999it [00:00, 8678.37it/s]
Getting background GC: 100%|██████████| 24/24 [00:19<00:00,  1.22it/s]


GC Bin	Background Count	Peak Count	Chosen Count
0.00:        0	       0	       0
0.02:        0	       0	       0
0.04:        1	       0	       0
0.06:        6	       0	       0
0.08:       11	       0	       0
0.10:       15	       0	       0
0.12:       21	       0	       0
0.14:       31	       0	       0
0.16:       44	       0	       0
0.18:       41	       0	       0
0.20:       77	       0	       0
0.22:      135	       0	       0
0.24:      256	       0	       0
0.26:     1393	       0	       0
0.28:     8371	       2	       2
0.30:    31207	       2	       2
0.32:    72204	       9	       9
0.34:   120919	      20	      20
0.36:   164038	      29	      29
0.38:   186880	      51	      51
0.40:   161881	      67	      67
0.42:   122096	      73	      73
0.44:    91926	      57	      57
0.46:    66402	      66	      66
0.48:    41538	      63	      63
0.50:    23875	      64	      64
0.52:    13142	      66	      66
0.54:     7336	      77	      77
0.56:     4334	      57	    

Getting counts: 999it [00:00, 22067.73it/s]

Peak Robust Signal Minimum: 13.0
Matched Signal Maximum: 6.0





Unnamed: 0,chrom,start,end
630,chr1,769496,771610
927,chr1,2798936,2801050
924,chr1,2801050,2803164
900,chr1,2807392,2809506
666,chr1,2872926,2875040


In [48]:
# Write negative loci to bed file
negatives_bed = os.path.join(path_out, f"{name}.negatives.bed")
negative_loci.to_csv(negatives_bed, sep="\t", header=False, index=False)

In [49]:
# Define negative seqdata out path
negatives_out = os.path.join(path_out, f"{name}.negatives.seqdata")

In [50]:
# Build SeqData from negatives
sdata_neg = sd.from_region_files(
    sd.GenomeFASTA(
        seq_var,
        fasta,
        batch_size=batch_size,
        n_threads=threads,
    ),
    sd.BigWig(
        cov_var,
        bws,
        bw_names,
        batch_size=batch_size,
        n_jobs=threads,
        threads_per_job=len(bws),
    ),
    path=negatives_out,
    fixed_length=fixed_length,
    bed=negatives_bed,
    overwrite=True,
    max_jitter=max_jitter,
)
sdata_neg["type"] = xr.DataArray(["negative"] * sdata_neg.dims["_sequence"], dims=["_sequence"])
sdata_neg.coords["_sequence"] = np.array([f"negative_{i}" for i in range(sdata_neg.dims["_sequence"])])
sdata_neg

100%|██████████| 1000/1000 [00:00<00:00, 33734.17it/s]
100%|██████████| 1000/1000 [00:00<00:00, 10229.61it/s]
  sdata_neg["type"] = xr.DataArray(["negative"] * sdata_neg.dims["_sequence"], dims=["_sequence"])
  sdata_neg.coords["_sequence"] = np.array([f"negative_{i}" for i in range(sdata_neg.dims["_sequence"])])


Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type int64 numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type int64 numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,11.97 MiB,11.97 MiB
Shape,"(1000, 1, 3138)","(1000, 1, 3138)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 11.97 MiB 11.97 MiB Shape (1000, 1, 3138) (1000, 1, 3138) Dask graph 1 chunks in 2 graph layers Data type float32 numpy.ndarray",3138  1  1000,

Unnamed: 0,Array,Chunk
Bytes,11.97 MiB,11.97 MiB
Shape,"(1000, 1, 3138)","(1000, 1, 3138)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.99 MiB,2.99 MiB
Shape,"(1000, 3138)","(1000, 3138)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 2.99 MiB 2.99 MiB Shape (1000, 3138) (1000, 3138) Dask graph 1 chunks in 2 graph layers Data type |S1 numpy.ndarray",3138  1000,

Unnamed: 0,Array,Chunk
Bytes,2.99 MiB,2.99 MiB
Shape,"(1000, 3138)","(1000, 3138)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 7.81 kiB 7.81 kiB Shape (1000,) (1000,) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",1000  1,

Unnamed: 0,Array,Chunk
Bytes,7.81 kiB,7.81 kiB
Shape,"(1000,)","(1000,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray


# Concatenate (if necessary)

In [51]:
sdata = xr.concat([sdata_loci, sdata_neg], dim="_sequence")
sdata

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 15.62 kiB 7.81 kiB Shape (2000,) (1000,) Dask graph 2 chunks in 5 graph layers Data type object numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 7.81 kiB Shape (2000,) (1000,) Dask graph 2 chunks in 5 graph layers Data type int64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 7.81 kiB Shape (2000,) (1000,) Dask graph 2 chunks in 5 graph layers Data type int64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,23.94 MiB,11.97 MiB
Shape,"(2000, 1, 3138)","(1000, 1, 3138)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 23.94 MiB 11.97 MiB Shape (2000, 1, 3138) (1000, 1, 3138) Dask graph 2 chunks in 5 graph layers Data type float32 numpy.ndarray",3138  1  2000,

Unnamed: 0,Array,Chunk
Bytes,23.94 MiB,11.97 MiB
Shape,"(2000, 1, 3138)","(1000, 1, 3138)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 15.62 kiB 15.62 kiB Shape (2000,) (2000,) Dask graph 1 chunks in 7 graph layers Data type object numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 15.62 kiB Shape (2000,) (2000,) Dask graph 1 chunks in 7 graph layers Data type float64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 8 graph layers,1 chunks in 8 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 15.62 kiB Shape (2000,) (2000,) Dask graph 1 chunks in 8 graph layers Data type float64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 8 graph layers,1 chunks in 8 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 15.62 kiB Shape (2000,) (2000,) Dask graph 1 chunks in 7 graph layers Data type float64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 15.62 kiB Shape (2000,) (2000,) Dask graph 1 chunks in 7 graph layers Data type float64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.99 MiB,2.99 MiB
Shape,"(2000, 3138)","(1000, 3138)"
Dask graph,2 chunks in 7 graph layers,2 chunks in 7 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 5.99 MiB 2.99 MiB Shape (2000, 3138) (1000, 3138) Dask graph 2 chunks in 7 graph layers Data type |S1 numpy.ndarray",3138  2000,

Unnamed: 0,Array,Chunk
Bytes,5.99 MiB,2.99 MiB
Shape,"(2000, 3138)","(1000, 3138)"
Dask graph,2 chunks in 7 graph layers,2 chunks in 7 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 15.62 kiB Shape (2000,) (2000,) Dask graph 1 chunks in 7 graph layers Data type float64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 15.62 kiB 7.81 kiB Shape (2000,) (1000,) Dask graph 2 chunks in 5 graph layers Data type object numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,object numpy.ndarray,object numpy.ndarray


In [None]:
# # Concatenate the two datasets
# sdata = xr.concat([sdata_loci, sdata_neg], dim="_sequence")
# # https://github.com/pydata/xarray/issues/3476#issuecomment-1115045538
# for v in list(sdata.coords.keys()):
#     if sdata.coords[v].dtype == object:
#         sdata.coords[v] = sdata.coords[v].astype("unicode")
# for v in list(sdata.variables.keys()):
#     if sdata[v].dtype == object:
#         sdata[v] = sdata[v].astype("unicode")
# sdata

  common_dims = tuple(pd.unique([d for v in vars for d in v.dims]))
  common_dims = tuple(pd.unique([d for v in vars for d in v.dims]))
  common_dims = tuple(pd.unique([d for v in vars for d in v.dims]))
  common_dims = tuple(pd.unique([d for v in vars for d in v.dims]))
  common_dims = tuple(pd.unique([d for v in vars for d in v.dims]))
  common_dims = tuple(pd.unique([d for v in vars for d in v.dims]))
  common_dims = tuple(pd.unique([d for v in vars for d in v.dims]))


Unnamed: 0,Array,Chunk
Bytes,2.39 MiB,153.02 kiB
Shape,"(626692,)","(39174,)"
Dask graph,16 chunks in 6 graph layers,16 chunks in 6 graph layers
Data type,,
"Array Chunk Bytes 2.39 MiB 153.02 kiB Shape (626692,) (39174,) Dask graph 16 chunks in 6 graph layers Data type",626692  1,

Unnamed: 0,Array,Chunk
Bytes,2.39 MiB,153.02 kiB
Shape,"(626692,)","(39174,)"
Dask graph,16 chunks in 6 graph layers,16 chunks in 6 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,306.05 kiB
Shape,"(626692,)","(39174,)"
Dask graph,16 chunks in 5 graph layers,16 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 4.78 MiB 306.05 kiB Shape (626692,) (39174,) Dask graph 16 chunks in 5 graph layers Data type int64 numpy.ndarray",626692  1,

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,306.05 kiB
Shape,"(626692,)","(39174,)"
Dask graph,16 chunks in 5 graph layers,16 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,306.05 kiB
Shape,"(626692,)","(39174,)"
Dask graph,16 chunks in 5 graph layers,16 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 4.78 MiB 306.05 kiB Shape (626692,) (39174,) Dask graph 16 chunks in 5 graph layers Data type int64 numpy.ndarray",626692  1,

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,306.05 kiB
Shape,"(626692,)","(39174,)"
Dask graph,16 chunks in 5 graph layers,16 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,7.33 GiB,11.97 MiB
Shape,"(626692, 1, 3138)","(1000, 1, 3138)"
Dask graph,628 chunks in 5 graph layers,628 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 7.33 GiB 11.97 MiB Shape (626692, 1, 3138) (1000, 1, 3138) Dask graph 628 chunks in 5 graph layers Data type float32 numpy.ndarray",3138  1  626692,

Unnamed: 0,Array,Chunk
Bytes,7.33 GiB,11.97 MiB
Shape,"(626692, 1, 3138)","(1000, 1, 3138)"
Dask graph,628 chunks in 5 graph layers,628 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.39 MiB,1.34 MiB
Shape,"(626692,)","(352474,)"
Dask graph,8 chunks in 8 graph layers,8 chunks in 8 graph layers
Data type,,
"Array Chunk Bytes 2.39 MiB 1.34 MiB Shape (626692,) (352474,) Dask graph 8 chunks in 8 graph layers Data type",626692  1,

Unnamed: 0,Array,Chunk
Bytes,2.39 MiB,1.34 MiB
Shape,"(626692,)","(352474,)"
Dask graph,8 chunks in 8 graph layers,8 chunks in 8 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,2.69 MiB
Shape,"(626692,)","(352474,)"
Dask graph,8 chunks in 7 graph layers,8 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.78 MiB 2.69 MiB Shape (626692,) (352474,) Dask graph 8 chunks in 7 graph layers Data type float64 numpy.ndarray",626692  1,

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,2.69 MiB
Shape,"(626692,)","(352474,)"
Dask graph,8 chunks in 7 graph layers,8 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,2.69 MiB
Shape,"(626692,)","(352474,)"
Dask graph,8 chunks in 8 graph layers,8 chunks in 8 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.78 MiB 2.69 MiB Shape (626692,) (352474,) Dask graph 8 chunks in 8 graph layers Data type float64 numpy.ndarray",626692  1,

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,2.69 MiB
Shape,"(626692,)","(352474,)"
Dask graph,8 chunks in 8 graph layers,8 chunks in 8 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,2.69 MiB
Shape,"(626692,)","(352474,)"
Dask graph,8 chunks in 7 graph layers,8 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.78 MiB 2.69 MiB Shape (626692,) (352474,) Dask graph 8 chunks in 7 graph layers Data type float64 numpy.ndarray",626692  1,

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,2.69 MiB
Shape,"(626692,)","(352474,)"
Dask graph,8 chunks in 7 graph layers,8 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,2.69 MiB
Shape,"(626692,)","(352474,)"
Dask graph,8 chunks in 7 graph layers,8 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.78 MiB 2.69 MiB Shape (626692,) (352474,) Dask graph 8 chunks in 7 graph layers Data type float64 numpy.ndarray",626692  1,

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,2.69 MiB
Shape,"(626692,)","(352474,)"
Dask graph,8 chunks in 7 graph layers,8 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.83 GiB,2.99 MiB
Shape,"(626692, 3138)","(1000, 3138)"
Dask graph,628 chunks in 5 graph layers,628 chunks in 5 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 1.83 GiB 2.99 MiB Shape (626692, 3138) (1000, 3138) Dask graph 628 chunks in 5 graph layers Data type |S1 numpy.ndarray",3138  626692,

Unnamed: 0,Array,Chunk
Bytes,1.83 GiB,2.99 MiB
Shape,"(626692, 3138)","(1000, 3138)"
Dask graph,628 chunks in 5 graph layers,628 chunks in 5 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,2.69 MiB
Shape,"(626692,)","(352474,)"
Dask graph,8 chunks in 7 graph layers,8 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 4.78 MiB 2.69 MiB Shape (626692,) (352474,) Dask graph 8 chunks in 7 graph layers Data type float64 numpy.ndarray",626692  1,

Unnamed: 0,Array,Chunk
Bytes,4.78 MiB,2.69 MiB
Shape,"(626692,)","(352474,)"
Dask graph,8 chunks in 7 graph layers,8 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.39 MiB,153.02 kiB
Shape,"(626692,)","(39174,)"
Dask graph,16 chunks in 6 graph layers,16 chunks in 6 graph layers
Data type,,
"Array Chunk Bytes 2.39 MiB 153.02 kiB Shape (626692,) (39174,) Dask graph 16 chunks in 6 graph layers Data type",626692  1,

Unnamed: 0,Array,Chunk
Bytes,2.39 MiB,153.02 kiB
Shape,"(626692,)","(39174,)"
Dask graph,16 chunks in 6 graph layers,16 chunks in 6 graph layers
Data type,,


# Splits

In [52]:
splits = params["splits"]

In [53]:
# Read splits
with open(params["splits"], "r") as infile:
    splits = json.load(infile)

In [54]:
# Turn the above structure into a dataframe where rows are chromosomes and columns are the fold
unique_chroms = set()
folds = []
for fold in splits:
    unique_chroms.update(splits[fold]["train"])
    unique_chroms.update(splits[fold]["valid"])
    unique_chroms.update(splits[fold]["test"])
    folds.append(fold)
df = pd.DataFrame(index=sorted(unique_chroms), columns=sorted(folds))
for fold in splits:
    for chrom in splits[fold]["train"]:
        df.loc[chrom, fold] = "train"
    for chrom in splits[fold]["valid"]:
        df.loc[chrom, fold] = "valid"
    for chrom in splits[fold]["test"]:
        df.loc[chrom, fold] = "test"

# Create dictionary where keys are folds and values are numpy arrays of splits for each sequence in sdata
split_mp = {}
for fold in sorted(splits):
    sdata[fold] = xr.DataArray(np.array([df.loc[chrom, fold] for chrom in sdata.chrom.values]), dims=["_sequence"])
sdata

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 15.62 kiB 7.81 kiB Shape (2000,) (1000,) Dask graph 2 chunks in 5 graph layers Data type object numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 7.81 kiB Shape (2000,) (1000,) Dask graph 2 chunks in 5 graph layers Data type int64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 7.81 kiB Shape (2000,) (1000,) Dask graph 2 chunks in 5 graph layers Data type int64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,23.94 MiB,11.97 MiB
Shape,"(2000, 1, 3138)","(1000, 1, 3138)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 23.94 MiB 11.97 MiB Shape (2000, 1, 3138) (1000, 1, 3138) Dask graph 2 chunks in 5 graph layers Data type float32 numpy.ndarray",3138  1  2000,

Unnamed: 0,Array,Chunk
Bytes,23.94 MiB,11.97 MiB
Shape,"(2000, 1, 3138)","(1000, 1, 3138)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 15.62 kiB 15.62 kiB Shape (2000,) (2000,) Dask graph 1 chunks in 7 graph layers Data type object numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 15.62 kiB Shape (2000,) (2000,) Dask graph 1 chunks in 7 graph layers Data type float64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 8 graph layers,1 chunks in 8 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 15.62 kiB Shape (2000,) (2000,) Dask graph 1 chunks in 8 graph layers Data type float64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 8 graph layers,1 chunks in 8 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 15.62 kiB Shape (2000,) (2000,) Dask graph 1 chunks in 7 graph layers Data type float64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 15.62 kiB Shape (2000,) (2000,) Dask graph 1 chunks in 7 graph layers Data type float64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.99 MiB,2.99 MiB
Shape,"(2000, 3138)","(1000, 3138)"
Dask graph,2 chunks in 7 graph layers,2 chunks in 7 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 5.99 MiB 2.99 MiB Shape (2000, 3138) (1000, 3138) Dask graph 2 chunks in 7 graph layers Data type |S1 numpy.ndarray",3138  2000,

Unnamed: 0,Array,Chunk
Bytes,5.99 MiB,2.99 MiB
Shape,"(2000, 3138)","(1000, 3138)"
Dask graph,2 chunks in 7 graph layers,2 chunks in 7 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 15.62 kiB 15.62 kiB Shape (2000,) (2000,) Dask graph 1 chunks in 7 graph layers Data type float64 numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,15.62 kiB
Shape,"(2000,)","(2000,)"
Dask graph,1 chunks in 7 graph layers,1 chunks in 7 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 15.62 kiB 7.81 kiB Shape (2000,) (1000,) Dask graph 2 chunks in 5 graph layers Data type object numpy.ndarray",2000  1,

Unnamed: 0,Array,Chunk
Bytes,15.62 kiB,7.81 kiB
Shape,"(2000,)","(1000,)"
Dask graph,2 chunks in 5 graph layers,2 chunks in 5 graph layers
Data type,object numpy.ndarray,object numpy.ndarray


# Save minimal SeqData

In [55]:
if os.path.exists(out.replace(".seqdata", ".minimal.seqdata")):
    if overwrite:
        import shutil
        logging.info("Removing existing minimal SeqData")
        shutil.rmtree(out.replace(".seqdata", ".minimal.seqdata"))
    else:
        raise ValueError("Minimal SeqData already exists. Set overwrite to true in config to overwrite.")
sd.to_zarr(sdata, out.replace(".seqdata", ".minimal.seqdata"))

  sdata.to_zarr(


In [56]:
# Save
sd.to_zarr(sdata, out.replace(".seqdata", ".minimal.seqdata"), mode="w")

  sdata.to_zarr(


In [57]:
# Test for corruption
sdata = sd.open_zarr(out.replace(".seqdata", ".minimal.seqdata"))
sdata["seq"][0].values

array([b'T', b'T', b'T', ..., b'G', b'T', b'G'], dtype='|S1')

# One-hot encode

In [26]:
sdata["ohe"] = xr.DataArray(sp.ohe(sdata[seq_var].values, alphabet=alphabet), dims=["_sequence", "_length", "_alphabet"])
sdata.coords["_alphabet"] = alphabet.array

# Sequence analysis pipeline

## Get seqs, targets and metadata

In [27]:
target_length = params["seqdata"]["target_length"]

In [28]:
seqs_start = (sdata.dims["_length"] // 2) - (fixed_length // 2)
counts_start = (sdata.dims["_length"] // 2) - (target_length // 2)
seqs_start, counts_start

(512, 1069)

In [29]:
# Get the number of sequences and the fixed length of each sequence
seqs = sdata[seq_var].values
seqs = seqs[:, seqs_start:seqs_start + fixed_length]
dims = seqs.shape

In [30]:
seqs.shape

(2000, 2114)

In [31]:
if len(dims) == 2:
    seqs = seqs.view('S{}'.format(dims[1])).ravel().astype(str)
seqs.shape

(2000,)

In [32]:
# Get all vars that are not the sequence or the coverage
other_vars = [v for v in sdata.variables if v not in [seq_var, cov_var, "ohe"]]

In [33]:
metadata = sdata.drop_vars([seq_var, cov_var, "ohe", "_alphabet", "cov_sample"]).to_dataframe()
metadata

Unnamed: 0_level_0,chrom,chromEnd,chromStart,name,pValue,peak,qValue,score,signalValue,strand,type,fold_0,fold_1,fold_2,fold_3,fold_4
_sequence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
peak_0,chr1,1835244,1832106,.,15.20758,140.0,12.776564,127.0,6.818182,.,peak,test,train,train,train,train
peak_1,chr1,1945729,1942591,.,38.89975,180.0,36.138088,361.0,10.392157,.,peak,test,train,train,train,train
peak_2,chr1,2548599,2545461,.,12.12079,155.0,9.762530,97.0,2.902241,.,peak,test,train,train,train,train
peak_3,chr1,3942178,3939040,.,8.38514,149.0,6.141801,61.0,5.371901,.,peak,test,train,train,train,train
peak_4,chr1,9006865,9003727,.,13.29031,117.0,10.903119,109.0,6.052632,.,peak,test,train,train,train,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
negative_995,chrY,18331006,18327868,,,,,,,+,negative,train,train,test,train,train
negative_996,chrY,22647794,22644656,,,,,,,+,negative,train,train,test,train,train
negative_997,chrY,23085392,23082254,,,,,,,+,negative,train,train,test,train,train
negative_998,chrY,25226874,25223736,,,,,,,+,negative,train,train,test,train,train


In [34]:
covariate_types = infer_covariate_types(metadata)

In [35]:
covariate_types

{'chrom': 'categorical',
 'chromEnd': 'continuous',
 'chromStart': 'continuous',
 'name': 'binary',
 'pValue': 'continuous',
 'peak': 'continuous',
 'qValue': 'continuous',
 'score': 'continuous',
 'signalValue': 'continuous',
 'strand': 'binary',
 'type': 'binary',
 'fold_0': 'categorical',
 'fold_1': 'categorical',
 'fold_2': 'categorical',
 'fold_3': 'categorical',
 'fold_4': 'categorical'}

## Sequence length distributions

In [36]:
sdata["length"] = xr.DataArray(sp.length(seqs), dims=["_sequence"])
covariate_types["length"] = "continuous"

## Character distributions

In [37]:
# Get unique characters in the sequences with numpy
unique_chars = np.unique(list("".join(seqs)))
unique_chars

array(['A', 'C', 'G', 'T'], dtype='<U1')

In [38]:
sdata["alphabet_cnt"] = xr.DataArray(sp.nucleotide_content(seqs, normalize=False, alphabet=alphabet, length_axis=-1), dims=["_sequence", "_alphabet"])
sdata["non_alphabet_cnt"] = sdata["length"] - sdata["alphabet_cnt"].sum(axis=-1)
if params["seqdata"]["alphabet"] == "DNA" or params["seqdata"]["alphabet"] == "RNA":
    sdata["gc_percent"] = sdata["alphabet_cnt"].sel(_alphabet=[b"G", b"C"]).sum(axis=-1) / sdata["length"]
    covariate_types["gc_percent"] = "continuous"

## Summed counts

In [39]:
cov = sdata[cov_var].values
cov = cov[..., counts_start:counts_start + target_length]
cov.shape

(2000, 1, 1000)

In [40]:
total_counts = cov.sum(axis=(1,2))

In [41]:
sdata["total_counts"] = xr.DataArray(total_counts, dims=["_sequence"])

In [42]:
sdata

## K-mer distribution analysis

In [43]:
import polygraph.sequence
import seqpro as sp
from seqpro._analyzers import count_kmers_seq

In [44]:
ks = params["kmer_analysis"]["k"]
normalize = params["kmer_analysis"]["normalize"]

In [45]:
selected_covariates = ['chrom', 'type', 'strand', 'score']

In [48]:
# Structure of output is nested dictionary with 
# level 1 keys: kmer length, level 1 values: dictionary with
# level 2 keys: covariate type, level 2 values: dictionary with
# level 3 keys: covariate name, level 3 values: pandas DataFrame with stats
kmer_res = {}
for k in ks:

    # Compute the k-mer frequencies
    kmers = polygraph.sequence.kmer_frequencies(seqs=seqs.tolist(), k=k, normalize=False)

    # Add the k-mer counts to the seqdata
    sdata[f"{k}mer_cnt"] = xr.DataArray(kmers.values, dims=["_sequence", f"_{k}mer"])
    sdata.coords[f"_{k}mer"] = kmers.columns

    # If normalize, normalize the k-mer counts by sequence lengths
    if normalize:
        kmers = kmers.div(sdata["length"].values - k + 1, axis=0)

    # Run PCA on the k-mer counts
    ad = AnnData(kmers, obs=sdata[covariate_types.keys()].to_pandas(), var=sdata[f"_{k}mer"].to_pandas().index.to_frame().drop(f"_{k}mer", axis=1))
    ad = ad[:, ad.X.sum(0) > 0]
    sc.pp.pca(ad)
    ad.write_h5ad(f"{out.replace('.seqdata', '')}.{k}mer.h5ad")

    # For each covariate, run correlations with each k-mer count
    continuous_res = {}
    binary_res = {}
    categorical_res = {}
    diff_res = {}
    for covariate in selected_covariates:
        print(f"Running correlations for {k}-mers with {covariate}")
        # For each continuous variable, run correlations with each count
        if covariate_types[covariate] == "continuous":
            corrs, pvals = run_continuous_correlations(
                cnts=sdata[f"{k}mer_cnt"].values,
                covariate=sdata[covariate].values,
                method="pearson",
            )
            continuous_res[covariate] = pd.DataFrame(
                {
                    f"{k}mer": sdata.coords[f"_{k}mer"].values,
                    "corr": corrs,
                    "pval": pvals,
                }
            )
            continuous_res[covariate] = continuous_res[covariate].sort_values("corr", ascending=False)

        # For each binary variable, run correlations with each count
        elif covariate_types[covariate] == "binary":
            covariate_ = sdata[covariate].values
            covariate_ = np.where(covariate_ == covariate_[0], 0, 1)
            corrs, pvals = run_binary_correlations(
                cnts=sdata[f"{k}mer_cnt"].values,
                binary=covariate_,
                method="mannwhitneyu",
            )
            binary_res[covariate] = pd.DataFrame(
                {
                    f"{k}mer": sdata.coords[f"_{k}mer"].values,
                    "corr": corrs,
                    "pval": pvals,
                }
            )
            binary_res[covariate] = binary_res[covariate].sort_values("corr", ascending=False)

        # For each categorical variable, run correlations with each count
        elif covariate_types[covariate] == "categorical":

            # Run the correlation
            corrs, pvals = run_categorical_correlations(
                cnts=sdata[f"{k}mer_cnt"].values,
                categorical=sdata[covariate].values,
                method="kruskal",
            )
            categorical_res[covariate] = pd.DataFrame(
                {
                    f"{k}mer": sdata.coords[f"_{k}mer"].values,
                    "corr": corrs,
                    "pval": pvals,
                }
            )
            categorical_res[covariate] = categorical_res[covariate].sort_values("corr", ascending=False)
        
            # Run the differential analysis
            sc.tl.rank_genes_groups(
                ad,
                groupby=covariate,
                groups="all",
                reference="rest",
                rankby_abs=True,
                method="wilcoxon",
            )
            
            # Get the variable names
            diff = pd.DataFrame(ad.uns["rank_genes_groups"]["names"]).melt(var_name="group")

            # Get the statistics
            diff["score"] = pd.DataFrame(ad.uns["rank_genes_groups"]["scores"]).melt()["value"]
            diff["padj"] = pd.DataFrame(ad.uns["rank_genes_groups"]["pvals_adj"]).melt()["value"]
            diff["log2FC"] = pd.DataFrame(ad.uns["rank_genes_groups"]["logfoldchanges"]).melt()["value"]
            diff_res[covariate] = diff

    # Add to results
    kmer_res[k] = {
        "continuous": continuous_res,
        "binary": binary_res,
        "categorical": categorical_res,
        "diff": diff_res,
    }

  adata.obsm[key_obsm] = X_pca


Running correlations for 5-mers with chrom


... storing 'chrom' as categorical
... storing 'name' as categorical
... storing 'strand' as categorical
... storing 'type' as categorical
... storing 'fold_0' as categorical
... storing 'fold_1' as categorical
... storing 'fold_2' as categorical
... storing 'fold_3' as categorical
... storing 'fold_4' as categorical
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  se

Running correlations for 5-mers with type
Running correlations for 5-mers with strand
Running correlations for 5-mers with score


In [80]:
corrs, pvals = run_continuous_correlations(
    cnts=sdata[f"5mer_cnt"].values,
    covariate=sdata["score"].values,
    method="pearson",
)

In [53]:
kmer_res[5]["binary"]['type'].iloc[:10]

Unnamed: 0,5mer,corr,pval
490,CTGGG,3878500.0,0.0
338,CCCAG,3874000.0,0.0
378,CCTGG,3841500.0,0.0
330,CCAGG,3828500.0,0.0
936,TGGGA,3818500.0,0.0
852,TCCCA,3817000.0,0.0
482,CTGAG,3816500.0,0.0
290,CAGAG,3815000.0,0.0
296,CAGGA,3812000.0,0.0
914,TGCAG,3810500.0,0.0


## Motif analysis

In [54]:
from tangermeme.tools.fimo import fimo

In [57]:
meme_file = params["motif_analysis"]["motif_database"]
sig = float(params["motif_analysis"]["sig"])

In [58]:
# Perform FIMO
X = sp.ohe(seqs, alphabet=alphabet).transpose(0, 2, 1)
hits = fimo(meme_file, X) 

# Count up significant occurences of motif
motif_match_df = pd.concat([hit for hit in hits])
motif_match_df_ = motif_match_df.loc[motif_match_df["p-value"] < sig]
print(f"There are {motif_match_df_.shape[0]} significant motif matches.")
motif_match_df_ = motif_match_df.value_counts(subset=['sequence_name', "motif_name"]).reset_index()
motif_match_df_.columns = ['sequence_name', "motif_name", 'motif_count']
motif_match_df_ = motif_match_df_.pivot(index='sequence_name', columns="motif_name", values='motif_count')
motif_count_df = pd.DataFrame(index=range(len(seqs)), columns=motif_match_df_.columns)
motif_count_df.loc[motif_match_df_.index.values] = motif_match_df_
motif_count_df = motif_count_df.fillna(0)

# Add to seqdata
sdata["motif_cnt"] = xr.DataArray(motif_count_df.values, dims=["_sequence", "_motif"])
sdata.coords["_motif"] = motif_count_df.columns.values
sdata.attrs["motif_database"] = meme_file

# If normalize, normalize the motif counts by sequence lengths
if normalize:
    motif_count_df = motif_count_df.div(sdata["length"].values, axis=0)

# Run PCA on the motif counts
motif_ad = AnnData(motif_count_df.values, obs=sdata[covariate_types.keys()].to_pandas(), var=pd.DataFrame(index=sdata.coords["_motif"].values))
motif_ad = motif_ad[:, motif_ad.X.sum(0) > 0]
sc.pp.pca(motif_ad)
motif_ad.write_h5ad(f"{out.replace('.seqdata', '')}.motif.h5ad")

There are 1621 significant motif matches.


  motif_count_df = motif_count_df.fillna(0)
  adata.obsm[key_obsm] = X_pca


## NMF

In [61]:
# normalize counts by sequence length
n_components = params["motif_analysis"]["n_components"]

In [62]:
# Run NMF
model = NMF(n_components=n_components, init="random", random_state=0)

# Obtain W and H matrices
W = pd.DataFrame(model.fit_transform(motif_count_df.values))  # seqs x factors
H = pd.DataFrame(model.components_)  # factors x motifs

# Format W and H matrices
factors = [f"factor_{i}" for i in range(n_components)]
W.index = sdata["_sequence"].values
W.columns = factors
H.index = factors
H.columns = sdata["_motif"].values

W.shape, H.shape

((2000, 10), (10, 3))

In [63]:
sdata["seq_scores"] = xr.DataArray(W.values, dims=["_sequence", "_factor"])
sdata["motif_loadings"] = xr.DataArray(H.values, dims=["_factor", "_motif"])
sdata.coords["_factor"] = factors

# Save

In [64]:
sd.to_zarr(sdata, out.replace(".seqdata", ".full.seqdata"))

In [65]:
metadata.to_csv(out.replace(".seqdata", ".metadata.csv"))

In [66]:
for k in ks:
    kmer_cnt = sdata[f"{k}mer_cnt"].values
    mmwrite(out.replace(".seqdata", f".{k}mer_cnt.mtx"), kmer_cnt)
    pd.DataFrame(sdata.coords[f"_{k}mer"].values).to_csv(out.replace(".seqdata", f".{k}mers.tsv.gz"), sep="\t", index=False, header=False, compression="gzip")
    pd.DataFrame(sdata["_sequence"].values).to_csv(out.replace(".seqdata", f".seqs.tsv.gz"), sep="\t", index=False, header=False, compression="gzip")

In [67]:
motif_cnt = sdata["motif_cnt"].values
mmwrite(out.replace(".seqdata", ".motif_cnt.mtx"), motif_cnt)
pd.DataFrame(sdata.coords["_motif"].values).to_csv(out.replace(".seqdata", ".motifs.tsv.gz"), sep="\t", index=False, header=False, compression="gzip")

# DONE!

---

In [None]:
# Get every path with .json extension in the splits directory
jsons = [os.path.join(splits, f) for f in os.listdir(splits) if f.endswith(".json")]
jsons

['/cellar/users/aklie/projects/ML4GLand/tutorials/data/splits/fold_1.json',
 '/cellar/users/aklie/projects/ML4GLand/tutorials/data/splits/fold_0.json',
 '/cellar/users/aklie/projects/ML4GLand/tutorials/data/splits/fold_4.json',
 '/cellar/users/aklie/projects/ML4GLand/tutorials/data/splits/fold_2.json',
 '/cellar/users/aklie/projects/ML4GLand/tutorials/data/splits/fold_3.json']

In [None]:
# Read json
splits_dict = {}
for j in jsons:
    name = j.split("/")[-1].split(".")[0]
    with open(os.path.join(splits, j), "r") as f:
        splits_dict[name] = json.load(f)

In [None]:
# Save this as one big json
with open(os.path.join(splits, "splits.json"), "w") as f:
    json.dump(splits_dict, f, indent=4)