In [1]:
import numpy as np
from util import read_json, read_regions
import os

In [2]:
# Directory of the input dataset; samples whose contact-map patch contains
# NaNs are dropped further down in this notebook.
data_folder = ".data/dlem_training_w_nan"

In [3]:
# Load the dataset metadata; later cells read the array shapes from it.
source_meta_path = os.path.join(data_folder, "meta.json")
meta = read_json(source_meta_path)

In [4]:
# Load the genomic regions, passing the resolution and patch size recorded in
# the metadata.
regions = read_regions(
    os.path.join(data_folder, "sequences.bed"),
    meta["RES"],
    meta["PATCH_SIZE"],
)

In [5]:
# Open the contact-map patches read-only as a (SAMPLE_NUM, PATCH_LEN) float32
# memory map; nothing is loaded into RAM until the array is accessed.
patches = np.memmap(
    os.path.join(data_folder, "contactmaps.dat"),
    dtype="float32",
    mode="r",
    shape=(meta["SAMPLE_NUM"], meta["PATCH_LEN"]),
)

In [6]:
# Open the per-sample feature tracks read-only as a
# (SAMPLE_NUM, FEA_DIM, PATCH_DIM) float32 memory map.
features = np.memmap(
    os.path.join(data_folder, "features.dat"),
    dtype="float32",
    mode="r",
    shape=(meta["SAMPLE_NUM"], meta["FEA_DIM"], meta["PATCH_DIM"]),
)

In [7]:
# Sanity check: (SAMPLE_NUM, PATCH_LEN), i.e. the shape the memmap was opened with.
patches.shape

(5841, 6083)

In [11]:
# Flag every sample (row) whose contact-map patch contains at least one NaN,
# then keep the positions of the rows that are NaN-free.
# `good_samples` is a 1-tuple holding the array of row indices.
row_has_nan = np.isnan(patches).any(axis=1)
good_samples = np.nonzero(~row_has_nan)

In [9]:
# Sanity check: (SAMPLE_NUM, FEA_DIM, PATCH_DIM), i.e. the shape the memmap was opened with.
features.shape

(5841, 4, 120)

In [10]:
# Check whether any feature value is NaN.
np.isnan(features).any()

False

In [12]:
# Keep only the region rows that belong to NaN-free samples.
# NOTE: this rebinds `regions`, so the cell is not idempotent -- re-run the
# cell that loads `regions` before running this one again.
(kept_rows,) = good_samples
regions = regions.iloc[kept_rows]

In [14]:
# Copy both memory maps into in-memory arrays and keep only the NaN-free samples.
good_patches, good_features = (
    np.array(source)[good_samples] for source in (patches, features)
)

In [17]:
# Row count after filtering; should equal the first dimension of
# good_patches and good_features.
regions.shape

(5555, 4)

In [15]:
# Filtered contact-map patches: (kept samples, PATCH_LEN).
good_patches.shape

(5555, 6083)

In [16]:
# Filtered features: (kept samples, FEA_DIM, PATCH_DIM).
good_features.shape

(5555, 4, 120)

In [20]:
# Output directory for the NaN-filtered copy of the dataset.
new_data_folder = ".data/dlem_training"

In [19]:
# Create the output directory for the filtered dataset.
# exist_ok=True keeps this cell re-runnable: without it os.makedirs raises
# FileExistsError as soon as the directory exists from a previous run.
os.makedirs(new_data_folder, exist_ok=True)

In [21]:
# Create (or overwrite) the output contact-map file as a writable float32
# memory map with the same shape as the filtered patches.
file_patches = np.memmap(
    os.path.join(new_data_folder, "contactmaps.dat"),
    dtype="float32",
    mode="w+",
    shape=good_patches.shape,
)

In [22]:
# Copy the filtered patches into the memory-mapped output file.
file_patches[...] = good_patches

In [23]:
# Write any pending changes in the memory map out to the file on disk.
file_patches.flush()

In [24]:
# Create (or overwrite) the output feature file as a writable float32
# memory map with the same shape as the filtered features.
file_features = np.memmap(
    os.path.join(new_data_folder, "features.dat"),
    dtype="float32",
    mode="w+",
    shape=good_features.shape,
)

In [25]:
# Copy the filtered features into the memory-mapped output file ...
file_features[...] = good_features
# ... and write the pending changes out to disk.
file_features.flush()

In [28]:
# Save the filtered regions as a tab-separated file without a header row or
# index column.
regions.to_csv(
    os.path.join(new_data_folder, "sequences.bed"),
    sep="\t",
    header=False,
    index=False,
)

In [30]:
# Features, patches and regions must all describe the same number of samples.
assert file_patches.shape[0] == file_features.shape[0] == regions.shape[0]

In [31]:
# Number of samples kept after NaN filtering.
file_features.shape[0]

5555

In [32]:
# Record the reduced sample count so the metadata matches the filtered files.
# NOTE(review): this mutates `meta` in place. The cells that open the source
# memmaps take their shape from meta['SAMPLE_NUM'], so re-running them after
# this cell would apply the filtered count to the unfiltered input files.
meta["SAMPLE_NUM"] = file_features.shape[0]

In [33]:
# Inspect the updated metadata before it is written to the output directory.
meta

{'RES': 10000,
 'PATCH_SIZE': 120,
 'BW_FILES': ['H1_CTCF_ENCFF038RVZ.bigWig', 'H1_DNAse_ENCFF232GUZ.bigWig'],
 'SEQ_FEA_FILES': ['sequence_features_pos_10000.tsv',
  'sequence_features_neg_10000.tsv'],
 'START_DIAG': 3,
 'STOP_DIAG': 80,
 'PATCH_DIM': 120,
 'PATCH_LEN': 6083,
 'FEA_DIM': 4,
 'REGION_FILE': 'sequences.bed',
 'COOLER_FILE': '4DNFI9GMP2J8.mcool',
 'SAMPLE_NUM': 5555}

In [34]:
import json

# Write the updated metadata next to the filtered data files as indented JSON.
new_meta_path = os.path.join(new_data_folder, "meta.json")
with open(new_meta_path, "w") as meta_file:
    meta_file.write(json.dumps(meta, indent=4))