In [11]:
import os
from abc import ABC
from datetime import datetime
from random import randint
from typing import Optional

import numpy as np


class DataPreparation(ABC):
    """Create cube slices to train model."""

    def __init__(
        self,
        seis_filepath: str,
        mask_filepath: str,
        dataset_size: int,
        cube_size: tuple,
        dataset_type: str = 'train',
        threshold_percent: Optional[float] = None,
        threshold_value: Optional[int] = None,
    ):
        self.seis_filepath = seis_filepath
        self.mask_filepath = mask_filepath
        self.dataset_size = dataset_size
        self.cube_size = cube_size
        self.dataset_type = dataset_type
        assert dataset_type in ['train', 'test', 'eval']
        if threshold_percent is None and threshold_value is None:
            raise TypeError(
                "You should specify threshold_percent or " +
                "threshold_value"
            )
        if threshold_percent <= 0 or threshold_percent >= 1:
            raise TypeError("threshold_percent should be between 0 and 1")
        self.threshold_percent = threshold_percent
        self.threshold_value = self._get_threshold_value(threshold_value)

    def _get_threshold_value(
        self,
        threshold_value: int,
    ) -> int:
        """
        Specify threshold value.

        If we get threshold value then use it.
        If doesn't then we calculate using cube cube_size and threshold_percent
        Args:
            threshold_value: amount of pixel that are fault

        Returns:
            min value of amount of fault pixel in cube
        """
        if threshold_value is None:
            return ((self.cube_size[0] * self.cube_size[1] * self.cube_size[2]) * self.threshold_percent)
        return threshold_value

    def _get_slice_coordinates(
        self,
        faultfile,
        threshold_value: int = None,
        is_empty:bool = False,
    ) -> tuple[int, int, int]:
        """Find сoordinates of cube by threshold.
        
        Get random XYZ point and get cube based on point and cube_size
        Check amount fault pixel and return coordinate if it contains equal or
        higher than threshold
        """
        count = 0
        if threshold_value is None:
            threshold_value = 0
        Z, XL, IL = faultfile['arr_0'].shape
        while True:
            count += 1
            zline = randint(0, Z - cube_size[2])
            iline = randint(0, IL - cube_size[0])
            xline = randint(0, XL - cube_size[1])
            fault_slice = faultfile['arr_0'][
                zline: zline + cube_size[2],
                iline: iline + cube_size[0],
                xline: xline + cube_size[1],
            ]
            if is_empty:
                if fault_slice.sum() == threshold_value:
                    return iline, xline, zline
            else:
                if fault_slice.sum() > threshold_value:
                    return iline, xline, zline
            if count >= 40:
                count = 0
                print(
                    'Very high density value. Reduce by 10%',
                    threshold_value,
                    round(threshold_value/(self.cube_size[0] * self.cube_size[1] * self.cube_size[2]), 3)
                )
                threshold_value = int(0.9 * threshold_value)
    
    def _save_cube(
        self,
        index: int,
        iline: int,
        xline: int,
        zline: int,
        maskfile,
        seisfile,
    ):
        np.savez(f'data_128\\{self.dataset_type}\\seis\\seis_{index}.npz', seisfile['arr_0'][
                zline: zline + cube_size[2],
                iline: iline + cube_size[0],
                xline: xline + cube_size[1],
            ])
        np.savez(f'data_128\\{self.dataset_type}\\fault\\fault_{index}.npz', maskfile['arr_0'][
            zline: zline + cube_size[2],
            iline: iline + cube_size[0],
            xline: xline + cube_size[1],
        ])
    
    def _create_main_cubes(
        self,
        maskfile,
        seisfile,
    ):
        for i in range(self.dataset_size):
            print(i)
            iline, xline, zline = self._get_slice_coordinates(
                maskfile,
                self.threshold_value,
            )
            self._save_cube(i, iline, xline, zline, maskfile, seisfile)
    
    def _create_zeros_cubes(self):
        for index in range(int(self.dataset_size * 1.25), int(self.dataset_size * 1.5)):
            print(index)
            
            np.savez(
                f'data_128\\{self.dataset_type}\\seis\\seis_{index}.npz', 
                np.zeros((64,64,64)),
            )
            np.savez(
                f'data_128\\{self.dataset_type}\\fault\\fault_{index}.npz', 
                np.zeros((64,64,64)),
            )
    
    def _create_empty_cubes(
        self,
        maskfile,
        seisfile,
    ):
        for i in range(self.dataset_size, int(self.dataset_size * 1.25)):
            print(i)
            iline, xline, zline = self._get_slice_coordinates(
                maskfile,
                threshold_value=10,
            )
            self._save_cube(i, iline, xline, zline, maskfile, seisfile)
    
    def create_dataset(self) -> str:
        """
        Create temp folder with dataset cubes.

        Returns: path to dataset
        """
        base_path = '..\\..\\'
        if not os.path.isdir(base_path + 'data_128'):
            os.makedirs(base_path + 'data_128')

        with np.load(self.mask_filepath, mmap_mode='r') as maskfile, \
            np.load(self.seis_filepath, mmap_mode='r') as seisfile:
            self._create_main_cubes(maskfile, seisfile)
            self._create_empty_cubes(maskfile, seisfile)
            # self._create_zeros_cubes()
            print('Finding end')

In [8]:
seis_path = 'data/seistrain4.npz'
fault_mask_path = 'data/faulttrain4.npz'
batch_size=1

In [4]:
with np.load(seis_path, mmap_mode='r+') as seis_file:
    print(seis_file['arr_0'].shape)

(100, 3174, 1537)


In [5]:
1+1

2

In [12]:
DataPreparation(
    seis_filepath=seis_path,
    mask_filepath=fault_mask_path,
    dataset_size=5,
    cube_size=(128, 64, 32),
    dataset_type='test',
    threshold_percent=0.1,
).create_dataset()

0
1
2
Very high density value. Reduce by 10% 26214.4 0.1
3
4


KeyboardInterrupt: 

In [60]:
DataPreparation(
    seis_filepath=seis_path,
    mask_filepath=fault_mask_path,
    dataset_size=20,
    cube_size=(128, 64, 32),
    dataset_type='eval',
    threshold_percent=0.1,
).create_dataset()

20
21
22
23
24
Finding end


In [None]:
train_dataset = SegyDataset(
    segy_filepath=seis_path,
    fault_filepath=fault_mask_path,
    fault_coords_filepath=coords_mask_path,
)
eval_dataset = SegyDataset(
    segy_filepath=seis_path,
    fault_filepath=fault_mask_path,
    fault_coords_filepath=coords_mask_path,
)
trainloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    # num_workers=8,
)
evalloader = DataLoader(
    eval_dataset,
    batch_size=batch_size,
    # num_workers=8,
)

In [18]:
cube_size= 64
def _get_slice_coordinates(
    faultfile,
) -> tuple[int, int, int]:
    """Find сoordinates of cube by threshold.
    Get random XYZ point and get cube based on point and cube_size
    Check amount fault pixel and return coordinate if it contains equal or
    higher than threshold
    """
    count = 0
    print(faultfile['arr_0'].shape)
    threshold_value = ((64**3) * 0.1)
    IL, XL, Z = faultfile['arr_0'].shape
    while True:
        count += 1
        iline = randint(0, IL - cube_size)
        xline = randint(0, XL - cube_size)
        zline = randint(0, Z - cube_size)
        fault_slice = faultfile['arr_0'][
            iline: iline + cube_size,
            xline: xline + cube_size,
            zline: zline + cube_size
        ]
        if fault_slice.sum() > ((64**3) * 0.1):
            return iline, xline, zline
        if count >= 40:
            print(
                'Very high density value. Reduce by 10%',
                threshold_value,
                round(threshold_value/(cube_size ** 3), 3)
            )
            threshold_value = int(0.9 * threshold_value)


with np.load('data/faulttrain1.npz', mmap_mode='r') as maskfile, \
    np.load('data/seistrain1.npz', mmap_mode='r') as seisfile:
    for i in range(2):
        print(i)
        iline, xline, zline = _get_slice_coordinates(maskfile)
        np.savez(f'data\\train\\seis\\seis_{i}.npz', seisfile['arr_0'][
            iline: iline + cube_size,
            xline: xline + cube_size,
            zline: zline + cube_size
        ])
        np.savez(f'data\\train\\fault\\fault_{i}.npz', maskfile['arr_0'][
            iline: iline + cube_size,
            xline: xline + cube_size,
            zline: zline + cube_size
        ])

0
(100, 3174, 1537)
[[[ 0.12092286 -0.02960199 -0.16105449 ...  0.19605428  0.17255622
    0.07053685]
  [ 0.16239929  0.00104181 -0.16283095 ...  0.22003525  0.22493064
    0.10861802]
  [ 0.2269299   0.09026837 -0.10260248 ...  0.20660025  0.22827423
    0.11697114]
  ...
  [-0.19143921 -0.04417092  0.06910896 ... -0.08293355 -0.08418304
   -0.02764914]
  [-0.20026201 -0.07672679  0.01140019 ... -0.07971835 -0.07892215
   -0.03095816]
  [-0.19411951 -0.04312687  0.03207748 ... -0.03399706 -0.05147021
   -0.02329966]]

 [[ 0.08639401 -0.00497409 -0.10770768 ...  0.15997791  0.17513859
    0.09571868]
  [ 0.12146825  0.01541093 -0.11080289 ...  0.20167327  0.2182762
    0.10920864]
  [ 0.17860806  0.06749946 -0.10580873 ...  0.21828514  0.2479831
    0.1302591 ]
  ...
  [-0.16161895 -0.0374183   0.03241472 ...  0.05851654  0.04895186
    0.02120948]
  [-0.23717839 -0.12829393 -0.02147569 ...  0.01768734  0.00581464
    0.00531198]
  [-0.2814405  -0.15717071 -0.01807653 ... -0.00513782 

In [39]:
os.makedirs('data_1/train/seis')

FileExistsError: [WinError 183] Невозможно создать файл, так как он уже существует: 'data_1/train/seis'

In [None]:
os.makedirs('data_1/train/seis')