In [1]:
import sys
sys.path.append("..")

In [3]:
import os
import shutil
import numpy as np

from fastai.vision.all import *
from pathlib import Path
from tqdm.auto import tqdm
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile

In [4]:
def download_and_unzip(url, extract_to='.'):
  """
  Takes an url to download and unzip the dataset
  """
  http_response = urlopen(url)
  zipfile = ZipFile(BytesIO(http_response.read()))
  zipfile.extractall(path=extract_to)


PROJ_PATH = Path.cwd().parent
DATA_PATH = PROJ_PATH/'data'
print(PROJ_PATH)
print(DATA_PATH)

/
/data


In [10]:
download_and_unzip(url='https://github.com/abin24/Magnetic-tile-defect-datasets./archive/refs/heads/master.zip',
                   extract_to=DATA_PATH)

extract_dir = DATA_PATH/'Magnetic-tile-defect-datasets.-master'

In [11]:
dataset_path = DATA_PATH/'MAGNETIC_TILE_SURFACE_DEFECTS'

if dataset_path.is_dir():
    shutil.rmtree(dataset_path)

if extract_dir.is_dir():
    shutil.move(extract_dir, dataset_path)
dataset_path.ls()


(#9) [Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/MT_Free'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/MT_Break'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/dataset.png'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/MT_Uneven'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/MT_Crack'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/README.md'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/MT_Fray'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/dataset.jpg'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/MT_Blowhole')]

In [12]:
dataset_path.ls()

(#9) [Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/MT_Free'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/MT_Break'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/dataset.png'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/MT_Uneven'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/MT_Crack'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/README.md'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/MT_Fray'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/dataset.jpg'),Path('/data/MAGNETIC_TILE_SURFACE_DEFECTS/MT_Blowhole')]

In [13]:
if (dataset_path/'MT_Free').exists():
    shutil.rmtree(dataset_path/'MT_Free')

In [14]:
classes = []

for ii in (dataset_path).ls():
    if ii.is_dir() and ii.stem.startswith('MT_'):
        classes.append(ii.stem)
classes

['MT_Break', 'MT_Uneven', 'MT_Crack', 'MT_Fray', 'MT_Blowhole']

In [15]:
classes_dict = {c:i+1 for i, c in enumerate(classes)}
classes_dict

{'MT_Break': 1, 'MT_Uneven': 2, 'MT_Crack': 3, 'MT_Fray': 4, 'MT_Blowhole': 5}

In [16]:
img_paths = [get_files(dataset_path/c, extensions='.jpg') for c in classes]
img_paths = [ii for sublist in img_paths for ii in sublist]
len(img_paths)

392

In [17]:
msk_paths = [get_files(dataset_path/c, extensions='.png') for c in classes]
msk_paths = [ii for sublist in msk_paths for ii in sublist]
len(msk_paths)

392

In [18]:
assert len(img_paths) == len(msk_paths)

img_dir_path = dataset_path/'images'
mask_dir_path = dataset_path/'masks'

for p in [img_dir_path, mask_dir_path]:
    p.mkdir(exist_ok=True)

In [19]:
np.random.seed(42)

for img_path, msk_path in tqdm(zip(img_paths, msk_paths), total=len(img_paths)):
    c = msk_path.parent.parent.stem
    msk = np.array(Image.open(msk_path))
    msk[msk>0] = 1 # binary segmenation: defect/defect-free

    new_img_path = img_dir_path/img_path.name
    new_mask_path = mask_dir_path/msk_path.name
    shutil.copyfile(img_path, new_img_path)
    Image.fromarray(msk).save(new_mask_path)

  0%|          | 0/392 [00:00<?, ?it/s]

In [20]:
assert len(get_files(img_dir_path, extensions='.jpg')) == len(get_files(mask_dir_path, extensions='.png'))

In [21]:
test_pct = 0.2

train_img_dir_path = dataset_path/'train_images'
train_mask_dir_path = dataset_path/'train_masks'
test_img_dir_path = dataset_path/'test_images'
test_mask_dir_path = dataset_path/'test_masks'

img_fpaths = get_files(img_dir_path, extensions='.jpg')
test_img_fpaths = random.sample(img_fpaths, int(test_pct*len(img_fpaths)))
train_img_fpaths = [fpath for fpath in img_fpaths if fpath not in test_img_fpaths]

for dir in [train_img_dir_path,
            train_mask_dir_path,
            test_img_dir_path,
            test_mask_dir_path]:
    dir.mkdir(exist_ok=True)

for img_fpath in test_img_fpaths:
    mask_fpath = mask_dir_path/f'{img_fpath.stem}.png'
    shutil.copy(img_fpath, test_img_dir_path)
    shutil.copy(mask_fpath, test_mask_dir_path)

for img_fpath in train_img_fpaths:
    mask_fpath = mask_dir_path/f'{img_fpath.stem}.png'
    shutil.copy(img_fpath, train_img_dir_path)
    shutil.copy(mask_fpath, train_mask_dir_path)


In [22]:
dirs_to_keep = [img_dir_path, mask_dir_path, train_img_dir_path, train_mask_dir_path, test_img_dir_path, test_mask_dir_path]

for ii in (dataset_path).ls():
    if ii not in dirs_to_keep:
        try: # if file
            os.remove(ii)
        except: # if directory
            shutil.rmtree(ii)