In [273]:
from typing import List, NamedTuple

import kfp
from kfp.components import func_to_container_op
from torch.utils.data import DataLoader

def download_data_from_minio(ip_address: str, port: int, access_key: str, secret_key: str, bucket_name: str, object_name: str):
    """Download a tar archive from a MinIO bucket and unpack it under ``/mnt/``.

    Args:
        ip_address: Host of the MinIO server.
        port: Port of the MinIO server.
        access_key: MinIO access key.
        secret_key: MinIO secret key.
        bucket_name: Bucket that holds the archive.
        object_name: Object key of the tar archive; also used as the local
            file name the archive is downloaded to (relative to the CWD).

    Side effects:
        Writes the archive to the current working directory, extracts it to
        ``/mnt/`` and prints every extracted file path plus the top-level
        listing of ``/mnt/``. Exits the process with status 1 when the bucket
        does not exist.
    """
    # Imports are local so the function stays self-contained when it is
    # turned into a pipeline component.
    import os
    import sys
    import tarfile
    from minio import Minio

    extract_dir = '/mnt/'

    # secure=False: the connection to MinIO is made without TLS.
    client = Minio(
        f"{ip_address}:{port}",
        access_key=access_key,
        secret_key=secret_key,
        secure=False
    )

    if not client.bucket_exists(bucket_name):
        print(f"{bucket_name} bucket does not exist.")
        sys.exit(1)

    print("Found the bucket")
    # Download data from MinIO bucket
    client.fget_object(
        bucket_name,
        object_name,
        object_name
    )
    print(f"Downloaded {object_name} from {bucket_name} bucket")

    # The context manager closes the archive even when extraction fails
    # (e.g. PermissionError on the target directory).
    # NOTE(review): extractall() trusts member paths inside the archive; only
    # use this with archives from a trusted bucket.
    with tarfile.open(object_name) as tar:
        tar.extractall(extract_dir)

    # Log what ended up on the volume: every file, then the top-level entries.
    for root, dirs, files in os.walk(extract_dir):
        for file in files:
            print(os.path.join(root, file))
    print(os.listdir(extract_dir))



In [166]:
#download_data_from_minio("185.47.227.233", '9000',"Eschercloud_CV_demo","urF2FbUWv85dYtjB", "voc2012-bucket", "VOCtrainval_11-May-2012.tar")

Found the bucket
Downloaded VOCtrainval_11-May-2012.tar from voc2012-bucket bucket


PermissionError: [Errno 13] Permission denied: '/mnt/VOCdevkit'

In [274]:

def preprocess_voc_dataset(path: str) -> List[str]:
    """Copy the VOC image/mask pairs found under ``path`` into two version folders.

    ``path`` is expected to contain ``JPEGImages/*.jpg`` and
    ``SegmentationClass/*.png``. For each of ``version1`` and ``version2`` a
    ``JPEGImages`` and ``SegmentationClass`` directory is created under
    ``path`` and every image/mask pair is re-saved there with a zero-padded
    running index as file name.

    Transforms are currently disabled: the original code overrode its
    transform list with ``[None, None]``, so both versions receive the
    unmodified images. That behaviour is kept here.

    Args:
        path: Root directory of the dataset.

    Returns:
        The two version directory paths as strings.
    """
    from pathlib import Path
    from torch.utils.data import Dataset
    from PIL import Image
    import os

    directory_path = '/mnt/'

    # Debug output: show what is currently on the mounted volume.
    print(os.listdir(directory_path))

    # Create preprocessed dataset directories
    versions = ['version1', 'version2']
    version_paths = [Path(path) / v for v in versions]
    for version_path in version_paths:
        (version_path / 'JPEGImages').mkdir(parents=True, exist_ok=True)
        (version_path / 'SegmentationClass').mkdir(parents=True, exist_ok=True)

    class VOC2012PreprocessedDataset(Dataset):
        """Image/mask pairs matched by file stem."""

        def __init__(self, path: str, transform):
            self.path = path
            self.transform = transform

            # Pair by stem instead of by sorted position: only images that
            # have a mask with the same stem are kept, so a missing mask can
            # neither shift the pairing nor cause an IndexError.
            masks_by_stem = {p.stem: p for p in Path(path).glob('SegmentationClass/*.png')}
            self.imgs_path = sorted(
                p for p in Path(path).glob('JPEGImages/*.jpg') if p.stem in masks_by_stem
            )
            self.masks_path = [masks_by_stem[p.stem] for p in self.imgs_path]

        def __len__(self):
            return len(self.imgs_path)

        def __getitem__(self, idx):
            img = Image.open(self.imgs_path[idx]).convert('RGB')
            mask = Image.open(self.masks_path[idx]).convert('P')

            if self.transform is not None:
                img = self.transform(img)
                mask = self.transform(mask)

            return img, mask

    # One (disabled) transform per output version.
    transforms_list = [None, None]

    # Debug output: volume contents right before the copy starts.
    print(os.listdir(directory_path))

    for version_path, transform in zip(version_paths, transforms_list):
        dataset = VOC2012PreprocessedDataset(path, transform)
        for j in range(len(dataset)):
            print('-------->', j)
            img, mask = dataset[j]
            img.save(version_path / 'JPEGImages' / f'{j:06}.jpg')
            mask.save(version_path / 'SegmentationClass' / f'{j:06}.png')

    return [str(v) for v in version_paths]



In [175]:

from typing import NamedTuple
def load_voc2012_segmentation_dataset(root_dir: str) -> NamedTuple('Outputs', [('train_loader', DataLoader), ('val_loader', DataLoader)]):
    """Build train and validation DataLoaders for a VOC2012 segmentation tree.

    Args:
        root_dir: Directory containing ``ImageSets/Segmentation/{train,val}.txt``,
            ``JPEGImages`` and ``SegmentationClass``.

    Returns:
        ``(train_loader, val_loader)``: the train loader uses batch size 2 with
        shuffling, the validation loader batch size 1 without shuffling.
    """
    from PIL import Image
    import torch
    from torch.utils.data import Dataset, DataLoader
    from torchvision.transforms import ToTensor
    import os

    class VOC2012SegmentationDataset(Dataset):
        """(image, label) pairs listed in an ImageSets/Segmentation split file."""

        def __init__(self, root_dir, split='train', transform=None, resize_shape=(375, 500)):
            self.root_dir = root_dir
            self.split = split
            self.transform = transform
            # Passed straight to PIL's resize(), which interprets it as (width, height).
            self.resize_shape = resize_shape
            self.to_tensor = ToTensor()

            # Debug output: show what is on the mounted volume.
            directory_path = '/mnt/'
            contents = os.listdir(directory_path)
            print(contents)

            # read list of image IDs
            with open(os.path.join(root_dir, 'ImageSets', 'Segmentation', f'{split}.txt'), 'r') as f:
                self.image_ids = [line.strip() for line in f.readlines()]

        def __len__(self):
            return len(self.image_ids)

        def __getitem__(self, idx):
            # load image and label
            image_path = os.path.join(self.root_dir, 'JPEGImages', f'{self.image_ids[idx]}.jpg')
            label_path = os.path.join(self.root_dir, 'SegmentationClass', f'{self.image_ids[idx]}.png')
            image = Image.open(image_path).convert('RGB')
            label = Image.open(label_path).convert('L')

            # resize image and label; the label uses nearest-neighbour so that
            # resizing never invents values that are not present in the mask
            image = image.resize(self.resize_shape)
            label = label.resize(self.resize_shape, Image.NEAREST)

            # apply transform if specified
            if self.transform is not None:
                image, label = self.transform(image, label)

            # convert to PyTorch tensor
            # NOTE(review): ToTensor rescales 8-bit input to [0, 1]; if the
            # consumer casts the label with .long(), class information is
            # lost. Confirm the intended label encoding.
            image = self.to_tensor(image)
            label = self.to_tensor(label)

            return image, label

    train_dataset = VOC2012SegmentationDataset(root_dir=root_dir, split='train')
    val_dataset = VOC2012SegmentationDataset(root_dir=root_dir, split='val')
    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

    return (train_loader, val_loader)


In [144]:
# Smoke test: build both loaders from the dataset tree at VOCdevkit/VOC2012/
# (relative to the notebook's working directory); the cell output is the
# returned (train_loader, val_loader) pair.
load_voc2012_segmentation_dataset('VOCdevkit/VOC2012/')

(<torch.utils.data.dataloader.DataLoader at 0x7f7d858558b0>,
 <torch.utils.data.dataloader.DataLoader at 0x7f7d7d91d760>)

In [275]:
def write_to_volume():
    """Write a marker file to /mnt/ and print every file path found below /mnt/."""
    import os

    with open("/mnt/file.txt", "w") as handle:
        handle.write("Hello world")

    # Walk the whole mount and echo each file with its full path.
    for folder, _subdirs, names in os.walk("/mnt/"):
        for name in names:
            print(os.path.join(folder, name))


In [276]:
def write_file_op():
    """Create (or overwrite) /mnt/file.txt with a fixed greeting."""
    greeting = 'Hello world!'
    with open('/mnt/file.txt', 'w') as target:
        target.write(greeting)
        
def read_file_op():
    """Print the full contents of /mnt/file.txt."""
    with open('/mnt/file.txt', 'r') as source:
        contents = source.read()
    print(contents)

In [277]:
import kfp
from kfp.components import create_component_from_func
from kfp import dsl
from typing import List
from kubernetes import client as k8s_client

# Define the components
# The download component must be defined in this cell: the pipeline below
# calls download_data_from_minio_op, and relying on a definition from another
# cell only works when that cell happens to have been run first.
download_data_from_minio_op = create_component_from_func(download_data_from_minio, base_image='python:3.8', packages_to_install=['minio'])
#preprocess_voc_dataset_op = create_component_from_func(preprocess_voc_dataset, base_image='pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime', packages_to_install=['torchvision', 'pillow'])
#data_loader_op = create_component_from_func(load_voc2012_segmentation_dataset,output_component_file='load_voc2012_segmentation_dataset.yaml',base_image='pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime')
write_to_volume_op = create_component_from_func(write_to_volume, base_image='python:3.8')
read_from_volume_op = create_component_from_func(read_file_op, base_image='python:3.8')
@dsl.pipeline(
    name='VOC2012 Preprocessing Pipeline test',
    description='A pipeline that preprocesses VOC2012 dataset'
)
def voc2012_preprocessing_pipeline_test(
    ip_address: str,
    port: int,
    access_key: str,
    secret_key: str,
    bucket_name: str,
    object_name: str,
    local_path: str,
    root_dir: str
) -> List[str]:
    """Download the dataset onto a shared volume, then list the volume contents.

    A 1Gi ReadWriteMany volume is created and mounted at /mnt in both steps.
    ``local_path`` and ``root_dir`` are accepted but not used by the steps
    that are currently wired in (preprocessing and data loading are not part
    of this test pipeline yet).
    """
    shared_volume_op = dsl.VolumeOp(
        name="shared-volume",
        resource_name="shared-volume",
        modes=["ReadWriteMany"],
        size="1Gi"
    )

    # Step 1: fetch and unpack the archive onto the shared volume.
    download_task = download_data_from_minio_op(
        ip_address, port, access_key, secret_key, bucket_name, object_name
    )
    download_task.set_display_name('Download Data from Minio')
    download_task.add_pvolumes({"/mnt": shared_volume_op.volume})

    # Step 2: runs after the download and prints what is on the volume.
    listing_task = write_to_volume_op()
    listing_task.after(download_task)
    listing_task.set_display_name('printing data in mnt directory')
    listing_task.add_pvolumes({"/mnt": shared_volume_op.volume})

    return None

import os

# Compile the pipeline
pipeline_func = voc2012_preprocessing_pipeline_test
pipeline_filename = pipeline_func.__name__ + '.yaml'
kfp.compiler.Compiler().compile(pipeline_func, pipeline_filename)

# Connect to the Kubeflow Pipeline and submit the pipeline for execution
client = kfp.Client()
experiment_name = 'Downloading data Pipeline'
experiment = client.create_experiment(experiment_name)
run_name = 'run'

# Credentials are read from the environment so they never end up in the
# notebook (or its saved outputs). Fail early with a clear message.
missing_env = [name for name in ('MINIO_ACCESS_KEY', 'MINIO_SECRET_KEY') if not os.environ.get(name)]
if missing_env:
    raise RuntimeError(f"Set {', '.join(missing_env)} before submitting the pipeline")

arguments = {
    'ip_address': "185.47.227.233",
    'port': '9000',
    'access_key': os.environ['MINIO_ACCESS_KEY'],
    'secret_key': os.environ['MINIO_SECRET_KEY'],
    'bucket_name': "voc2012-bucket",
    'object_name': "VOCtrainval_11-May-2012.tar",
    'local_path': "dataset/VOCtrainval_11-May-2012.tar",
    'root_dir': '/mnt/'
}

run = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments)


In [264]:
import kfp

# Connect to the Kubeflow Pipelines API
client = kfp.Client()

# Fetch the experiments returned by the API and delete each of them,
# printing the id of every experiment that is removed.
experiments = client.list_experiments()
for experiment in experiments.experiments or []:
    print(experiment.id)
    client.delete_experiment(experiment.id)


164eff9c-c2f3-4f89-afd1-1eb3bf49e720


In [265]:

import kfp

# Connect to the Kubeflow Pipeline client
client = kfp.Client()

# Fetch the runs returned by the API and delete each of them, printing the id
# of every run that is removed. The already-open client is reused instead of
# creating a new connection per run.
runs = client.list_runs()
for run in runs.runs or []:
    print(run.id)
    client.runs.delete_run(run.id)


95400384-d67d-421d-9187-4034ec69dcac


In [272]:
import kfp
import kfp.dsl as dsl
from kubernetes import client as k8s_client

@dsl.pipeline(name='Shared Volume Pipeline', description='A pipeline with two components sharing a volume')
def my_pipeline():
    """Two-step demo: one container writes /mnt/test.txt, the next one prints it.

    Both containers mount the same 1Gi ReadWriteMany volume at /mnt.
    """
    shared_volume_op = dsl.VolumeOp(
        name="shared-volume",
        resource_name="shared-volume",
        modes=["ReadWriteMany"],
        size="1Gi"
    )

    def shell_step(step_name, shell_command):
        # Run a shell one-liner in an ubuntu container with the shared volume at /mnt.
        return dsl.ContainerOp(
            name=step_name,
            image='ubuntu',
            command=['sh', '-c'],
            arguments=[shell_command],
            pvolumes={"/mnt": shared_volume_op.volume}
        )

    writer = shell_step('write-file', 'echo "Hello World!" > /mnt/test.txt')
    reader = shell_step('read-file', 'cat /mnt/test.txt')

    # The reader must start only after the file exists; it also always
    # re-pulls its image and never reuses a cached result.
    reader.after(writer)
    reader.container.set_image_pull_policy("Always")
    reader.execution_options.caching_strategy.max_cache_staleness = "P0D"

if __name__ == '__main__':
    # Compile the pipeline function into a YAML package on disk.
    kfp.compiler.Compiler().compile(my_pipeline, 'my_pipeline.yaml')
    # Connect with the client's default settings (no host is passed).
    client = kfp.Client()
    # Create the experiment and start a run from the compiled package.
    experiment = client.create_experiment('my_experiment')
    run = client.run_pipeline(experiment.id, 'my_run', 'my_pipeline.yaml')


In [271]:
!kubectl delete pod voc2012-preprocessing-pipeline-test-swzhp-3021110133

pod "voc2012-preprocessing-pipeline-test-swzhp-3021110133" deleted


In [None]:
!kubectl

In [5]:
from PIL import Image
import torch
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
import os
class VOC2012SegmentationDataset(Dataset):
    def __init__(self, root_dir, split='train', transform=None, resize_shape=(375, 500)):
        self.root_dir = root_dir
        self.split = split
        self.transform = transform
        self.resize_shape = resize_shape
        self.to_tensor = ToTensor()

        # read list of image IDs
        with open(os.path.join(root_dir, 'ImageSets', 'Segmentation', f'{split}.txt'), 'r') as f:
            self.image_ids = [line.strip() for line in f.readlines()]

    def __len__(self):
        return len(self.image_ids)

    def __getitem__(self, idx):
        # load image and label
        image_path = os.path.join(self.root_dir, 'JPEGImages', f'{self.image_ids[idx]}.jpg')
        label_path = os.path.join(self.root_dir, 'SegmentationClass', f'{self.image_ids[idx]}.png')
        image = Image.open(image_path).convert('RGB')
        label = Image.open(label_path).convert('L')

        # resize image and label
        image = image.resize(self.resize_shape)
        label = label.resize(self.resize_shape)

        # apply transform if specified
        if self.transform is not None:
            image, label = self.transform(image, label)

        # convert to PyTorch tensor
        image = self.to_tensor(image)
        label = self.to_tensor(label)

        return image, label


In [7]:
import kfp
from kfp.components import create_component_from_func
from kfp import dsl
from typing import NamedTuple

# Define download_data_from_minio component
# (python:3.9 base image; the minio client is pip-installed when the step starts)
download_data_from_minio_op = create_component_from_func(download_data_from_minio,
    base_image='python:3.9',
    packages_to_install=['minio']
)

# Define load_voc2012_segmentation_dataset component
# NOTE(review): the wrapped function returns DataLoader objects; confirm that
# these can actually be passed between pipeline steps as component outputs.
load_voc2012_segmentation_dataset_op = create_component_from_func(load_voc2012_segmentation_dataset,
    base_image='pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime',
    packages_to_install=['torchvision', 'pillow']
)

# Define pipeline
@dsl.pipeline(name='VOC2012 Segmentation Dataset Pipeline', description='An example Kubeflow pipeline')
def voc2012_segmentation_dataset_pipeline(ip_address: str, port: int, access_key: str, secret_key: str, bucket_name: str, object_name: str, local_path: str, root_dir: str):
    """Download the VOC2012 archive, build the data loaders, then echo the loader outputs.

    ``local_path`` is kept as a pipeline parameter for compatibility with
    existing run arguments, but it is not forwarded: the download component
    takes exactly six inputs (host, port, keys, bucket and object name).
    """
    # Define download_data_from_minio task
    download_data_from_minio_task = download_data_from_minio_op(
        ip_address, port, access_key, secret_key, bucket_name, object_name
    )

    # Define load_voc2012_segmentation_dataset task
    load_voc2012_segmentation_dataset_task = load_voc2012_segmentation_dataset_op(root_dir)

    # Set upstream dependency of load_voc2012_segmentation_dataset_task on download_data_from_minio_task
    load_voc2012_segmentation_dataset_task.after(download_data_from_minio_task)

    # Print the output of load_voc2012_segmentation_dataset_task
    # NOTE(review): this echoes the string form of the task's outputs mapping
    # as built at compile time; verify that this is the intended output.
    print_op = dsl.ContainerOp(
        name='print',
        image='alpine',
        command=['echo', str(load_voc2012_segmentation_dataset_task.outputs)]
    )

# Compile pipeline
# The package is written next to the notebook and named after the pipeline
# function (voc2012_segmentation_dataset_pipeline.yaml).
pipeline_func = voc2012_segmentation_dataset_pipeline
pipeline_filename = pipeline_func.__name__ + '.yaml'
kfp.compiler.Compiler().compile(pipeline_func, pipeline_filename)


In [None]:
import kfp
from kfp import dsl
from kfp.components import create_component_from_func

# Define pipeline input arguments
@dsl.pipeline(
    name='VOC2012 Segmentation Pipeline',
    description='A pipeline to download VOC2012 segmentation dataset and load it'
)
def voc2012_pipeline(
    ip_address: str,
    port: int,
    access_key: str,
    secret_key: str,
    bucket_name: str,
    object_name: str,
    local_path: str,
    root_dir: str
):
    """Download the VOC2012 archive from MinIO, then build the data loaders.

    ``local_path`` is kept as a pipeline parameter for compatibility with
    existing run arguments, but it is not forwarded: the download function
    takes exactly six inputs and produces no outputs, so the load step is
    ordered after the download with ``.after()`` instead of consuming an
    output from it.
    """
    # Define download_data_from_minio component and its task
    download_component = create_component_from_func(
        download_data_from_minio,
        base_image='python:3.9',
        packages_to_install=['minio']
    )
    download_task = download_component(
        ip_address,
        port,
        access_key,
        secret_key,
        bucket_name,
        object_name
    )

    # Define load_voc2012_segmentation_dataset component and its task
    load_component = create_component_from_func(
        load_voc2012_segmentation_dataset,
        base_image='pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime',
        packages_to_install=['torchvision', 'pillow']
    )
    load_task = load_component(root_dir)

    # Connect the components: loading starts only after the download finished.
    # NOTE(review): no volume is shared between the two steps here; confirm
    # how the load step is expected to see the downloaded files.
    load_task.after(download_task)

import os

# Credentials are read from the environment so they never end up in the
# notebook (or its saved outputs). Fail early with a clear message.
missing_env = [name for name in ('MINIO_ACCESS_KEY', 'MINIO_SECRET_KEY') if not os.environ.get(name)]
if missing_env:
    raise RuntimeError(f"Set {', '.join(missing_env)} before submitting the pipeline")

# Define the pipeline's argument values
arguments = {
    'ip_address': "185.47.227.233",
    'port': '9000',
    'access_key': os.environ['MINIO_ACCESS_KEY'],
    'secret_key': os.environ['MINIO_SECRET_KEY'],
    'bucket_name': "voc2012-bucket",
    'object_name': "VOCtrainval_11-May-2012.tar",
    'local_path': "dataset/VOCtrainval_11-May-2012.tar",
    'root_dir': 'VOCdevkit/VOC2012/'
}

# Compile and run the pipeline
kfp.compiler.Compiler().compile(voc2012_pipeline, 'voc2012_pipeline.zip')
client = kfp.Client()
run = client.create_run_from_pipeline_func(voc2012_pipeline, arguments=arguments)


In [6]:
""" Parts of the U-Net model """

import torch
import torch.nn as nn
import torch.nn.functional as F


class DoubleConv(nn.Module):
    """Two stacked 3x3 convolutions, each followed by batch norm and ReLU.

    The spatial size is preserved (padding=1). ``mid_channels`` is the width
    between the two convolutions and falls back to ``out_channels``.
    """

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super(DoubleConv, self).__init__()
        mid_channels = mid_channels or out_channels

        # (input width, output width) of the two conv stages, in order.
        stage_widths = [(in_channels, mid_channels), (mid_channels, out_channels)]
        layers = []
        for n_in, n_out in stage_widths:
            layers.append(nn.Conv2d(n_in, n_out, kernel_size=3, padding=1, bias=False))
            layers.append(nn.BatchNorm2d(n_out))
            layers.append(nn.ReLU(inplace=True))
        self.double_conv = nn.Sequential(*layers)

    def forward(self, x):
        return self.double_conv(x)


class Down(nn.Module):
    """Encoder stage: 2x2 max pooling followed by a DoubleConv."""

    def __init__(self, in_channels, out_channels):
        super(Down, self).__init__()
        stages = [nn.MaxPool2d(2), DoubleConv(in_channels, out_channels)]
        self.maxpool_conv = nn.Sequential(*stages)

    def forward(self, x):
        pooled_and_convolved = self.maxpool_conv(x)
        return pooled_and_convolved


class Up(nn.Module):
    """Decoder stage: upsample, pad to the skip tensor's size, concatenate, DoubleConv."""

    def __init__(self, in_channels, out_channels, bilinear=True):
        super(Up, self).__init__()
        half_channels = in_channels // 2

        if not bilinear:
            # Learned upsampling; the transposed conv already halves the channels.
            self.up = nn.ConvTranspose2d(in_channels, half_channels, kernel_size=2, stride=2)
            self.conv = DoubleConv(in_channels, out_channels)
        else:
            # Fixed bilinear upsampling; the DoubleConv reduces the channel count.
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
            self.conv = DoubleConv(in_channels, out_channels, half_channels)

    def forward(self, x1, x2):
        upsampled = self.up(x1)

        # Tensors are indexed as (N, C, H, W): dim 2 is height, dim 3 is width.
        gap_h = x2.shape[2] - upsampled.shape[2]
        gap_w = x2.shape[3] - upsampled.shape[3]
        left, top = gap_w // 2, gap_h // 2

        # F.pad takes (left, right, top, bottom) for the last two dimensions.
        upsampled = F.pad(upsampled, [left, gap_w - left, top, gap_h - top])

        # Skip connection first, upsampled features second.
        merged = torch.cat([x2, upsampled], dim=1)
        return self.conv(merged)


class OutConv(nn.Module):
    """Final 1x1 convolution that maps feature channels to per-class outputs."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        projected = self.conv(x)
        return projected

In [7]:
""" Full assembly of the parts to form the complete network """



class UNet(nn.Module):
    """U-Net: four encoder stages, four decoder stages with skip connections.

    Args:
        n_channels: Number of input channels.
        n_classes: Number of output channels (one per class).
        bilinear: Use bilinear upsampling instead of transposed convolutions
            in the decoder.
    """

    # Class-level defaults so the flags exist on every instance, including
    # ones created before use_checkpointing() is ever called.
    _checkpointing_enabled = False
    _checkpoint_nonreentrant = False

    def __init__(self, n_channels, n_classes, bilinear=False):
        super(UNet, self).__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.bilinear = bilinear

        self.inc = (DoubleConv(n_channels, 64))
        self.down1 = (Down(64, 128))
        self.down2 = (Down(128, 256))
        self.down3 = (Down(256, 512))
        # With bilinear upsampling the channel reduction happens in the
        # decoder's DoubleConv, so the bottleneck is built half as wide.
        factor = 2 if bilinear else 1
        self.down4 = (Down(512, 1024 // factor))
        self.up1 = (Up(1024, 512 // factor, bilinear))
        self.up2 = (Up(512, 256 // factor, bilinear))
        self.up3 = (Up(256, 128 // factor, bilinear))
        self.up4 = (Up(128, 64, bilinear))
        self.outc = (OutConv(64, n_classes))

    def _run_block(self, block, *inputs):
        """Call ``block(*inputs)``, through activation checkpointing when enabled."""
        if not (self._checkpointing_enabled and torch.is_grad_enabled()):
            return block(*inputs)

        from torch.utils.checkpoint import checkpoint

        if self._checkpoint_nonreentrant:
            return checkpoint(block, *inputs, use_reentrant=False)
        # Legacy (reentrant) checkpointing yields no gradients for a block
        # whose inputs do not require grad, so such a block runs normally.
        if not any(t.requires_grad for t in inputs):
            return block(*inputs)
        return checkpoint(block, *inputs)

    def forward(self, x):
        run = self._run_block
        x1 = run(self.inc, x)
        x2 = run(self.down1, x1)
        x3 = run(self.down2, x2)
        x4 = run(self.down3, x3)
        x5 = run(self.down4, x4)
        x = run(self.up1, x5, x4)
        x = run(self.up2, x, x3)
        x = run(self.up3, x, x2)
        x = run(self.up4, x, x1)
        logits = run(self.outc, x)
        return logits

    def use_checkpointing(self):
        """Enable activation checkpointing for every stage of the network.

        The sub-modules stay registered as they are (parameters and
        state_dict keys do not change); only the way ``forward`` calls them
        changes. Previously this method called the ``torch.utils.checkpoint``
        module as if it were a function, which raises TypeError.

        NOTE(review): checkpointing recomputes each stage's forward pass
        during backward; confirm this is acceptable for the BatchNorm layers
        used in the stages.
        """
        import inspect
        from torch.utils.checkpoint import checkpoint

        # Prefer the non-reentrant implementation when this torch version offers it.
        self._checkpoint_nonreentrant = 'use_reentrant' in inspect.signature(checkpoint).parameters
        self._checkpointing_enabled = True

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

def train(model, device, train_loader, optimizer, epoch):
    """Run one training epoch with cross-entropy loss.

    Each batch's target has its dimension 1 squeezed and is cast to long
    before the loss is computed. A progress line is printed every 10th batch
    (starting with the first).
    """
    criterion = nn.CrossEntropyLoss()
    model.train()

    for step, batch in enumerate(train_loader):
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, labels.squeeze(dim=1).long())
        loss.backward()
        optimizer.step()

        if step % 10 != 0:
            continue
        samples_seen = step * len(inputs)
        percent_done = 100. * step / len(train_loader)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, samples_seen, len(train_loader.dataset),
            percent_done, loss.item()))

def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += nn.CrossEntropyLoss()(output, target.squeeze(dim=1).long()).item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

def main():
    """Train a 21-class U-Net on VOC2012 for 10 epochs and save its weights.

    Uses the GPU when one is available. The validation loader is built but
    evaluation is currently switched off. The weights are written to
    ``unet_model.pt`` in the working directory.
    """
    # Pick the device (GPU if available)
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda') if use_cuda else torch.device('cpu')

    # Datasets and loaders
    voc_root = 'VOCdevkit/VOC2012/'
    train_dataset = VOC2012SegmentationDataset(root_dir=voc_root, split='train')
    val_dataset = VOC2012SegmentationDataset(root_dir=voc_root, split='val')
    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False)

    # Model and optimizer
    model = UNet(n_channels=3, n_classes=21)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop; epochs are numbered 1..10.
    # Evaluation via test(model, device, val_loader) is currently disabled.
    first_epoch, last_epoch = 1, 10
    for epoch in range(first_epoch, last_epoch + 1):
        train(model, device, train_loader, optimizer, epoch)

    # Persist only the learned parameters.
    weights = model.state_dict()
    torch.save(weights, "unet_model.pt")

# Start training only when this code runs as the main program
# (which includes running the cell in a notebook).
if __name__ == '__main__':
    main()




KeyboardInterrupt: 

In [None]:
@dsl.component
def prepare_data_op(data_path: str, train_val_split_ratio: float) -> NamedTuple('Outputs', [('train_data', str), ('val_data', str)]):
    """Prepare the dataset for training (unfinished).

    The visible body only builds a torchvision transform pipeline; the cell
    ends before ``data_path`` or ``train_val_split_ratio`` are used and
    before the declared ``train_data`` / ``val_data`` outputs are returned.

    NOTE(review): ``data_loader`` is imported as a separate module here;
    confirm it is available inside the component's image.
    """
    import torch.utils.data as data
    from torchvision import transforms
    from data_loader import VOC2012SegmentationDataset
    
    # Define the data transforms: resize to 256x256, convert to a tensor, then
    # normalize each of the three channels with the given mean and std.
    transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

In [22]:
import kfp
from kfp import dsl
from kfp.components import func_to_container_op
from kfp.components import InputPath, OutputPath

# Define the component for our training function
@func_to_container_op
def train_op(data_path: InputPath(str), model_path: OutputPath(str)):
    """Train a 21-class U-Net on a VOC2012 tree for 10 epochs and save its weights.

    Everything the step needs (dataset class, model, training loop) is defined
    inside the function so that it is self-contained when packaged as a
    container component.

    Args:
        data_path: Root of the dataset; must contain
            ``ImageSets/Segmentation/train.txt``, ``JPEGImages`` and
            ``SegmentationClass``.
        model_path: File the trained ``state_dict`` is written to.

    NOTE(review): ``data_path`` is declared as ``InputPath`` while the
    pipeline passes a plain directory string; confirm the step really
    receives a directory. Also confirm that the image used for this component
    provides torch, torchvision and pillow (no base image is specified here).
    """
    import os

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from PIL import Image
    from torch.utils.data import DataLoader, Dataset
    from torchvision.transforms import ToTensor

    class VOC2012SegmentationDataset(Dataset):
        """(image, label) pairs listed in an ImageSets/Segmentation split file."""

        def __init__(self, root_dir, split='train', transform=None, resize_shape=(375, 500)):
            self.root_dir = root_dir
            self.split = split
            self.transform = transform
            # Passed straight to PIL's resize(), which interprets it as (width, height).
            self.resize_shape = resize_shape
            self.to_tensor = ToTensor()

            # read list of image IDs
            with open(os.path.join(root_dir, 'ImageSets', 'Segmentation', f'{split}.txt'), 'r') as f:
                self.image_ids = [line.strip() for line in f.readlines()]

        def __len__(self):
            return len(self.image_ids)

        def __getitem__(self, idx):
            # load image and label
            image_path = os.path.join(self.root_dir, 'JPEGImages', f'{self.image_ids[idx]}.jpg')
            label_path = os.path.join(self.root_dir, 'SegmentationClass', f'{self.image_ids[idx]}.png')
            image = Image.open(image_path).convert('RGB')
            label = Image.open(label_path).convert('L')

            # resize image and label; the label uses nearest-neighbour so that
            # resizing never invents values that are not present in the mask
            image = image.resize(self.resize_shape)
            label = label.resize(self.resize_shape, Image.NEAREST)

            # apply transform if specified
            if self.transform is not None:
                image, label = self.transform(image, label)

            # convert to PyTorch tensor
            # NOTE(review): ToTensor rescales 8-bit input to [0, 1] and the
            # training loop casts the label with .long(); confirm the
            # intended label encoding.
            image = self.to_tensor(image)
            label = self.to_tensor(label)

            return image, label

    # ---- Parts of the U-Net model ----

    class DoubleConv(nn.Module):
        """(convolution => [BN] => ReLU) * 2"""

        def __init__(self, in_channels, out_channels, mid_channels=None):
            super().__init__()
            if not mid_channels:
                mid_channels = out_channels
            self.double_conv = nn.Sequential(
                nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
                nn.BatchNorm2d(mid_channels),
                nn.ReLU(inplace=True),
                nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True)
            )

        def forward(self, x):
            return self.double_conv(x)

    class Down(nn.Module):
        """Downscaling with maxpool then double conv"""

        def __init__(self, in_channels, out_channels):
            super().__init__()
            self.maxpool_conv = nn.Sequential(
                nn.MaxPool2d(2),
                DoubleConv(in_channels, out_channels)
            )

        def forward(self, x):
            return self.maxpool_conv(x)

    class Up(nn.Module):
        """Upscaling then double conv"""

        def __init__(self, in_channels, out_channels, bilinear=True):
            super().__init__()

            # if bilinear, use the normal convolutions to reduce the number of channels
            if bilinear:
                self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
                self.conv = DoubleConv(in_channels, out_channels, in_channels // 2)
            else:
                self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
                self.conv = DoubleConv(in_channels, out_channels)

        def forward(self, x1, x2):
            x1 = self.up(x1)
            # tensors are indexed as (N, C, H, W): dim 2 is height, dim 3 is width
            diffY = x2.size()[2] - x1.size()[2]
            diffX = x2.size()[3] - x1.size()[3]

            # pad the upsampled tensor to the skip tensor's size
            # (F.pad order: left, right, top, bottom)
            x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2,
                            diffY // 2, diffY - diffY // 2])
            # if you have padding issues, see
            # https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
            # https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd
            x = torch.cat([x2, x1], dim=1)
            return self.conv(x)

    class OutConv(nn.Module):
        """Final 1x1 convolution producing one channel per class."""

        def __init__(self, in_channels, out_channels):
            super(OutConv, self).__init__()
            self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

        def forward(self, x):
            return self.conv(x)

    # ---- Full assembly of the parts to form the complete network ----

    class UNet(nn.Module):
        """U-Net: four encoder stages, four decoder stages with skip connections."""

        # Class-level defaults so the flags exist before use_checkpointing() is called.
        _checkpointing_enabled = False
        _checkpoint_nonreentrant = False

        def __init__(self, n_channels, n_classes, bilinear=False):
            super(UNet, self).__init__()
            self.n_channels = n_channels
            self.n_classes = n_classes
            self.bilinear = bilinear

            self.inc = (DoubleConv(n_channels, 64))
            self.down1 = (Down(64, 128))
            self.down2 = (Down(128, 256))
            self.down3 = (Down(256, 512))
            factor = 2 if bilinear else 1
            self.down4 = (Down(512, 1024 // factor))
            self.up1 = (Up(1024, 512 // factor, bilinear))
            self.up2 = (Up(512, 256 // factor, bilinear))
            self.up3 = (Up(256, 128 // factor, bilinear))
            self.up4 = (Up(128, 64, bilinear))
            self.outc = (OutConv(64, n_classes))

        def _run_block(self, block, *inputs):
            """Call ``block(*inputs)``, through activation checkpointing when enabled."""
            if not (self._checkpointing_enabled and torch.is_grad_enabled()):
                return block(*inputs)

            from torch.utils.checkpoint import checkpoint

            if self._checkpoint_nonreentrant:
                return checkpoint(block, *inputs, use_reentrant=False)
            # Legacy (reentrant) checkpointing yields no gradients for a block
            # whose inputs do not require grad, so such a block runs normally.
            if not any(t.requires_grad for t in inputs):
                return block(*inputs)
            return checkpoint(block, *inputs)

        def forward(self, x):
            run = self._run_block
            x1 = run(self.inc, x)
            x2 = run(self.down1, x1)
            x3 = run(self.down2, x2)
            x4 = run(self.down3, x3)
            x5 = run(self.down4, x4)
            x = run(self.up1, x5, x4)
            x = run(self.up2, x, x3)
            x = run(self.up3, x, x2)
            x = run(self.up4, x, x1)
            logits = run(self.outc, x)
            return logits

        def use_checkpointing(self):
            """Enable activation checkpointing without replacing the sub-modules.

            Previously this called the ``torch.utils.checkpoint`` module as if
            it were a function, which raises TypeError.
            """
            import inspect
            from torch.utils.checkpoint import checkpoint

            # Prefer the non-reentrant implementation when this torch version offers it.
            self._checkpoint_nonreentrant = 'use_reentrant' in inspect.signature(checkpoint).parameters
            self._checkpointing_enabled = True

    def train(model, device, train_loader, optimizer, epoch):
        """Run one training epoch, printing a progress line every 10th batch."""
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = nn.CrossEntropyLoss()(output, target.squeeze(dim=1).long())
            loss.backward()
            optimizer.step()
            if batch_idx % 10 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item()))

    # set up device (GPU if available)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # set up the training data loader (no evaluation happens in this step,
    # so no validation loader is built)
    train_dataset = VOC2012SegmentationDataset(root_dir=data_path, split='train')
    train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

    # set up model and optimizer
    model = UNet(n_channels=3, n_classes=21).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # training loop; epochs are numbered 1..10
    for epoch in range(1, 11):
        train(model, device, train_loader, optimizer, epoch)

    # save model
    torch.save(model.state_dict(), model_path)
# Define the pipeline
@dsl.pipeline(
    name='UNet Training Pipeline',
    description='Pipeline to train UNet for image segmentation'
)
def unet_training_pipeline(data_path: str, model_path: str):
    """Single-step pipeline that runs the U-Net training component on ``data_path``.

    ``model_path`` is accepted as a pipeline parameter but is not passed to
    the training step.
    """
    training_task = train_op(data_path)

# Compile the pipeline
# The package is written next to the notebook and named after the pipeline
# function (unet_training_pipeline.pipeline.zip).
pipeline_func = unet_training_pipeline
pipeline_filename = pipeline_func.__name__ + '.pipeline.zip'
kfp.compiler.Compiler().compile(pipeline_func, pipeline_filename)

# Connect to the Kubeflow Pipeline and submit the pipeline for execution
# (the client is created with its default settings; no host is passed)
client = kfp.Client()
experiment_name = 'UNet Training Pipeline'
experiment = client.create_experiment(experiment_name)
run_name = pipeline_func.__name__ + ' run'
# Start a run of the compiled package with the pipeline parameters below.
run = client.run_pipeline(experiment.id, run_name, pipeline_filename, params={
    'data_path': 'VOCdevkit/VOC2012/',
    'model_path': 'models/unet_model.pt'
})
