In [1]:
import copy
import os
import random
import shutil
from collections import defaultdict
from urllib.request import urlretrieve

import albumentations as A

import cv2
import matplotlib.pyplot as plt
import torch
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.optim
import torchvision.models as models
from albumentations.pytorch import ToTensorV2
from clearml import Dataset as ClearMLDataset, Logger, OutputModel, Task
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
# Seed used as `random_state` for the train/val/test split.
# NOTE(review): in the visible cells SEED is not applied to `random` or torch,
# so augmentation and weight initialisation appear unseeded - confirm intent.
SEED = 666
# Class index -> label string. The label string is also the name of the folder
# that holds the images of that class (labels are derived from the parent
# folder name of each image path).
IDX2LABEL = {
    0: 'n02111500-Great_Pyrenees',
    1: 'n02099712-Labrador_retriever',
    2: 'n02093754-Border_terrier',
    3: 'n02096294-Australian_terrier',
    4: 'n02088632-bluetick',
    5: 'n02104365-schipperke',
    6: 'n02108422-bull_mastiff',
    7: 'n02115641-dingo',
    8: 'n02108551-Tibetan_mastiff',
    9: 'n02096437-Dandie_Dinmont',
    10: 'n02108915-French_bulldog',
    11: 'n02102177-Welsh_springer_spaniel',
    12: 'n02092002-Scottish_deerhound',
    13: 'n02099601-golden_retriever',
    14: 'n02111277-Newfoundland',
    15: 'n02091134-whippet',
}
# Inverse mapping: label string -> class index.
LABEL2IDX = {v: k for k, v in IDX2LABEL.items()}
# Number of output classes; used to size the classifier head.
NUM_CLASSES = len(IDX2LABEL)

In [3]:
class AnimalsDataset(Dataset):
    """Image classification dataset backed by a list of image file paths.

    The class label of every sample is derived from the name of the folder
    that directly contains the image, looked up in ``LABEL2IDX``.

    Args:
        images_filepaths: Sequence of paths to image files.
        transform: Optional callable invoked as ``transform(image=image)`` that
            returns a mapping with an ``"image"`` entry (albumentations style).
    """

    def __init__(self, images_filepaths, transform=None):
        self.images_filepaths = images_filepaths
        self.transform = transform

    def __len__(self):
        """Return the number of image paths in the dataset."""
        return len(self.images_filepaths)

    def __getitem__(self, idx):
        """Load one sample.

        Returns:
            Tuple ``(image, label)`` where ``image`` is the RGB image (after
            ``transform`` if one was given) and ``label`` is the integer class
            index.

        Raises:
            OSError: If the image file is missing or cannot be decoded.
            KeyError: If the parent folder name is not a known label.
        """
        image_filepath = self.images_filepaths[idx]
        image = cv2.imread(image_filepath)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than inside cvtColor.
        if image is None:
            raise OSError(f"Image is missing or unreadable: {image_filepath}")
        # OpenCV decodes to BGR; the rest of the pipeline expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # The label is the name of the directory that contains the image.
        label = LABEL2IDX[os.path.normpath(image_filepath).split(os.sep)[-2]]

        if self.transform is not None:
            image = self.transform(image=image)["image"]

        return image, label

In [4]:
def return_all_files_in_dir(path):
    """Recursively collect the paths of all ``.jpg`` files under ``path``.

    Args:
        path: Root directory to walk.

    Returns:
        Sorted list of file paths (each joined onto the directory it was found
        in) whose extension is exactly ``.jpg``. Empty if ``path`` does not
        exist or contains no matching files.
    """
    result = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(path)
        for filename in filenames
        if os.path.splitext(filename)[1] == '.jpg'
    ]
    # os.walk yields entries in a filesystem-dependent order; sorting makes the
    # returned list (and therefore dataset indexing) deterministic.
    return sorted(result)

In [5]:
def get_paths(data_path):
    """List every file of every known label folder under ``data_path``.

    Args:
        data_path: Directory that contains one sub-folder per label in
            ``LABEL2IDX``.

    Returns:
        Tuple ``(paths, class_labels)`` of equal length: ``paths`` holds
        ``"<label>/<file>"`` paths relative to ``data_path`` and
        ``class_labels`` holds the matching label string for each path.

    Raises:
        FileNotFoundError: If a label folder is missing under ``data_path``.
    """
    paths = []
    class_labels = []
    for label in LABEL2IDX:
        # os.listdir returns names in arbitrary order; sort so that the seeded
        # split applied to these paths is reproducible across machines.
        label_paths = [
            os.path.join(label, item)
            for item in sorted(os.listdir(os.path.join(data_path, label)))
        ]
        paths.extend(label_paths)
        class_labels.extend([label] * len(label_paths))
    return paths, class_labels


def split(paths, class_labels):
    """Split paths into train / validation / test subsets.

    10% of the samples are held out first, then that hold-out is divided in
    half into validation and test. Both cuts use ``SEED`` as random state.

    Args:
        paths: Sequence of sample paths.
        class_labels: Label for each entry of ``paths``.

    Returns:
        Tuple ``(train_paths, val_paths, test_paths)``.
    """
    first_cut = train_test_split(paths, class_labels, test_size=0.1, random_state=SEED)
    train_paths = first_cut[0]
    holdout_paths, holdout_labels = first_cut[1], first_cut[3]

    # Second cut: halve the hold-out into validation and test.
    second_cut = train_test_split(holdout_paths, holdout_labels, test_size=0.5, random_state=SEED)
    val_paths, test_paths = second_cut[0], second_cut[1]
    return train_paths, val_paths, test_paths


def copy_files(data_path, new_data_path, paths, dataset_name, verbose = False):
    """Copy files into the ``dataset_name`` sub-folder of ``new_data_path``.

    Args:
        data_path: Source root that the entries of ``paths`` are relative to.
        new_data_path: Destination root.
        paths: Relative file paths to copy; the relative layout is preserved.
        dataset_name: Name of the destination sub-folder (also shown in the
            progress bar description).
        verbose: When true, show a progress bar.
    """
    destination_root = os.path.join(new_data_path, dataset_name)
    progress = tqdm(paths, total=len(paths), desc=f"Copying {dataset_name} files", disable=not verbose)
    for relative_path in progress:
        source = os.path.join(data_path, relative_path)
        destination = os.path.join(destination_root, relative_path)
        shutil.copy(source, destination)

### ========

In [6]:
# Register this run with ClearML as task 'train model and log' in project 'torch demo'.
task = Task.init(project_name = 'torch demo', task_name = 'train model and log')
# Output-model handle bound to the task.
# NOTE(review): `output_model` is not referenced again in the visible cells - confirm it is needed.
output_model = OutputModel(task=task)

ClearML Task: created new task id=d0966bbe57c247bc92addb98acf5856b
ClearML results page: http://10.72.148.193:8080/projects/cdde80f225cb43d288c9078f180af237/experiments/d0966bbe57c247bc92addb98acf5856b/output/log
2022-11-10 18:45:33,122 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring


In [None]:
# SECURITY: proxy URLs embed a service-account username and password, so they
# are read from the environment instead of being committed in the notebook.
# NOTE(review): the credential that used to be hard-coded here has been exposed
# in version history and should be rotated.
docker_env = {
    "http_proxy": os.environ.get("http_proxy", ""),
    "https_proxy": os.environ.get("https_proxy", ""),
    "no_proxy": "localhost,127.0.0.1,localaddress,localdomain.com,10.72.148.193",
}
# Only forward variables that are actually set, so an unset proxy does not
# produce an empty `--env name=` argument.
docker_arguments = " ".join(
    f"--env {name}={value}" for name, value in docker_env.items() if value
)
task.set_base_docker(docker_image='python:3.9', docker_arguments=docker_arguments)
# Hand the task over to the 'ai-gpu-docker' queue for remote execution.
task.execute_remotely(queue_name='ai-gpu-docker')



In [None]:
# Look up the ClearML dataset registered under the name 'animal-data'.
raw_data = ClearMLDataset.get(dataset_name='animal-data')
# Obtain a local copy of the dataset; the result is used below as the root
# directory that holds one sub-folder per label.
local_copy = raw_data.get_local_copy()

In [None]:
# Relative "<label>/<file>" paths and the matching label for every file.
paths, class_labels = get_paths(local_copy)
# Seeded split of the relative paths into train / validation / test subsets.
train_paths, val_paths, test_paths = split(paths, class_labels)

In [None]:
# Show progress bars while copying files.
verbose = True

# Root folder that will receive the train/val/test folder trees.
dataset_folder = 'dataset_split/'
# exist_ok=True keeps this cell re-runnable: without it a second run raises
# FileExistsError because the folder is left over from the previous run.
os.makedirs(dataset_folder, exist_ok=True)

In [None]:
# Create one folder per (subset, label) pair up front so that the copy step
# below always has an existing destination directory.
for dataset_name in ("train", "test", "val"):
    subset_root = os.path.join(dataset_folder, dataset_name)
    for label in LABEL2IDX:
        os.makedirs(os.path.join(subset_root, label), exist_ok=True)

# Copy each subset's files into its folder tree (train, then val, then test).
paths_by_subset = {"train": train_paths, "val": val_paths, "test": test_paths}
for dataset_name, ds_paths in paths_by_subset.items():
    copy_files(local_copy, dataset_folder, ds_paths, dataset_name, verbose)

In [None]:
# Collect the .jpg files that were copied into the train and val folder trees.
# The test subset is not loaded in the visible cells.
train_images_path = return_all_files_in_dir(os.path.join(dataset_folder, 'train/'))
val_images_path = return_all_files_in_dir(os.path.join(dataset_folder, 'val/'))

In [None]:
# Training-time augmentation and preprocessing pipeline.
# NOTE(review): Normalize runs before Resize, so normalisation is applied to
# the full-size image; Resize-then-Normalize is the more common order - confirm
# the current order is intentional.
train_transform = A.Compose(
    [
        A.HorizontalFlip(p=0.5),  # applied with probability 0.5
        A.GaussNoise(p=0.2),  # applied with probability 0.2
        A.RandomBrightnessContrast(p=0.3),  # applied with probability 0.3
        # Per-channel mean/std; presumably the ImageNet statistics - TODO confirm.
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        A.Resize(224, 224),
        ToTensorV2(),
    ]
)
# Training dataset: train image paths with the augmentation pipeline attached.
train_dataset = AnimalsDataset(images_filepaths=train_images_path, transform=train_transform)

In [None]:
# Validation preprocessing: same Normalize / Resize / ToTensorV2 steps as the
# training pipeline, without the random augmentations.
val_transform = A.Compose(
    [
        # Per-channel mean/std; presumably the ImageNet statistics - TODO confirm.
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        A.Resize(224, 224),
        ToTensorV2(),
    ]
)
# Validation dataset: val image paths with the deterministic pipeline attached.
val_dataset = AnimalsDataset(images_filepaths=val_images_path, transform=val_transform)

In [None]:
def calculate_accuracy(output, target):
    """Return the fraction of samples whose top-scoring class equals the target.

    Args:
        output: Tensor of per-class scores; classes are along dimension 1.
        target: Tensor of integer class indices, one per sample.

    Returns:
        Accuracy as a Python float.
    """
    # Softmax is monotonic, so the argmax of the raw scores is the same as the
    # argmax of the probabilities; skipping it avoids needless computation.
    classes = output.argmax(dim=1)

    return (target == classes).float().mean().item()

In [None]:
class MetricMonitor:
    """Accumulates running sums and averages of named scalar metrics.

    For every metric name the monitor stores a dict with:
      * ``"val"``   - sum of all values passed to ``update``,
      * ``"count"`` - number of updates,
      * ``"avg"``   - ``val / count``.

    Args:
        float_precision: Number of decimals used when formatting averages in
            ``__str__``.
    """

    def __init__(self, float_precision=3):
        self.float_precision = float_precision
        self.reset()

    def reset(self):
        """Forget all metrics recorded so far."""
        self.metrics = defaultdict(lambda: {"val": 0, "count": 0, "avg": 0})

    def update(self, metric_name, val):
        """Add ``val`` to the metric ``metric_name`` and refresh its average."""
        metric = self.metrics[metric_name]

        metric["val"] += val
        metric["count"] += 1
        metric["avg"] = metric["val"] / metric["count"]

    def get(self, metric_name):
        """Return the running average of ``metric_name``.

        Returns ``None`` when the metric has never been updated. The membership
        test avoids indexing the defaultdict, which would silently create an
        empty entry that then shows up in ``__str__``.
        """
        if metric_name not in self.metrics:
            return None
        return self.metrics[metric_name]["avg"]

    def __str__(self):
        """Format all metrics as ``"name: avg | name: avg | ..."``."""
        return " | ".join(
            [
                "{metric_name}: {avg:.{float_precision}f}".format(
                    metric_name=metric_name, avg=metric["avg"], float_precision=self.float_precision
                )
                for (metric_name, metric) in self.metrics.items()
            ]
        )

In [None]:
# Hyperparameters and runtime settings for the training run.
params = {
    "model": "resnet18",  # informational; the architecture is instantiated explicitly further down
    "device": "cuda" if torch.cuda.is_available() else "cpu",  # use the GPU when one is available
    "lr": 0.001,  # learning rate passed to the optimizer
    "batch_size": 128,  # batch size for both data loaders
    "num_workers": 0,  # DataLoader worker count for both loaders
    "epochs": 3,  # number of train/validate passes
}

In [None]:
# Attach the hyperparameter dict to the ClearML task.
# NOTE(review): the return value of connect() is discarded - confirm that
# values overridden in ClearML are reflected in `params` in place.
task.connect(params)

In [None]:
# ResNet-18 without pretrained weights.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` - confirm against the installed version.
model = models.resnet18(pretrained=False)
# Replace the final fully connected layer so it outputs NUM_CLASSES scores.
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, NUM_CLASSES)
model = model.to(params["device"])

# Cross-entropy loss on the raw model outputs; plain SGD with the configured learning rate.
criterion = nn.CrossEntropyLoss().to(params["device"])
optimizer = torch.optim.SGD(model.parameters(), lr=params["lr"])

In [None]:
# Training loader: batches are reshuffled every epoch.
train_data_loader = DataLoader(
    train_dataset, batch_size=params["batch_size"], shuffle=True, num_workers=params["num_workers"], pin_memory=True,
)
# Validation loader: fixed order, no shuffling.
val_data_loader = DataLoader(
    val_dataset, batch_size=params["batch_size"], shuffle=False, num_workers=params["num_workers"], pin_memory=True,
)

In [None]:
def train(train_loader, model, criterion, optimizer, epoch, params):
    """Run one training epoch over ``train_loader``.

    Args:
        train_loader: Iterable yielding ``(images, target)`` batches.
        model: Network to optimise; switched to training mode.
        criterion: Loss function called as ``criterion(output, target)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the progress-bar description.
        params: Settings dict; ``params["device"]`` selects the device.
    """
    device = params["device"]
    monitor = MetricMonitor()
    model.train()
    progress = tqdm(train_loader)
    for images, target in progress:
        images = images.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True).long()

        # Forward pass and running metrics.
        logits = model(images)
        loss = criterion(logits, target)
        batch_accuracy = calculate_accuracy(logits, target)
        monitor.update("Loss", loss.item())
        monitor.update("Accuracy", batch_accuracy)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        progress.set_description(f"Epoch: {epoch}. Train. {monitor}")

In [None]:
def validate(val_loader, model, criterion, epoch, params):
    """Evaluate ``model`` on ``val_loader`` and report the results to ClearML.

    Args:
        val_loader: Iterable yielding ``(images, target)`` batches.
        model: Network to evaluate; switched to evaluation mode.
        criterion: Loss function called as ``criterion(output, target)``.
        epoch: Epoch number; used as the ClearML iteration and in the
            progress-bar description.
        params: Settings dict; ``params["device"]`` selects the device.

    Returns:
        Dict with the per-batch averages ``{"loss": ..., "accuracy": ...}``
        (both 0 when ``val_loader`` yields no batches).
    """
    metric_monitor = MetricMonitor()
    model.eval()
    stream = tqdm(val_loader)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for images, target in stream:
            images = images.to(params["device"], non_blocking=True)
            target = target.to(params["device"], non_blocking=True).long()
            output = model(images)
            loss = criterion(output, target)
            accuracy = calculate_accuracy(output, target)

            metric_monitor.update("Loss", loss.item())
            metric_monitor.update("Accuracy", accuracy)
            stream.set_description(
                "Epoch: {epoch}. Validation. {metric_monitor}".format(epoch=epoch, metric_monitor=metric_monitor)
            )

    # Report the epoch averages. The 'val' field is the running SUM over
    # batches, which grows with the number of batches and is not a meaningful
    # loss/accuracy value.
    avg_loss = metric_monitor.metrics['Loss']['avg']
    avg_accuracy = metric_monitor.metrics['Accuracy']['avg']
    logger = Logger.current_logger()
    logger.report_scalar("val", "loss", iteration=epoch, value=avg_loss)
    logger.report_scalar("val", "accuracy", iteration=epoch, value=avg_accuracy)

    return {"loss": avg_loss, "accuracy": avg_accuracy}

In [None]:
# Main loop: one training pass followed by one validation pass per epoch
# (epochs are numbered from 1).
for epoch in range(1, params["epochs"] + 1):
    train(train_data_loader, model, criterion, optimizer, epoch, params)
    # NOTE(review): `asd` is never read in the visible cells - consider a
    # descriptive name or dropping the assignment.
    asd = validate(val_data_loader, model, criterion, epoch, params)

In [None]:
# Folder that receives the trained weights.
model_folder = 'model/'
# exist_ok=True keeps this cell re-runnable: without it a second run raises
# FileExistsError and the freshly trained weights are never saved.
os.makedirs(model_folder, exist_ok=True)
# Persist only the parameters (state dict), not the full module object.
torch.save(model.state_dict(), os.path.join(model_folder, 'resnet18.pt'))

In [None]:
# Explicitly mark the ClearML task as completed once the weights are saved.
task.mark_completed()