In [None]:
!pip install -U ultralytics wandb

In [None]:
import random
import shutil
import torch
import yaml
import os

import numpy as np

from pathlib import Path

from ultralytics import YOLO

from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive; the dataset and checkpoint paths used further down
# all start with this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Copy the checkpoint from Drive into the working directory; it is loaded
# later through YOLO('best.pt').
drive_checkpoint = '/content/drive/MyDrive/ITaS/data/best.pt'
shutil.copy2(drive_checkpoint, 'best.pt')

In [None]:
import wandb
from wandb.integration.ultralytics import add_wandb_callback

# SECURITY: never store a W&B API key in the notebook. Any key that was ever
# committed with this file must be treated as leaked and rotated.
# The key is taken from the WANDB_API_KEY environment variable; when it is not set,
# `key=None` leaves the lookup to wandb itself (presumably stored credentials or an
# interactive prompt — confirm against the wandb.login documentation).
wandb.login(key=os.environ.get('WANDB_API_KEY'))

In [None]:
def create_yaml(output_yaml_path, train_image_dir, val_image_dir, test_image_dir=' ', nc=10):
    """Write the dataset description YAML for one experiment.

    The file contains the keys ``names``, ``nc``, ``train``, ``val`` and ``test``.

    Args:
        output_yaml_path: Where the YAML file is written (overwritten if present).
        train_image_dir: Value stored under ``train``.
        val_image_dir: Value stored under ``val``.
        test_image_dir: Value stored under ``test``; the default is a single
            space, which is written as-is.
        nc: Value stored under ``nc``. It is NOT checked against the length of
            the fixed class-name list below (10 entries).
    """
    class_names = [
        'lungs',
        'trachea',
        'bronchitis',
        'pneumonia/bronchopneumonia',
        'pulmonary edema',
        'hydrothorax',
        'pneumothorax',
        'tracheal collapse',
        'neoplasm',
        'atelectasis',
    ]

    dataset_config = dict(
        names=class_names,
        nc=nc,
        train=train_image_dir,
        val=val_image_dir,
        test=test_image_dir,
    )

    # Block style (default_flow_style=False) keeps one key / list item per line.
    with open(output_yaml_path, 'w') as yaml_file:
        yaml.dump(dataset_config, yaml_file, default_flow_style=False)

In [None]:
def _scan_dataset_tree(path_to_dataset):
    """Walk the dataset once and bucket files by the split directory they live in.

    Returns three sorted lists: train images, val images, val labels.
    """
    train_images, val_images, val_labels = [], [], []
    for root, _, files in os.walk(path_to_dataset):
        # os.walk yields '/'-separated roots on Linux/Colab and '\\'-separated ones
        # on Windows; normalise (for matching only) so the same markers work on both.
        normalized_root = root.replace('\\', '/')
        if '/train/images' in normalized_root:
            bucket = train_images
        elif '/val/images' in normalized_root:
            bucket = val_images
        elif '/val/labels' in normalized_root:
            bucket = val_labels
        else:
            continue
        bucket.extend(os.path.join(root, file) for file in files)
    # Directory listing order is arbitrary; sorting makes the later ranking
    # (and its tie-breaking) reproducible between runs.
    return sorted(train_images), sorted(val_images), sorted(val_labels)


def _label_path_for(image_path):
    """Map ``<...>/images/<name>.<ext>`` to ``<...>/labels/<name>.txt``."""
    directory, filename = os.path.split(image_path)
    # Only the last `images` occurrence in the directory part is swapped, and the
    # file name is never touched, so names containing "images" stay intact.
    head, found, tail = directory.rpartition('images')
    if found:
        directory = head + 'labels' + tail
    # Any image extension (.jpg, .png, ...) is replaced, not just '.jpg'.
    return os.path.join(directory, os.path.splitext(filename)[0] + '.txt')


def calculation_entropy_sampling_dataset(
    confidence_threshold: float = 0.01,
    path_to_dataset: str = '/content/drive/MyDrive/ITaS/data/full_train',
    model_path: str = 'best.pt'
):
    """Rank the training images of the full dataset by an entropy-style score.

    Every file found under a ``train/images`` directory is run through the model
    and scored with ``-sum(p * log(p + 1e-10))`` over the confidences ``p`` of the
    boxes returned for that image. Images are then ordered by ascending score.

    Training label paths are not collected from disk: the ranked image list is
    meant to be truncated afterwards, so label paths are derived from the image
    paths instead (same file name, ``labels`` directory, ``.txt`` extension).

    Args:
        confidence_threshold: Passed to ``model.predict`` as ``conf``.
        path_to_dataset: Root directory that is walked for ``train/images``,
            ``val/images`` and ``val/labels`` directories.
        model_path: Weights file handed to ``YOLO``.

    Returns:
        Tuple ``(train_images, train_labels, val_images, val_labels)`` of path
        lists; the two train lists are index-aligned and ordered by score.
    """
    train_images_paths, val_images_paths, val_labels_paths = _scan_dataset_tree(path_to_dataset)

    model = YOLO(model_path)

    scored_train_images = []
    for i, image_path in enumerate(train_images_paths):
        print(f'{i}/{len(train_images_paths)} ->> {image_path}')
        result = model.predict(image_path, conf=confidence_threshold)[0]

        boxes = result.boxes
        if boxes is None:
            # Guard for results that carry no boxes object; scored like "no detections".
            entropy = 0.0
        else:
            # Convert once instead of calling .cpu().numpy() twice per image.
            confidences = boxes.conf.cpu().numpy()
            # The epsilon keeps log() finite for zero confidences.
            entropy = -np.sum(confidences * np.log(confidences + 1e-10))

        scored_train_images.append((entropy, image_path))

    # NOTE(review): ascending order puts the LOWEST-score images first, and callers
    # keep the head of this list. Entropy sampling usually selects the highest
    # scores — confirm that ascending is intended before changing it.
    scored_train_images.sort(key=lambda scored: scored[0])

    selected_train_images_samples = [image_path for _, image_path in scored_train_images]
    selected_train_labels_samples = [_label_path_for(image_path) for image_path in selected_train_images_samples]

    return (selected_train_images_samples, selected_train_labels_samples, val_images_paths, val_labels_paths)

In [None]:
# Rank the full training split once. The result is the tuple
# (train_images, train_labels, val_images, val_labels) reused by the cells below.
entropy_sampling_full_dataset = calculation_entropy_sampling_dataset()

# Sanity check: how many training images were ranked.
ranked_train_images = entropy_sampling_full_dataset[0]
print(len(ranked_train_images))

In [None]:
def _take_head_percentage(items, percentage):
    """Return the first ``percentage`` percent of ``items`` (rounded down)."""
    # Multiply before dividing: int(n * (p / 100)) can lose an element to float
    # error, e.g. 100 * (29 / 100) == 28.999999999999996 -> 28.
    return items[:int(len(items) * percentage // 100)]


def _copy_files_into(file_paths, destination_dir):
    """Copy every file into ``destination_dir``, keeping only its base name."""
    for file_path in file_paths:
        shutil.copy2(file_path, os.path.join(destination_dir, os.path.basename(file_path)))


def create_entropy_sampling_dataset(
    train_images_path: list,
    train_labels_path: list,
    val_images_path: list,
    val_labels_path: list,
    experiment_name: str,
    percentage_from_top: int = None,
    alg_name: str = 'entropy_sampling'
):
    """Materialise one experiment dataset under ``data/<alg_name>/<experiment_name>``.

    The head of the (already ranked) training lists and the complete validation
    lists are copied into ``train/images``, ``train/labels``, ``val/images`` and
    ``val/labels`` below the experiment directory, and a ``data.yaml`` is written
    next to them via ``create_yaml``. The experiment directory is relative to the
    current working directory.

    Args:
        train_images_path: Ranked list of training image paths.
        train_labels_path: Label paths, index-aligned with ``train_images_path``.
        val_images_path: Validation image paths (always copied in full).
        val_labels_path: Validation label paths (always copied in full).
        experiment_name: Name of the experiment sub-directory.
        percentage_from_top: Percentage (0-100) of the training lists to keep,
            counted from the start. ``None`` keeps everything.
        alg_name: Name of the directory that groups the experiments.

    Raises:
        ValueError: If ``percentage_from_top`` is outside ``[0, 100]``.
    """
    # The declared default used to crash on `None / 100`; treat it as "use all".
    if percentage_from_top is None:
        percentage_from_top = 100
    if not 0 <= percentage_from_top <= 100:
        raise ValueError(
            f'percentage_from_top must be between 0 and 100, got {percentage_from_top!r}')

    train_images_path = _take_head_percentage(train_images_path, percentage_from_top)
    train_labels_path = _take_head_percentage(train_labels_path, percentage_from_top)

    experiment_dir = os.path.join('data', alg_name, experiment_name)
    train_images_dir = os.path.join(experiment_dir, 'train', 'images')
    val_images_dir = os.path.join(experiment_dir, 'val', 'images')
    train_labels_dir = os.path.join(experiment_dir, 'train', 'labels')
    val_labels_dir = os.path.join(experiment_dir, 'val', 'labels')

    # Each source list is copied into its own destination directory. File names
    # come from os.path.basename, so a parent directory that happens to contain
    # "images"/"labels" cannot corrupt the destination name.
    copy_plan = (
        (train_images_path, train_images_dir),
        (train_labels_path, train_labels_dir),
        (val_images_path, val_images_dir),
        (val_labels_path, val_labels_dir),
    )
    for _, destination_dir in copy_plan:
        os.makedirs(destination_dir, exist_ok=True)
    for file_paths, destination_dir in copy_plan:
        _copy_files_into(file_paths, destination_dir)

    # The YAML stores the image directories relative to the experiment directory.
    yaml_path = os.path.join(experiment_dir, 'data.yaml')
    train_path = os.path.join('train', 'images')
    val_path = os.path.join('val', 'images')
    create_yaml(yaml_path, train_path, val_path)

In [None]:
# Build the 1 % experiment. The result tuple is ordered
# (train_images, train_labels, val_images, val_labels), which matches the first
# four parameters of the function, so it is unpacked positionally.
create_entropy_sampling_dataset(
    *entropy_sampling_full_dataset,
    experiment_name='1_train',
    percentage_from_top=1,
)

In [None]:
# Build the 10 % experiment. The result tuple is ordered
# (train_images, train_labels, val_images, val_labels), which matches the first
# four parameters of the function, so it is unpacked positionally.
create_entropy_sampling_dataset(
    *entropy_sampling_full_dataset,
    experiment_name='10_train',
    percentage_from_top=10,
)

In [None]:
# Build the 20 % experiment. The result tuple is ordered
# (train_images, train_labels, val_images, val_labels), which matches the first
# four parameters of the function, so it is unpacked positionally.
create_entropy_sampling_dataset(
    *entropy_sampling_full_dataset,
    experiment_name='20_train',
    percentage_from_top=20,
)

In [None]:
def train_yolo_model(main_experiment: str, name: str, seed: int):
    """Train ``yolov8m-seg.pt`` on one prepared experiment and track the run in W&B.

    Args:
        main_experiment: Directory that groups the experiments (e.g. the sampling
            algorithm name); second-to-last directory of the ``data.yaml`` path.
        name: Experiment directory holding ``data.yaml``; also used as the run name
            passed to ``model.train``.
        seed: Seed forwarded to ``model.train``.

    Returns:
        Whatever ``model.train`` returns (it used to be discarded).

    NOTE(review): the dataset description is read from
    ``/content/drive/MyDrive/ITaS/data/<main_experiment>/<name>/data.yaml``, while
    ``create_entropy_sampling_dataset`` writes to a ``data/...`` directory relative
    to the working directory. Confirm both resolve to the same location.
    """
    wandb.init(project='ITaS', job_type='training')
    try:
        model = YOLO("yolov8m-seg.pt")

        return model.train(
            data=os.path.join('/content/drive/MyDrive/ITaS', 'data', main_experiment, name, 'data.yaml'),
            project='ITaS',
            name=name,
            epochs=25,
            # presumably 0 disables early stopping — confirm against the Ultralytics docs
            patience=0,
            batch=5,
            imgsz=640,
            seed=seed
        )
    finally:
        # Always close the W&B run, even when training raises; otherwise the next
        # call in a seed loop would start while the failed run is still open.
        wandb.finish()

In [None]:
# Five training runs on the '1_train' experiment, seeded 10, 20, 30, 40, 50.
for seed in range(10, 51, 10):
    train_yolo_model(main_experiment='entropy_sampling', name='1_train', seed=seed)

In [None]:
# Five training runs on the '10_train' experiment, seeded 11, 22, 33, 44, 55.
for seed in range(11, 56, 11):
    train_yolo_model(main_experiment='entropy_sampling', name='10_train', seed=seed)

In [None]:
# Five training runs on the '20_train' experiment, seeded 12, 24, 36, 48, 60.
for seed in range(12, 61, 12):
    train_yolo_model(main_experiment='entropy_sampling', name='20_train', seed=seed)