<a href="https://colab.research.google.com/github/abaranguer/uoc_tfm/blob/main/TFM_Albert_Baranguer_Codina.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TFM Albert Baranguer i Codina
# Entrenament d'una xarxa ResNet18 per a la classificació del dataset HAM10000

In [None]:


# Mount Google Drive at /content/drive; the dataset, saved dataframes and
# model checkpoints used by later cells all live under this mount point.
from google.colab import drive
drive.mount("/content/drive")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install colab_ssh --upgrade --quiet

from colab_ssh import launch_ssh_cloudflared, init_git_cloudflared

# SECURITY: credentials must never be stored in the notebook. The SSH password
# and the GitHub personal access token that used to be hardcoded here were
# published with the notebook and must be revoked/rotated.
# Both secrets are now taken from environment variables when present and
# otherwise requested interactively (the input is not echoed nor saved).
import os
from getpass import getpass

password = os.environ.get('COLAB_SSH_PASSWORD') or getpass('SSH tunnel password: ')
launch_ssh_cloudflared(password)

git_repo = 'https://github.com/abaranguer/uoc_tfm'

github_token = os.environ.get('GITHUB_TOKEN') or getpass('GitHub personal access token: ')

init_git_cloudflared(repository_url=git_repo + ".git",
         personal_token=github_token,
         branch="main",
         email="abaranguer@gmail.com",
         username="abaranguer")

In [None]:
# Lookup tables for the HAM10000 diagnosis ("dx") classes:
#   int_to_dx         integer label -> short class code
#   dx_to_int         short class code -> integer label
#   dx_to_description short class code -> human-readable name

int_to_dx = ['akiec', 'bcc', 'bkl', 'df', 'nv', 'mel', 'vasc']

# The integer label of a class is its position in int_to_dx.
dx_to_int = {dx_code: label for label, dx_code in enumerate(int_to_dx)}

dx_to_description = dict(
    akiec='Actinic Keratoses and Intraepithelial Carcinoma',
    bcc='Basal cell carcinoma',
    bkl='"Benign keratosis"',
    df='Dermatofibroma',
    nv='Melanocytic nevi',
    mel='Melanoma',
    vasc='Vascular skin lesions',
)


In [None]:
# dataset analyzer

import pandas
import time


class Ham10000DatasetAnalyzer:
    """Summarises the class ('dx') distribution of a HAM10000 metadata table.

    Typical use: call `analyze_path` or `analyze_dataframe`, then `show` to
    print the summary or `metadata` to get the raw figures.
    """

    def __init__(self):
        self.path = None                    # CSV path of the last analyze_path call, else None
        self.df = None                      # dataframe currently being analysed
        self.num_of_images = 0              # number of rows in the 'dx' column
        self.dataset_classes = 0            # becomes the array of distinct 'dx' values after analyze()
        self.dataset_classes_counts = None  # pandas Series: 'dx' value -> number of rows

    def analyze_path(self, path):
        """Load the metadata CSV found at `path` and analyse it."""
        self.path = path
        self.df = pandas.read_csv(path)
        self.analyze()

    def analyze_dataframe(self, df):
        """Analyse an already loaded dataframe (it must have a 'dx' column)."""
        self.path = None
        self.df = df
        self.analyze()

    def analyze(self):
        """Recompute the statistics from the 'dx' column of `self.df`."""
        self.num_of_images = len(self.df['dx'])
        self.dataset_classes = self.df['dx'].unique()
        self.dataset_classes_counts = self.df['dx'].value_counts()

    def metadata(self):
        """Return (num_of_images, dataset_classes, dataset_classes_counts)."""
        return self.num_of_images, self.dataset_classes, self.dataset_classes_counts

    def show(self, title):
        """Print the number of images and the per-class counts and percentages.

        Args:
            title: text shown in the header line of the report.
        """
        print(f'---- Analyzer. {title} ----\n')
        print(f'num of images: {self.num_of_images}')
        print(f'num of classes: {self.dataset_classes}')
        # Take the class name from the value_counts() index. value_counts() is
        # sorted by frequency while unique() keeps order of appearance, so
        # pairing the two by position attributes counts to the wrong classes.
        for class_name, class_count in self.dataset_classes_counts.items():
            percentage = 100.0 * class_count / self.num_of_images
            print(
                f'\tclasse: "{class_name}"; num of images: {class_count};{percentage: .2f} % of the dataset.')
        print('------------------------')

    def save_dataframe(self, data_frame, filename,
                       output_dir='/content/drive/MyDrive/UOC-TFM/dataframes/'):
        """Pickle `data_frame` as '<output_dir>/<timestamp>_<filename>'.

        Args:
            data_frame: pandas object to serialise with `to_pickle`.
            filename: base file name; a YYYYmmddHHMMSS timestamp is prepended.
            output_dir: destination directory (defaults to the project folder
                on the mounted Google Drive).

        Returns:
            The full path of the file that was written.
        """
        import os

        timestamp = time.strftime("%Y%m%d%H%M%S")
        target = os.path.join(output_dir, timestamp + '_' + filename)
        data_frame.to_pickle(target)
        return target

In [None]:
# dataset splitter

import numpy as np
import pandas
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision import transforms

class Ham10000DatasetSplitter:
    """Splits the HAM10000 metadata into train / validation / test subsets.

    For each subset the constructor prints a class-distribution report,
    pickles the subset dataframe and builds a `Ham10000Dataset` and a
    shuffling `DataLoader`.

    Args:
        dataset_metadata_path: path of the metadata CSV.
        dataset_images_path: folder with the images, passed to `Ham10000Dataset`.
        percent_val: fraction of the whole dataset used for validation.
        percent_test: fraction of the whole dataset used for testing.
        BATCH_SIZE: batch size of the training dataloader.
        VAL_BATCH_SIZE: batch size of the validation dataloader.
        TEST_BATCH_SIZE: batch size of the test dataloader.

    Attributes:
        train_set, validation_set, test_set: the subset dataframes.
        train_dataset, validation_dataset, test_dataset: the datasets.
        train_dataloader, validation_dataloader, test_dataloader: the loaders.
        data_transform: transform applied to every image.
    """

    def __init__(self, dataset_metadata_path, dataset_images_path,
                 percent_val=0.15, percent_test=0.15,
                 BATCH_SIZE=100, VAL_BATCH_SIZE=20, TEST_BATCH_SIZE=20):
        # Seed NumPy's global RNG before splitting so the split is repeatable.
        np.random.seed(0)

        # Read the metadata once and reuse it for both the report and the split.
        df = pandas.read_csv(dataset_metadata_path)
        analyzer = Ham10000DatasetAnalyzer()
        analyzer.analyze_dataframe(df)
        analyzer.show('FULL DATASET')

        # First split: train vs. (validation + test).
        percent_validation = percent_val + percent_test
        self.train_set, val_test_set = train_test_split(df, test_size=percent_validation)
        # Second split: share of the held-out part that becomes the test set.
        percent_test_validation = percent_test / percent_validation
        self.validation_set, self.test_set = train_test_split(val_test_set, test_size=percent_test_validation)

        # Report and persist each subset under its own file name. The
        # validation and test files must hold their own subset, not a copy of
        # the training set.
        subsets = (
            ('TRAIN SET', self.train_set, 'dataframe_train_set.pkl'),
            ('VALIDATION SET', self.validation_set, 'dataframe_validation_set.pkl'),
            ('TEST SET', self.test_set, 'dataframe_test_set.pkl'),
        )
        for title, subset, filename in subsets:
            analyzer.analyze_dataframe(subset)
            analyzer.show(title)
            analyzer.save_dataframe(subset, filename)

        # Same deterministic preprocessing for the three subsets: tensor
        # conversion followed by per-channel normalisation.
        self.data_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

        self.train_dataset = Ham10000Dataset(self.train_set, dataset_images_path, self.data_transform)
        self.validation_dataset = Ham10000Dataset(self.validation_set, dataset_images_path, self.data_transform)
        self.test_dataset = Ham10000Dataset(self.test_set, dataset_images_path, self.data_transform)

        self.train_dataloader = DataLoader(
            self.train_dataset,
            batch_size=BATCH_SIZE,
            shuffle=True
        )

        self.validation_dataloader = DataLoader(
            self.validation_dataset,
            batch_size=VAL_BATCH_SIZE,
            shuffle=True
        )

        self.test_dataloader = DataLoader(
            self.test_dataset,
            batch_size=TEST_BATCH_SIZE,
            shuffle=True
        )

In [None]:
# HAM10000 Dataset
 
import matplotlib.pyplot as plt
import numpy as np
import pandas
import torchvision
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms


class Ham10000Dataset(Dataset):
    """Torch dataset that serves HAM10000 images with their diagnosis label.

    Args:
        csv: metadata dataframe; only the 'image_id' and 'dx' columns are used.
        img_folder: folder holding '<image_id>.jpg' files. It is concatenated
            with the file name as-is, so it must end with a path separator.
        transform: callable applied to each PIL image after loading.
    """

    def __init__(self, csv, img_folder, transform):
        self.csv = csv
        self.transform = transform
        self.img_folder = img_folder
        self.image_names = self.csv['image_id']
        # Select the two needed columns by name rather than dropping a fixed
        # list of others: this works whether or not optional columns such as
        # 'dataset' are present and does not depend on column order.
        self.labels = np.array(self.csv[['image_id', 'dx']])

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.image_names)

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            dict with the transformed 'image', its 'image_id', the class code
            'dx' and the integer 'label' looked up in `dx_to_int`.
        """
        image_id, dx = self.labels[index]
        img_path = self.img_folder + image_id + '.jpg'
        image = Image.open(img_path).convert('RGB')
        image = self.transform(image)
        return {'image': image,
                'image_id': image_id,
                'dx': dx,
                'label': dx_to_int[dx]}


In [None]:
#Resnet18 trainer

import time
import torch.optim
import torchvision.models as models
from torch.nn import CrossEntropyLoss
from torch.optim import SGD


class Ham10000ResNet18Trainer:
    """Trains a classification model with SGD and cross-entropy loss.

    Args:
        train_dataloader: iterable of batches; each batch is a dict with an
            'image' tensor and a 'label' tensor.
        model: the torch module to train (updated in place).
        epochs: number of passes over `train_dataloader`.
        output_dir: directory where the trained weights are saved (defaults to
            the project folder on the mounted Google Drive).
    """

    def __init__(self, train_dataloader, model, epochs=5,
                 output_dir='/content/drive/MyDrive/UOC-TFM/resnet18_parameters/'):
        self.train_dataloader = train_dataloader
        self.model = model
        self.epochs = epochs
        self.output_dir = output_dir
        self.loss = None
        self.optimizer = None
        self.which_device = ""

    def run_training(self):
        """Train the model and save its state_dict.

        The model is trained on the GPU when one is available, then moved back
        to the device it was on when this method was called, so code that
        feeds it afterwards keeps working unchanged. The weights are written
        to '<output_dir>/<timestamp>_ham10000_trained_model.pth'.
        """
        import os

        # select device (GPU or CPU)
        self.which_device = "cuda:0" if torch.cuda.is_available() else "cpu"
        print(f'using {self.which_device} device')
        device = torch.device(self.which_device)

        # Selecting a device has no effect by itself: the model and every
        # batch have to be moved onto it explicitly.
        original_device = next(self.model.parameters()).device
        self.model.to(device)
        self.model.train()  # batch-norm / dropout layers must be in training mode

        self.loss = CrossEntropyLoss()
        self.optimizer = SGD(self.model.parameters(), lr=0.001, momentum=0.9)

        for epoch in range(self.epochs):  # loop over the dataset multiple times
            running_loss = 0.0
            epoch_loss = 0.0
            num_batches = 0

            for i, images in enumerate(self.train_dataloader, 0):
                inputs = images['image'].to(device)
                labels = images['label'].to(device)

                self.optimizer.zero_grad()

                outputs = self.model(inputs)
                loss_current = self.loss(outputs, labels)
                loss_current.backward()
                self.optimizer.step()

                batch_loss = loss_current.item()
                running_loss += batch_loss
                epoch_loss += batch_loss
                num_batches += 1
                print(f'epoch: {epoch}; i : {i}')
                if i % 100 == 99:  # print every 100 mini-batches
                    print('[%d, %5d] loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / 100))
                    running_loss = 0.0

            # An epoch with fewer than 100 batches never reaches the periodic
            # report above, so always report the mean loss of the epoch too.
            if num_batches:
                print('[%d] mean loss: %.3f' % (epoch + 1, epoch_loss / num_batches))

        print('Finished Training')

        # Restore the caller's device before saving, so the checkpoint and the
        # in-memory model are in the same place as they were before training.
        self.model.to(original_device)

        timestamp = time.strftime("%Y%m%d%H%M%S")
        trained_model_filename = os.path.join(
            self.output_dir, timestamp + '_ham10000_trained_model.pth')
        torch.save(self.model.state_dict(), trained_model_filename)


In [None]:
#resnet18 predictor

import numpy as np
import pandas
import torch.optim
import torchvision.models as models
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision import transforms

class Ham10000ResNet18Predictor:
    """Predicts the class of the images in one batch of a dataloader.

    Args:
        model: torch module producing one score per class.
        test_dataloader: iterable of batches; each batch is a dict with an
            'image' tensor.
    """

    def __init__(self, model, test_dataloader):
        self.model = model
        self.test_dataloader = test_dataloader

    def run_predictor(self):
        """Print and return the predicted class codes for the first batch.

        Returns:
            list of class codes (entries of `int_to_dx`), one per image.
        """
        images = next(iter(self.test_dataloader))

        # Feed the batch on whatever device the model currently lives on.
        first_param = next(self.model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

        # Inference must run in eval mode (batch-norm / dropout behave
        # differently while training); the previous mode is restored after.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                images_as_tensors = images['image'].to(device)
                # Use the model given to this instance, not a global `model`.
                outputs = self.model(images_as_tensors)
                _, predicted = torch.max(outputs, 1)
        finally:
            self.model.train(was_training)

        predicted_classes = [int_to_dx[int(class_index)] for class_index in predicted]
        print('Predicted: ', ' '.join('%5s' % name for name in predicted_classes))
        return predicted_classes


In [None]:
#resnet18 validator

import numpy as np
import pandas
import torch.optim
import torchvision
import torchvision.models as models
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torchvision import transforms

class Ham10000ResNet18Validator:
    """Measures the accuracy of a model over a labelled dataloader.

    Args:
        model: torch module producing one score per class.
        validation_dataloader: iterable of batches; each batch is a dict with
            an 'image' tensor and a 'label' tensor.

    Attributes:
        accuracy: percentage of correctly classified images computed by the
            last `run_validation` call (0.0 before any call).
    """

    def __init__(self, model, validation_dataloader):
        self.model = model
        self.validation_dataloader = validation_dataloader
        self.accuracy = 0.0

    def run_validation(self):
        """Classify every batch, then store and print the overall accuracy."""
        correct = 0
        total = 0

        # Feed the batches on whatever device the model currently lives on.
        first_param = next(self.model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

        # Evaluation must run in eval mode (batch-norm / dropout behave
        # differently while training); the previous mode is restored after.
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                for i, images in enumerate(self.validation_dataloader, 0):
                    inputs = images['image'].to(device)
                    labels = images['label'].to(device)

                    print(f'batch {i}')

                    outputs = self.model(inputs)

                    _, predicted = torch.max(outputs, 1)
                    total += labels.size(0)
                    correct += (predicted == labels).sum().item()
        finally:
            self.model.train(was_training)

        # An empty dataloader yields 0 % instead of a ZeroDivisionError.
        self.accuracy = 100 * correct / total if total else 0.0
        print(f'num of correct predicted images (True positives): {correct}')
        print(f'num of images : {total}')
        print(f'Accuracy of the network on the validation images: {self.accuracy: .4f}%')


In [None]:
import time

def log_time(message):
    """Print `message` followed by the current local time ('YYYYMMDD - HHMMSS')."""
    now = time.strftime("%Y%m%d - %H%M%S")
    print('{} {}'.format(message, now))

### Resnet18 Training Pipeline

In [None]:
# Locations of the HAM10000 metadata table and of the image folder on the
# mounted Google Drive.
matadata_path = '/content/drive/MyDrive/UOC-TFM/dataset/HAM10000_metadata'
images_path = '/content/drive/MyDrive/UOC-TFM/dataset/dataset_ham_10000/ham10000/300x225/'

print('1 . Splits training, validation and test sets')
splitter = Ham10000DatasetSplitter(
    dataset_metadata_path=matadata_path,
    dataset_images_path=images_path,
)

# Expose the three dataloaders as notebook-level names for the next steps.
train_dataloader, validation_dataloader, test_dataloader = (
    splitter.train_dataloader,
    splitter.validation_dataloader,
    splitter.test_dataloader,
)


1 . Splits training, validation and test sets
---- Analyzer. FULL DATASET ----

num of images: 10015
num of classes: ['bkl' 'nv' 'df' 'mel' 'vasc' 'bcc' 'akiec']
	classe: "bkl"; num of images: 6705; 66.95 % of the dataset.
	classe: "nv"; num of images: 1113; 11.11 % of the dataset.
	classe: "df"; num of images: 1099; 10.97 % of the dataset.
	classe: "mel"; num of images: 514; 5.13 % of the dataset.
	classe: "vasc"; num of images: 327; 3.27 % of the dataset.
	classe: "bcc"; num of images: 142; 1.42 % of the dataset.
	classe: "akiec"; num of images: 115; 1.15 % of the dataset.
------------------------
---- Analyzer. TRAIN SET ----

num of images: 7010
num of classes: ['nv' 'bkl' 'bcc' 'akiec' 'vasc' 'df' 'mel']
	classe: "nv"; num of images: 4693; 66.95 % of the dataset.
	classe: "bkl"; num of images: 784; 11.18 % of the dataset.
	classe: "bcc"; num of images: 775; 11.06 % of the dataset.
	classe: "akiec"; num of images: 350; 4.99 % of the dataset.
	classe: "vasc"; num of images: 234; 3.3

In [None]:
print('2 - create ResNet18 model')
# resnet18() builds a 1000-output classifier by default. The output layer is
# sized to the HAM10000 classes so that every predicted index is a valid
# position in int_to_dx.
# NOTE: checkpoints saved from the 1000-output model cannot be loaded into
# this one; retrain (or rebuild with models.resnet18()) to reuse them.
model = models.resnet18(num_classes=len(dx_to_int))

2 - create ResNet18 model


In [None]:
print('3 - train model')
# No `epochs` argument is given, so the trainer uses its default (5 epochs).
trainer = Ham10000ResNet18Trainer(train_dataloader, model)

# Print wall-clock timestamps before and after training to measure its duration.
log_time('\tTraining start time:')

# Trains `model` in place and saves its weights when training finishes.
trainer.run_training()

log_time('\tTraining end time:')

3 - train model
	Training start time: 20211018 - 202332
using cpu device


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


epoch: 0; i : 0
epoch: 0; i : 1
epoch: 0; i : 2
epoch: 0; i : 3
epoch: 0; i : 4
epoch: 0; i : 5
epoch: 0; i : 6
epoch: 0; i : 7
epoch: 0; i : 8
epoch: 0; i : 9
epoch: 0; i : 10
epoch: 0; i : 11
epoch: 0; i : 12
epoch: 0; i : 13
epoch: 0; i : 14
epoch: 0; i : 15
epoch: 0; i : 16
epoch: 0; i : 17
epoch: 0; i : 18
epoch: 0; i : 19
epoch: 0; i : 20
epoch: 0; i : 21
epoch: 0; i : 22
epoch: 0; i : 23
epoch: 0; i : 24
epoch: 0; i : 25
epoch: 0; i : 26
epoch: 0; i : 27
epoch: 0; i : 28
epoch: 0; i : 29
epoch: 0; i : 30
epoch: 0; i : 31
epoch: 0; i : 32
epoch: 0; i : 33
epoch: 0; i : 34
epoch: 0; i : 35
epoch: 0; i : 36
epoch: 0; i : 37
epoch: 0; i : 38
epoch: 0; i : 39
epoch: 0; i : 40
epoch: 0; i : 41
epoch: 0; i : 42
epoch: 0; i : 43
epoch: 0; i : 44
epoch: 0; i : 45
epoch: 0; i : 46
epoch: 0; i : 47
epoch: 0; i : 48
epoch: 0; i : 49
epoch: 0; i : 50
epoch: 0; i : 51
epoch: 0; i : 52
epoch: 0; i : 53
epoch: 0; i : 54
epoch: 0; i : 55
epoch: 0; i : 56
epoch: 0; i : 57
epoch: 0; i : 58
epoch: 

És dir, amb 5 epochs, el training ha trigat unes 3h i mitja (23h 58m 28s - 20h 23m 32s).

En aquell moment, l'ordinador estava desatès, de forma que la sessió va caducar per inactivitat.

Tanmateix, el training i els dataframes estan desats al Google Drive.

Per tant, els passos següents els realitzo recuperant el training i els dataframes del Google Drive.

In [None]:
print('4 - validate model')
# Uses the `model` and `validation_dataloader` objects created by earlier
# cells, so those cells must have been run in the current session.
# NOTE(review): the text above says the trained weights are recovered from
# Google Drive after the session expired, but no loading code is visible
# here — confirm that `model` holds the trained weights before validating.
validator = Ham10000ResNet18Validator(model, validation_dataloader)
validator.run_validation()

In [None]:
print('5 - make predictions')
# Uses the `model` and `test_dataloader` objects created by earlier cells;
# the predictor classifies the first batch drawn from the dataloader.
predictor = Ham10000ResNet18Predictor(model, test_dataloader)
predictor.run_predictor()