In [None]:
'''
conda create --name project python=3.9
pip install hugsvision==0.75.5
pip install ipykernel
pip install tensorboard
conda install pytorch=2.0.0 torchvision=0.15.0 torchaudio=2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia
pip install wrapt
pip install chardet
pip install --upgrade accelerate
'''

In [None]:
# Reference recipe:
# https://github.com/qanastek/HugsVision/blob/main/recipes/kvasir_v2/binary_classification/Kvasir_v2_Image_Classifier.ipynb
import hugsvision
# Show which HugsVision release the kernel is actually using.
print(hugsvision.__version__)

In [None]:
from hugsvision.dataio.VisionDataset import VisionDataset

# Training data: with test_ratio = 0.00 every image under ./archive/train/ ends
# up in `train`. The `test` value returned here is only a placeholder; it is
# overwritten when the ./archive/test/ folder is loaded in the following cell.
# Note: hugsvision.dataio.VisionDataset.VisionDataset.splitDatasets was edited
# locally (tabulation).
train, test, id2label, label2id = VisionDataset.fromImageFolder(
    "./archive/train/",
    test_ratio=0.00,
    balanced=True,
    augmentation=True,
)

In [None]:
# Evaluation data: a test_ratio of 1.00 with balancing switched off is used so
# that all images under ./archive/test/ go to `test`, replacing the placeholder
# from the training-folder load. The remaining return values are bound to
# separate names so `train`, `id2label` and `label2id` are left untouched.
# NOTE(review): augmentation is also enabled for this evaluation split —
# confirm that is intended.
train2, test, id2label2, label2id2 = VisionDataset.fromImageFolder(
    "./archive/test/",
    test_ratio=1.00,
    balanced=False,
    augmentation=True,
)

In [None]:
from hugsvision.nnet.VisionClassifierTrainer import VisionClassifierTrainer
from transformers import ViTFeatureExtractor, ViTForImageClassification

# Fine-tune starting from a pretrained checkpoint. The active choice is the
# 'google/vit-base-patch16-224-in21k' checkpoint; un-comment the local path
# below to start from the earlier MyKvasirV2Model output instead.
#model_path = './out/MyKvasirV2Model/1_2023-02-01-13-17-42/model/'
model_path = 'google/vit-base-patch16-224-in21k'

# Classifier whose label mapping comes from the training image folder.
# ignore_mismatched_sizes lets loading proceed when checkpoint tensors do not
# match the shapes implied by this label set.
pretrained_model = ViTForImageClassification.from_pretrained(
    model_path,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

# The feature extractor is loaded from the same checkpoint as the weights.
pretrained_extractor = ViTFeatureExtractor.from_pretrained(model_path)

trainer = VisionClassifierTrainer(
    model_name="model-ds",
    train=train,
    test=test,
    output_dir="./out/",
    max_epochs=1,    # 1 default
    batch_size=32,   # 32 default
    lr=2e-5,         # 2e-5 default
    fp16=False,      # False default
    model=pretrained_model,
    feature_extractor=pretrained_extractor,
)

In [None]:
'''
hidden_size (`int`, *optional*, defaults to 768):
    Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 12):
    Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
    Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (`int`, *optional*, defaults to 3072):
    Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
    The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
    `"relu"`, `"selu"` and `"gelu_new"` are supported.
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
    The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
    The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
    The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
    The epsilon used by the layer normalization layers.
image_size (`int`, *optional*, defaults to `224`):
    The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to `16`):
    The size (resolution) of each patch.
num_channels (`int`, *optional*, defaults to `3`):
    The number of input channels.
qkv_bias (`bool`, *optional*, defaults to `True`):
    Whether to add a bias to the queries, keys and values.
encoder_stride (`int`, *optional*, defaults to 16):
    Factor to increase the spatial resolution by in the decoder head for masked image modeling.
'''
from transformers import ViTConfig

# Architecture description for a ViT classifier. Every value matches the
# library default noted beside it except num_attention_heads (4 instead of 12);
# the label mapping comes from the training image folder.
config = ViTConfig(
    # --- encoder shape ---
    hidden_size=768,                    # default 768
    num_hidden_layers=12,               # default 12
    num_attention_heads=4,              # default 12
    intermediate_size=3072,             # default 3072
    hidden_act="gelu",                  # default "gelu"
    # --- regularisation / initialisation ---
    hidden_dropout_prob=0.0,            # default 0.0
    attention_probs_dropout_prob=0.0,   # default 0.0
    initializer_range=0.02,             # default 0.02
    layer_norm_eps=1e-12,               # default 1e-12
    # --- input geometry ---
    image_size=224,                     # default 224
    patch_size=16,                      # default 16
    num_channels=3,                     # default 3
    qkv_bias=True,                      # default True
    encoder_stride=16,                  # default 16
    # --- classification head ---
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

In [None]:
from hugsvision.nnet.VisionClassifierTrainer import VisionClassifierTrainer
from transformers import ViTFeatureExtractor, ViTForImageClassification, ViTConfig

# Train a ViT classifier that is built directly from `config` (defined in the
# ViTConfig cell); no pretrained checkpoint is loaded in this cell.
# kvasir_model_path = './out/MyKvasirV2Model/1_2023-02-01-13-17-42/model/'

# Model instantiated from the configuration object alone.
scratch_model = ViTForImageClassification(config)

# Feature extractor constructed with no arguments, i.e. library defaults.
default_extractor = ViTFeatureExtractor()

trainer = VisionClassifierTrainer(
    model_name="model-s",
    train=train,
    test=test,
    output_dir="./out/",
    max_epochs=2,    # 1 default
    batch_size=32,   # 32 default
    lr=2e-4,         # 2e-5 default
    fp16=False,      # false default
    model=scratch_model,
    feature_extractor=default_extractor,
)

In [None]:
# Make a prediction.
from transformers import ViTFeatureExtractor, ViTForImageClassification
from hugsvision.inference.VisionClassifierInference import VisionClassifierInference

# Saved checkpoint directory and the single image to classify.
path = './out/model/1/model/'
img = './albatross.png'

# Both the preprocessing settings and the weights are read from `path`.
saved_extractor = ViTFeatureExtractor.from_pretrained(path)
saved_model = ViTForImageClassification.from_pretrained(path)

classifier = VisionClassifierInference(
    feature_extractor=saved_extractor,
    model=saved_model,
)

# Classify the image and report the predicted label.
label = classifier.predict(img_path=img)
print('Predicted class:', label)

In [None]:
# Run the F1 evaluation on whichever `trainer` was created last (both training
# cells assign that name, so the result depends on which one ran most recently).
# Presumably `hyp` holds the predicted labels and `ref` the reference labels —
# TODO confirm against the HugsVision trainer.
hyp, ref = trainer.evaluate_f1_score()

In [None]:
# Loop through all images and test for accuracy.
import os
from tqdm import tqdm

# Counters are reset here so re-running the cell never accumulates stale totals.
count = 0
correct_count = 0
wrong_count = 0

# Root folder with one sub-folder per class; the sub-folder name is compared
# against the predicted label, so it acts as the ground truth.
test_dir = "./archive/valid/"

# Only descend into real directories: a stray top-level file (e.g. .DS_Store,
# Thumbs.db) would otherwise make the inner os.listdir() raise
# NotADirectoryError. Sorting keeps the iteration order deterministic.
species_dirs = sorted(
    entry for entry in os.listdir(test_dir)
    if os.path.isdir(os.path.join(test_dir, entry))
)

for species in tqdm(species_dirs):
    species_path = os.path.join(test_dir, species)
    for image in sorted(os.listdir(species_path)):
        img = os.path.join(species_path, image)
        label = classifier.predict(img_path=img)
        count += 1
        if label == species:
            correct_count += 1
        else:
            wrong_count += 1

In [None]:
# Summarise the counters produced by the accuracy loop.
print(f'Number correct = {correct_count}')
print(f'Number wrong = {wrong_count}')
# `count` is still 0 when no image was scored (empty or missing class folders);
# dividing by it would raise ZeroDivisionError, so report that case explicitly.
if count:
    print(f'Accuracy = {correct_count / count}')
else:
    print('Accuracy = n/a (no images were evaluated)')