In [1]:
# The MIT License (MIT)
#
# Copyright (c) 2026 Alexander Zhura
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

In [2]:
import cv2 as cv
import random
import torch

from pathlib import Path
from typing import Tuple
from PIL import Image
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import v2

In [3]:
# Alphabet the classifier predicts: the 33 uppercase Russian letters, with 'Ё' first.
# A letter's position in this string is its class id.
MODEL_CLASSES = 'ЁАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
MODEL_N_CHANNELS = 3 # channels fed to the model; MODEL_TRANSFORM emits a grayscale image with this many channels
MODEL_N_CLASSES = len(MODEL_CLASSES)
MODEL_HIDDEN_UNITS = 96
# Checkpoint path; in the code shown here it is only referenced by the
# commented-out torch.save call in main().
MODEL_FILEPATH = Path('TinyVGG_P20_H96_64x64.pth')
MODEL_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Dataset root; every directory below it is treated as one class, named after its letter.
# NOTE(review): the path says 'test', yet dataset_train_test_split() draws both the
# train and the test split from it by default - confirm this is intended.
MODEL_DATASET_PATH = Path('dataset/test/')

DATALOADER_BATCH_SIZE = 32
# Fraction of each class's images that goes to the training split; the rest is the test split.
DATALOADER_TRAIN_SIZE = 0.20

# Extra pixels kept around each letter's bounding box when cropping it from the photo.
TRANSFORM_MARGIN_SIZE = 32
# Size every image is resized to; must match the input size the TinyVGG classifier layer is built for (64x64).
TRANSFORM_DATA_SIZE = (64, 64)

# Preprocessing shared by the dataset images and the letters cropped from the photo.
MODEL_TRANSFORM = v2.Compose([
	v2.Grayscale(num_output_channels=MODEL_N_CHANNELS),
	v2.Resize(size=TRANSFORM_DATA_SIZE),
	v2.ToImage(),
	v2.ToDtype(torch.float32, scale=True)
])

# Photo to recognise and the letters it is expected to contain. The letters are
# compared position by position with the crops, which are ordered left to right.
IMAGE_FILEPATH = Path('IMG.JPG')
IMAGE_CLASSES = 'АВГКМНЕ'
IMAGE_N_CLASSES = len(IMAGE_CLASSES)

# Seed torch's RNGs (CPU, plus CUDA when it is the selected device).
# Python's `random` module is seeded separately inside dataset_train_test_split().
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
if MODEL_DEVICE == 'cuda':
	torch.cuda.manual_seed(RANDOM_SEED)

In [4]:
def walk_through_dataset(top):
	result = dict()
	for dirpath, _, filenames in top.walk():
		if dirpath.stem != top.stem:
			info = {'filepath': dirpath, 'n_images': len(filenames)}
			result[dirpath.stem] = info
	return result

def dataset_train_test_split(dataset_paths=MODEL_DATASET_PATH, train_size=DATALOADER_TRAIN_SIZE, random_seed=RANDOM_SEED):
	"""
	Load every class directory of the dataset and split it into train and test parts.

	Each class directory is split independently, so every class contributes
	the same fraction of its images to the training set.

	Args:
		dataset_paths: dataset root; each directory below it holds the
			'*.jpeg' images of the class it is named after.
		train_size: fraction (0..1) of each class's images used for training.
		random_seed: seed for the per-class shuffle.

	Returns:
		Tuple (images_train_data, images_test_data, images_train_target,
		images_test_target): two lists of PIL images followed by the two
		matching lists of integer class ids.

	Raises:
		KeyError: if a directory name is not a letter of MODEL_CLASSES.
	"""
	dataset_walk_info = walk_through_dataset(dataset_paths)
	model_class_to_id = dict(zip(MODEL_CLASSES, range(MODEL_N_CLASSES)))

	images_train_target, images_train_data = [], []
	images_test_target, images_test_data = [], []

	for image_class, image_dataset in dataset_walk_info.items():
		image_target = model_class_to_id[image_class]

		# glob() order depends on the filesystem; sort first so that the seeded
		# shuffle yields the same split on every machine.
		image_paths = sorted(image_dataset['filepath'].glob('*.jpeg'))
		random.seed(random_seed)
		random.shuffle(image_paths)

		# Size the split from the images actually loaded, not from the raw file
		# count of the directory (which may include non-image files).
		n_train_images = int(train_size*len(image_paths))

		for n_image, filepath in enumerate(image_paths):
			# copy() forces the pixel data to load so the file can be closed right away.
			with Image.open(filepath) as image_handle:
				image = image_handle.copy()

			# Strict '<': exactly n_train_images images (indices 0..n-1) go to train.
			if n_image < n_train_images:
				images_train_target.append(image_target)
				images_train_data.append(image)
			else:
				images_test_target.append(image_target)
				images_test_data.append(image)

	return images_train_data, images_test_data, images_train_target, images_test_target

In [5]:
class ImageDataset(Dataset):
	"""
	Map-style dataset over in-memory samples and their integer targets.

	Args:
		data: indexable collection of samples (PIL images in this notebook).
		target: indexable collection of class ids, aligned with `data`.
		transform: optional callable applied to a sample when it is fetched;
			when None the sample is returned unchanged.
	"""
	def __init__(self, data, target, transform=None) -> None:
		self.data = data
		self.target = target
		self.transform = transform

	def __len__(self) -> int:
		"""Number of samples, taken from the target collection."""
		return len(self.target)

	def __getitem__(self, index: int) -> Tuple[torch.Tensor, int]:
		"""Return the (optionally transformed) sample and its target at `index`."""
		data_id = self.data[index]
		# transform defaults to None, so it must not be called unconditionally.
		if self.transform is not None:
			data_id = self.transform(data_id)
		target_id = self.target[index]

		return data_id, target_id

In [6]:
def read_image_and_extract_letters(filepath=IMAGE_FILEPATH, margin=TRANSFORM_MARGIN_SIZE, min_size=80):
	"""
	Cut the individual letters out of a photo.

	The photo is rotated 90 degrees counterclockwise, binarised (letters become
	white on black) and morphologically closed; each sufficiently large outer
	contour is then cropped with a margin and inverted back to black on white.

	Args:
		filepath: path of the photo to read.
		margin: pixels kept around each letter's bounding box; clipped at the
			image border.
		min_size: contours narrower or lower than this many pixels are ignored.

	Returns:
		List of single-channel PIL images, ordered left to right by the x
		coordinate of their bounding box.

	Raises:
		FileNotFoundError: if the photo cannot be read.
	"""
	# str(): not every OpenCV build accepts pathlib.Path objects.
	image = cv.imread(str(filepath))
	if image is None:
		# imread signals failure by returning None instead of raising.
		raise FileNotFoundError(f"Could not read image: {filepath}")

	image = cv.rotate(image, cv.ROTATE_90_COUNTERCLOCKWISE)
	image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
	image = cv.medianBlur(image, 3)
	image = cv.adaptiveThreshold(image, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY_INV, 31, 11)

	# Closing joins the strokes of one letter into a single blob.
	kernel = cv.getStructuringElement(cv.MORPH_ELLIPSE, (5, 5))
	image = cv.morphologyEx(image, cv.MORPH_CLOSE, kernel, iterations=3)

	image_contours, _ = cv.findContours(image, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
	# Compute each bounding box once and order the boxes left to right.
	image_boxes = sorted((cv.boundingRect(image_contour) for image_contour in image_contours), key=lambda image_box: image_box[0])

	image_letters = []
	for x, y, width, height in image_boxes:
		if width < min_size or height < min_size:
			continue

		# Clamp at 0: a negative start index would wrap around to the far edge
		# and produce an empty or wrong crop. Overshooting the end is safe.
		top, left = max(y-margin, 0), max(x-margin, 0)
		image_letter = image[top:y+height+margin, left:x+width+margin].copy()
		image_letter = cv.bitwise_not(image_letter)
		image_letters.append(Image.fromarray(image_letter))

	return image_letters

In [7]:
class TinyVGG(nn.Module):
	"""
	Model architecture copying TinyVGG from: 
	https://poloclub.github.io/cnn-explainer/

	Two blocks of (Conv2d -> ReLU -> Conv2d -> ReLU -> MaxPool2d) followed by a
	single linear classifier.

	Args:
		in_channels: number of channels of the input images.
		hidden_units: number of feature maps of every convolution.
		out_features: number of classes (size of the output logits).
		input_size: (height, width) of the input images; it determines the
			size of the classifier layer. Defaults to 64x64.
	"""
	def __init__(self, in_channels: int, hidden_units: int, out_features: int, input_size: Tuple[int, int] = (64, 64)) -> None:
		super().__init__()
		height, width = input_size
		# The padded 3x3 convolutions keep the spatial size; each of the two
		# MaxPool2d(2) layers halves it (rounding down), i.e. size // 4 overall.
		flat_features = hidden_units*(height//4)*(width//4)

		# The layer order inside the Sequential is part of the checkpoint
		# format (state_dict keys are 'tinymodel.<index>.*') - keep it stable.
		self.tinymodel = nn.Sequential(
			# Conv2d Block 1
			nn.Conv2d(in_channels, hidden_units, kernel_size=3, padding=1),
			nn.ReLU(),
			nn.Conv2d(hidden_units, hidden_units, kernel_size=3, padding=1),
			nn.ReLU(),
			nn.MaxPool2d(2),

			# Conv2d Block 2
			nn.Conv2d(hidden_units, hidden_units, kernel_size=3, padding=1),
			nn.ReLU(),
			nn.Conv2d(hidden_units, hidden_units, kernel_size=3, padding=1),
			nn.ReLU(),
			nn.MaxPool2d(2),

			# Classifier Block
			nn.Flatten(),
			nn.Linear(flat_features, out_features)
		)

	def forward(self, x: torch.Tensor):
		"""Return the raw class logits for a batch of images."""
		return self.tinymodel(x)

In [8]:
def train_step(model: torch.nn.Module,
	dataloader: torch.utils.data.DataLoader,
	model_loss: torch.nn.Module,
	model_optimizer: torch.optim.Optimizer):
	"""
	Train the model for one pass over the dataloader.

	Args:
		model: network to optimise; it is switched to training mode.
		dataloader: yields (inputs, integer targets) batches.
		model_loss: loss applied to (logits, targets).
		model_optimizer: optimizer stepped once per batch.

	Returns:
		Tuple (train_loss, train_acc): the loss averaged over the batches and
		the accuracy in percent over all samples seen. Both are 0.0 when the
		dataloader yields nothing.
	"""
	model.train()
	train_loss = 0.0
	n_batches, n_samples, n_correct = 0, 0, 0

	for X, y in dataloader:
		X, y = X.to(MODEL_DEVICE), y.to(MODEL_DEVICE)

		y_pred = model(X)
		loss = model_loss(y_pred, y)

		train_loss += loss.item()
		# argmax of the logits equals argmax of their softmax, so the softmax is skipped.
		y_pred_class = torch.argmax(y_pred, dim=1)
		# Count hits and samples instead of averaging per-batch percentages,
		# which would over-weight a smaller last batch.
		n_correct += (y_pred_class == y).sum().item()
		n_samples += len(y)
		n_batches += 1

		model_optimizer.zero_grad()
		loss.backward()
		model_optimizer.step()

	# An empty dataloader must not end in a division by zero.
	if n_batches == 0 or n_samples == 0:
		return 0.0, 0.0

	train_loss /= n_batches
	train_acc = 100*n_correct/n_samples

	return train_loss, train_acc

def test_step(model: torch.nn.Module,
	dataloader: torch.utils.data.DataLoader,
	model_loss: torch.nn.Module,
	model_optimizer=None):
	"""
	Evaluate the model on one pass over the dataloader without updating it.

	Args:
		model: network to evaluate; it is switched to evaluation mode.
		dataloader: yields (inputs, integer targets) batches.
		model_loss: loss applied to (logits, targets).
		model_optimizer: unused; accepted only so the call mirrors train_step.

	Returns:
		Tuple (test_loss, test_acc): the loss averaged over the batches and
		the accuracy in percent over all samples seen. Both are 0.0 when the
		dataloader yields nothing.
	"""
	model.eval()
	test_loss = 0.0
	n_batches, n_samples, n_correct = 0, 0, 0

	with torch.inference_mode():
		for X, y in dataloader:
			X, y = X.to(MODEL_DEVICE), y.to(MODEL_DEVICE)

			y_pred = model(X)
			loss = model_loss(y_pred, y)

			test_loss += loss.item()
			# argmax of the logits equals argmax of their softmax, so the softmax is skipped.
			y_pred_class = torch.argmax(y_pred, dim=1)
			# Count hits and samples instead of averaging per-batch percentages,
			# which would over-weight a smaller last batch.
			n_correct += (y_pred_class == y).sum().item()
			n_samples += len(y)
			n_batches += 1

	# An empty dataloader must not end in a division by zero.
	if n_batches == 0 or n_samples == 0:
		return 0.0, 0.0

	test_loss /= n_batches
	test_acc = 100*n_correct/n_samples

	return test_loss, test_acc

In [9]:
def predict_image_letters(model, transform, letters, image_classes, model_classes):
	"""
	Classify the extracted letters and count how many match the expectation.

	Every letter image is transformed, passed through the model as a batch of
	one, and the predicted class id is looked up in `model_classes`. Each
	"expected = predicted" pair is printed on one shared line.

	Args:
		model: classifier returning one row of logits per image.
		transform: callable turning a letter image into a model input tensor.
		letters: letter images, in the same order as `image_classes`.
		image_classes: expected letter for every position in `letters`.
		model_classes: sequence mapping a class id to its letter.

	Returns:
		Number of letters whose prediction equals the expected letter.
	"""
	n_correct = 0
	model.eval()
	with torch.inference_mode():
		for position, letter_image in enumerate(letters):
			# Add the batch dimension the model expects and move to the model's device.
			letter_batch = transform(letter_image).unsqueeze(dim=0).to(MODEL_DEVICE)
			letter_logits = model(letter_batch)

			predicted_id = letter_logits.softmax(dim=1).argmax(dim=1)
			predicted_letter = model_classes[predicted_id]

			expected_letter = image_classes[position]
			print(expected_letter, '=', predicted_letter, end=', ')
			n_correct += expected_letter == predicted_letter
	print()
	return n_correct

In [10]:
def main(max_epochs=None):
	"""
	Train TinyVGG until it recognises every letter of the target photo.

	The dataset is split into train and test parts, a fresh model is created,
	and train/test epochs are run until all letters extracted from
	IMAGE_FILEPATH are predicted as IMAGE_CLASSES. Per-epoch metrics are printed.

	Args:
		max_epochs: optional upper bound on the number of epochs. None (the
			default) trains until every letter is recognised, however long
			that takes.

	Returns:
		The trained model.

	Raises:
		ValueError: if the number of letters extracted from the photo differs
			from IMAGE_N_CLASSES.
	"""
	# Init Given Image Letters
	letters = read_image_and_extract_letters(IMAGE_FILEPATH)
	# Check this before the expensive dataset load: with too few letters the
	# stop condition below could never be met, with too many the comparison
	# against IMAGE_CLASSES would run out of range.
	if len(letters) != IMAGE_N_CLASSES:
		raise ValueError(f"Expected {IMAGE_N_CLASSES} letters in {IMAGE_FILEPATH}, but extracted {len(letters)}")

	# Train Test Split Raw Data
	images_train_data, images_test_data, images_train_target, images_test_target = dataset_train_test_split()

	# Init Custom Train Test Pytorch Datasets
	image_train_dataset = ImageDataset(images_train_data, images_train_target, MODEL_TRANSFORM)
	image_test_dataset = ImageDataset(images_test_data, images_test_target, MODEL_TRANSFORM)

	# Init Custom Train Test Pytorch Dataloaders
	image_train_dataloader = DataLoader(dataset=image_train_dataset, batch_size=DATALOADER_BATCH_SIZE, shuffle=True)
	image_test_dataloader = DataLoader(dataset=image_test_dataset, batch_size=DATALOADER_BATCH_SIZE)

	# Init Custom TinyVGG Multiclass CNN
	model = TinyVGG(in_channels=MODEL_N_CHANNELS, hidden_units=MODEL_HIDDEN_UNITS, out_features=MODEL_N_CLASSES).to(MODEL_DEVICE)
	model_loss = nn.CrossEntropyLoss()
	model_optimizer = torch.optim.AdamW(params=model.parameters(), lr=1e-3, weight_decay=1e-4)

	# Run Train Test Loop
	epoch, result_acc = 0, 0
	while result_acc != IMAGE_N_CLASSES and (max_epochs is None or epoch < max_epochs):
		train_loss, train_acc = train_step(model, image_train_dataloader, model_loss, model_optimizer)
		test_loss, test_acc = test_step(model, image_test_dataloader, model_loss, model_optimizer)
		result_acc = predict_image_letters(model, MODEL_TRANSFORM, letters, IMAGE_CLASSES, MODEL_CLASSES)
		print(f"Epoch: {epoch:>2} | Train loss: {train_loss:.5f} | Train accuracy: {train_acc:.2f}%", end=' | ')
		print(f"Test loss: {test_loss:.5f} | Test accuracy: {test_acc:.2f}% | Result accuaracy {result_acc} / {IMAGE_N_CLASSES}")
		epoch += 1

	# Keep Model State
	# torch.save(obj=model.state_dict(), f=MODEL_FILEPATH)
	return model

In [None]:
# Run the whole training loop when this code is executed directly rather than
# imported. The model returned by main() is not kept here.
if __name__ == '__main__':
	main()