In [1]:
import os
import torch
import torchvision
import numpy as np
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt


In [6]:
# ----------------------------------------
# Step 1: Data Loading and Preprocessing
# ----------------------------------------

# Kyrgyz Letters Paths
kyrgyz_train_path = '../data/raw/handwritten_kyrgyz_letters/train'
kyrgyz_test_path = '../data/raw/handwritten_kyrgyz_letters/test'


In [3]:
# Cyrillic Words Dataset
class CyrillicWordsDataset(Dataset):
    def __init__(self, csv_file, root_dir, transform=None):
        self.labels = pd.read_csv(csv_file, sep='\t', header=None, names=["image", "label"])
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir, self.labels.iloc[idx, 0])
        image = Image.open(img_name).convert("RGB")
        label = self.labels.iloc[idx, 1]

        if self.transform:
            image = self.transform(image)

        return image, label

In [4]:
# Transformations applied to both datasets (Normalize pixel values between -1 and 1)
trans = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
])

In [8]:
# Kyrgyz Dataset Loaders
kyrgyz_train_data = torchvision.datasets.ImageFolder(root=kyrgyz_train_path, transform=trans)
kyrgyz_test_data = torchvision.datasets.ImageFolder(root=kyrgyz_test_path, transform=trans)

# Cyrillic Dataset Loaders
cyrillic_train_data = CyrillicWordsDataset(csv_file='../data/raw/cyrilic_words/train.tsv', root_dir='../data/raw/cyrilic_words/train', transform=trans)
cyrillic_test_data = CyrillicWordsDataset(csv_file='../data/raw/cyrilic_words/test.tsv', root_dir='../data/raw/cyrilic_words/test', transform=trans)

# Combine both datasets
combined_train_data = torch.utils.data.ConcatDataset([kyrgyz_train_data, cyrillic_train_data])
combined_test_data = torch.utils.data.ConcatDataset([kyrgyz_test_data, cyrillic_test_data])

# DataLoaders
batch_size = 64
train_dataloader = DataLoader(combined_train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(combined_test_data, batch_size=batch_size, shuffle=False)

# Check device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')


Using cpu device


In [9]:
# ----------------------------------------
# Step 2: Model Definition (CNN)
# ----------------------------------------

# Define a Convolutional Neural Network model for both Kyrgyz letters and Cyrillic words
class KyrgyzCyrillicNet(nn.Module):
    def __init__(self):
        super(KyrgyzCyrillicNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3),  # Conv Layer 1
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # Pooling Layer 1
            nn.Conv2d(16, 32, kernel_size=3),  # Conv Layer 2
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # Pooling Layer 2
            nn.Conv2d(32, 64, kernel_size=3),  # Conv Layer 3
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)  # Pooling Layer 3
        )
        self.flatten = nn.Flatten()  # Flatten the output for the fully connected layers

        # Fully connected layers (final classification)
        self.classifier = nn.Sequential(
            nn.Linear(64*15*15, 2048),  # Dense Layer 1
            nn.ReLU(),
            nn.Dropout(p=0.5),  # Dropout for regularization
            nn.Linear(2048, 512),  # Dense Layer 2
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(512, 1036)  # Output layer (1036 classes: 36 Kyrgyz letters + ~1000 Cyrillic words)
        )

    def forward(self, x):
        x = self.features(x)  # Pass through convolutional layers
        x = self.flatten(x)  # Flatten output
        logits = self.classifier(x)  # Pass through fully connected layers
        return logits

In [10]:
# Initialize the model and move to the correct device (CPU/GPU)
model = KyrgyzCyrillicNet().to(device)
print(model)

KyrgyzCyrillicNet(
  (features): Sequential(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (classifier): Sequential(
    (0): Linear(in_features=14400, out_features=2048, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=2048, out_features=512, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=512, out_features=1036, bias=True)
  )
)


In [25]:
def train(dataloader, model, loss_fn, optimizer):
    """Train the model for one epoch.
    
    Args:
        dataloader (DataLoader): DataLoader for training data.
        model (nn.Module): The neural network model.
        loss_fn (nn.Module): The loss function.
        optimizer (torch.optim.Optimizer): The optimizer used for training.

    Returns:
        tuple: Average loss and top-1 accuracy for the epoch.
    """
    model.train()  # Set the model to training mode
    total_loss = 0.0
    top1_acc = 0.0
    size = len(dataloader.dataset)

    # Iterate over batches of data
    for batch_idx, (imgs, labels) in enumerate(dataloader):
        imgs, labels = imgs.to(device), labels.to(device)

        # Forward pass: compute model predictions
        pred = model(imgs)

        # Compute loss
        loss = loss_fn(pred, labels)

        # Backward pass: compute gradients and update parameters
        optimizer.zero_grad()  # Clear previous gradients
        loss.backward()  # Compute gradients
        optimizer.step()  # Update model parameters

        # Accumulate loss and calculate top-1 accuracy
        total_loss += loss.item()
        predicted_1 = pred.argmax(1)
        top1_acc += (predicted_1 == labels).float().sum().item()

        # Log progress every 100 batches
        if batch_idx % 100 == 0:
            current = batch_idx * len(imgs)
            print(f"[{current:>5d}/{size:>5d}]")

    return total_loss / size, top1_acc / size


def test(dataloader, model, loss_fn):
    """Evaluate the model on validation or test data.
    
    Args:
        dataloader (DataLoader): DataLoader for validation or test data.
        model (nn.Module): The neural network model.
        loss_fn (nn.Module): The loss function.

    Returns:
        tuple: Average loss and top-1 accuracy for the validation/test set.
    """
    model.eval()  # Set the model to evaluation mode
    size = len(dataloader.dataset)
    total_loss = 0.0
    top1_acc = 0.0

    # No gradient computation needed during evaluation
    with torch.no_grad():
        for imgs, labels in dataloader:
            imgs, labels = imgs.to(device), labels.to(device)

            # Forward pass: compute model predictions
            pred = model(imgs)

            # Compute loss
            loss = loss_fn(pred, labels)

            # Accumulate loss and calculate top-1 accuracy
            total_loss += loss.item()
            predicted_1 = pred.argmax(1)
            top1_acc += (predicted_1 == labels).float().sum().item()

    return total_loss / size, top1_acc / size


In [13]:
# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Create a results folder if it doesn't exist
os.makedirs('../results', exist_ok=True)

In [15]:
# ----------------------------------------
# Step 4: Hyperparameters and Training Loop
# ----------------------------------------

# Define the loss function
# CrossEntropyLoss is used here as it's suitable for classification problems.
# It combines LogSoftmax and Negative Log Likelihood Loss, making it ideal for multi-class classification.
loss_fn = nn.CrossEntropyLoss()

# Define the optimizer
# The Adam optimizer is used because it combines the advantages of two popular optimizers:
# AdaGrad (good at handling sparse gradients) and RMSProp (good at handling non-stationary objectives).
# It's adaptive and works well with a large variety of deep learning tasks.
# The learning rate (lr) is initially set to 1e-3, which is a common starting value that often works well.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [16]:
# Learning rate scheduler
# The ReduceLROnPlateau scheduler reduces the learning rate when the validation loss plateaus.
# mode='min' means that we want the validation loss to minimize.
# patience=3 means it will wait for 3 epochs without improvement before reducing the learning rate.
# factor=0.5 reduces the learning rate by half.
# verbose=True prints a message whenever the learning rate is reduced.
# This helps in fine-tuning the model as it converges, allowing it to take smaller steps to refine training.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.5, verbose=True)

In [17]:
# Early stopping configuration
# Early stopping helps prevent overfitting by halting training once there is no improvement in validation loss.
# patience=5 means training will stop if there is no improvement for 5 consecutive epochs.
patience = 5

In [18]:
# Initialize the best validation loss to infinity
# This means that initially any computed validation loss will be lower than this value.
best_val_loss = float('inf')

In [26]:
# Counter for how many epochs in a row validation loss has not improved.
# If it reaches the value of 'patience', training will stop.
early_stopping_counter = 0

# Target accuracy for stopping training early if reached
target_accuracy = 0.95  # Stop training if validation accuracy reaches or exceeds 95%


In [20]:
# Dictionary to track results of training and validation across epochs
# This will store the losses and accuracies for later analysis.
res = {
    "train_loss": [],       # To store training loss for each epoch
    "train_top1_acc": [],   # To store training top-1 accuracy for each epoch
    "val_loss": [],         # To store validation loss for each epoch
    "val_top1_acc": [],     # To store validation top-1 accuracy for each epoch
}

In [22]:
# Set the number of epochs
# The model will train for a maximum of 50 epochs, but early stopping may stop it sooner.
# The number of epochs can vary depending on the size and complexity of the dataset and model.
epochs = 50


In [23]:
# Path to save the best model
# This is used to store the model with the lowest validation loss during training.
best_model_path = '../results/models/best_model.pth'


In [27]:
# Training loop
for t in range(epochs):
    print(f"Epoch {t+1}/{epochs}\n{'-'*40}")
    
    # Train phase: Train the model using the training dataset
    train_loss, train_top1 = train(train_dataloader, model, loss_fn, optimizer)
    
    # Validation phase: Evaluate the model using the validation dataset
    val_loss, val_top1 = test(test_dataloader, model, loss_fn)
    
    # Save the results for the current epoch
    res['train_loss'].append(train_loss)
    res['train_top1_acc'].append(train_top1)
    res['val_loss'].append(val_loss)
    res['val_top1_acc'].append(val_top1)

    # Print epoch summary: Show the training and validation loss and accuracy
    print(f"Train Loss: {train_loss:.4f}, Train Top-1 Accuracy: {train_top1:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Top-1 Accuracy: {val_top1:.4f}")
    
    # Learning rate scheduler step
    scheduler.step(val_loss)
    
    # Early stopping and model checkpointing
    if val_loss < best_val_loss:
        print(f"Validation loss improved from {best_val_loss:.4f} to {val_loss:.4f}. Saving model.")
        best_val_loss = val_loss
        early_stopping_counter = 0
        
        # Save the model as the best model so far
        torch.save(model.state_dict(), best_model_path)
    else:
        early_stopping_counter += 1
        print(f"No improvement in validation loss for {early_stopping_counter} consecutive epochs.")
    
    # Check if early stopping is needed based on patience
    if early_stopping_counter >= patience:
        print("Early stopping triggered due to no improvement. Stopping training.")
        break

    # Check if the target accuracy is reached to stop training
    if val_top1 >= target_accuracy:
        print(f"Target accuracy of {target_accuracy*100:.2f}% reached. Stopping training.")
        break

# Print final training status
print('Training Done!')

Epoch 1/50
----------------------------------------


RuntimeError: stack expects each tensor to be equal size, but got [3, 72, 202] at entry 0 and [3, 54, 162] at entry 1