In [41]:
# Import all required libraries
import cv2
import numpy as np
import torch
import torch.nn as nn
from PIL import Image
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForMaskedLM
from typing import List, Tuple, Optional
import logging
from torchvision import transforms
from IPython.display import display, Image as IPythonImage


In [42]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [43]:
class PreprocessingModule:
    """Handles image preprocessing steps."""
    
    @staticmethod
    def preprocess_image(image: np.ndarray) -> np.ndarray:
        """
        Apply preprocessing steps to the input image.
        
        Args:
            image: Input image in BGR format
        Returns:
            Preprocessed image
        """
        try:
            # Convert to grayscale
            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            
            # Apply Gaussian Blur
            blurred = cv2.GaussianBlur(gray, (5, 5), 0)
            
            # Apply adaptive thresholding
            binary = cv2.adaptiveThreshold(
                blurred, 255, 
                cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                cv2.THRESH_BINARY_INV, 11, 2
            )
            
            # Apply morphological operations
            kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
            morphed = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel, iterations=1)
            
            return morphed
        except Exception as e:
            logger.error(f"Error in preprocessing: {str(e)}")
            raise


In [44]:
class SegmentationEngine:
    """Handles letter segmentation from the preprocessed image."""
    
    @staticmethod
    def segment_letters(preprocessed_image: np.ndarray) -> List[Tuple[np.ndarray, Tuple[int, int, int, int]]]:
        """
        Segment individual letters from the preprocessed image.
        
        Args:
            preprocessed_image: Binary preprocessed image
        Returns:
            List of tuples containing (letter_image, bounding_box)
        """
        try:
            # Find contours
            contours, _ = cv2.findContours(
                preprocessed_image,
                cv2.RETR_EXTERNAL,
                cv2.CHAIN_APPROX_SIMPLE
            )
            
            # Sort contours left to right
            contours = sorted(contours, key=lambda x: cv2.boundingRect(x)[0])
            
            letter_segments = []
            for contour in contours:
                x, y, w, h = cv2.boundingRect(contour)
                
                # Filter out noise based on size
                if 15 < w < 100 and 15 < h < 100:
                    # Extract letter and add padding
                    letter_image = preprocessed_image[y:y+h, x:x+w]
                    letter_image = cv2.copyMakeBorder(
                        letter_image, 10, 10, 10, 10,
                        cv2.BORDER_CONSTANT, value=255
                    )
                    
                    # Resize to standard size
                    letter_image = cv2.resize(letter_image, (28, 28))
                    letter_segments.append((letter_image, (x, y, w, h)))
            
            return letter_segments
        except Exception as e:
            logger.error(f"Error in segmentation: {str(e)}")
            raise

In [None]:
class CNNRecognitionModel:
    """Handles letter recognition using the CNN model."""
    
    def __init__(self, model_path: str):
        """Initialize the CNN model."""
        self.classes = [
            "А", "Б", "В", "Г", "Д", "Е", "Ж", "З", "И", "Й", "К", "Л", "М", "Н",
            "О", "П", "Р", "С", "Т", "У", "Ф", "Х", "Ц", "Ч", "Ш", "Щ", "Ъ", "Ы",
            "Ь", "Э", "Ю", "Я", "Ң", "Ү", "Ө", "Ё"
        ]
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.transform = self.get_transforms()
        self.model = self.load_model(model_path)

    def create_model(self) -> nn.Module:
        """Create the CNN model architecture exactly matching the saved weights."""
        class KyrgyzCNN(nn.Module):
            """Base CNN architecture for Kyrgyz character recognition."""
            def __init__(self, num_classes=36):
                super().__init__()
                
                # Feature extraction layers
                self.features = nn.Sequential(
                    # First layer: (batch, 3, 112, 112) -> (batch, 16, 112, 112)
                    nn.Conv2d(3, 16, kernel_size=3, padding=1),
                    nn.ReLU(inplace=True),
                    nn.MaxPool2d(2),  # -> (batch, 16, 56, 56)
                    
                    # Second layer: (batch, 16, 56, 56) -> (batch, 32, 56, 56)
                    nn.Conv2d(16, 32, kernel_size=3, padding=1),
                    nn.ReLU(inplace=True),
                    nn.MaxPool2d(2),  # -> (batch, 32, 28, 28)
                    
                    # Third layer: (batch, 32, 28, 28) -> (batch, 16, 28, 28)
                    nn.Conv2d(32, 16, kernel_size=3, padding=1),
                    nn.ReLU(inplace=True)  # Final size: (batch, 16, 28, 28)
                )
                
                # Calculate flattened size: 16 * 28 * 28 = 12544
                self.classifier = nn.Sequential(
                    nn.Linear(12544, 2048),
                    nn.ReLU(inplace=True),
                    nn.Dropout(0.5),
                    nn.Linear(2048, 512),
                    nn.ReLU(inplace=True),
                    nn.Dropout(0.5),
                    nn.Linear(512, num_classes)
                )
            
            def forward(self, x):
                # Print shapes for debugging
                shapes = {}
                shapes['input'] = x.shape
                
                # Ensure input size is correct
                if x.size()[-1] != 112 or x.size()[-2] != 112:
                    x = F.interpolate(x, size=(112, 112), mode='bilinear', align_corners=False)
                    shapes['after_resize'] = x.shape
                
                # Feature extraction
                x = self.features(x)
                shapes['after_features'] = x.shape
                
                # Flatten
                x = torch.flatten(x, 1)
                shapes['after_flatten'] = x.shape
                
                # Classification
                x = self.classifier(x)
                shapes['output'] = x.shape
                
                return x, shapes
        
        return KyrgyzCNN(num_classes=len(self.classes))
    
    def get_transforms(self):
        """Get image transformations matching the verified input size."""
        return transforms.Compose([
            transforms.Resize((112, 112)),  # Changed input size to 112x112
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])
        ])
    
    def load_model(self, model_path: str) -> nn.Module:
        """Load the trained CNN model with verified architecture."""
        try:
            model = self.create_model()
            state_dict = torch.load(model_path, map_location=self.device)
            
            # Debug information
            dummy_input = torch.randn(1, 3, 112, 112)
            features_output = model.features(dummy_input)
            flattened = features_output.view(1, -1)
            logger.info(f"Input shape: {dummy_input.shape}")
            logger.info(f"Features output shape: {features_output.shape}")
            logger.info(f"Flattened size: {flattened.shape[1]}")
            
            model.load_state_dict(state_dict)
            model = model.to(self.device)
            model.eval()
            
            return model
            
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            raise
    
    def recognize_letter(self, letter_image: np.ndarray) -> Tuple[str, float]:
        """Recognize a single letter using the verified model."""
        try:
            if letter_image.dtype != np.uint8:
                letter_image = (letter_image * 255).astype(np.uint8)
            
            pil_image = Image.fromarray(letter_image).convert('RGB')
            tensor_image = self.transform(pil_image)
            tensor_image = tensor_image.unsqueeze(0).to(self.device)
            
            with torch.no_grad():
                output = self.model(tensor_image)
                probabilities = torch.nn.functional.softmax(output, dim=1)
                confidence, predicted_idx = torch.max(probabilities, 1)
                
                predicted_letter = self.classes[predicted_idx.item()]
                confidence_score = confidence.item()
                
                return predicted_letter, confidence_score
                
        except Exception as e:
            logger.error(f"Error in letter recognition: {str(e)}")
            raise


  state_dict = torch.load(model_path, map_location=self.device)
ERROR:__main__:Error loading model: 'NoneType' object has no attribute 'seek'. You can only torch.load from a file that is seekable. Please pre-load the data into a buffer like io.BytesIO and try to load from it instead.



Verifying network dimensions...


AttributeError: 'NoneType' object has no attribute 'seek'. You can only torch.load from a file that is seekable. Please pre-load the data into a buffer like io.BytesIO and try to load from it instead.

In [46]:
# Debug function to inspect model state
def inspect_model_state(model_path):
    """
    Utility function to inspect the saved model state.
    """
    state_dict = torch.load(model_path, map_location='cpu')
    print("Type of loaded file:", type(state_dict))
    
    if isinstance(state_dict, dict):
        print("\nKeys in the loaded file:")
        for key in state_dict.keys():
            print(f"- {key}")
            
        if 'state_dict' in state_dict:
            print("\nKeys in state_dict:")
            for key in state_dict['state_dict'].keys():
                print(f"- {key}")

In [47]:
# Debug utility to print model architecture
def print_model_structure(model_path: str):
    """Print the structure of the saved model."""
    state_dict = torch.load(model_path, map_location='cpu')
    print("\nModel state dict structure:")
    for key, value in state_dict.items():
        print(f"{key}: {value.shape}")

In [48]:
print_model_structure("../results/final_model.pth")


Model state dict structure:
features.0.weight: torch.Size([16, 3, 3, 3])
features.0.bias: torch.Size([16])
features.3.weight: torch.Size([32, 16, 3, 3])
features.3.bias: torch.Size([32])
features.6.weight: torch.Size([64, 32, 3, 3])
features.6.bias: torch.Size([64])
classifier.0.weight: torch.Size([2048, 12544])
classifier.0.bias: torch.Size([2048])
classifier.3.weight: torch.Size([512, 2048])
classifier.3.bias: torch.Size([512])
classifier.6.weight: torch.Size([36, 512])
classifier.6.bias: torch.Size([36])


  state_dict = torch.load(model_path, map_location='cpu')


In [49]:
class LLMCorrectionModel:
    """Handles word correction using a Language Model."""
    
    def __init__(self, model_name: str = "bert-base-multilingual-cased"):
        """
        Initialize the LLM correction model.
        
        Args:
            model_name: Name of the pretrained model to use
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        self.model.eval()
    
    def correct_word(self, predicted_word: str) -> str:
        """
        Correct the predicted word using the language model.
        
        Args:
            predicted_word: Initial word prediction
        Returns:
            Corrected word
        """
        try:
            # For simplicity, we'll just return the word for now
            # In a real implementation, you would:
            # 1. Use the LLM to validate the word
            # 2. Get potential corrections
            # 3. Return the most likely correction
            return predicted_word
        except Exception as e:
            logger.error(f"Error in word correction: {str(e)}")
            raise

In [52]:
class KyrgyzWordRecognizer:
    """Main class that orchestrates the entire recognition process."""
    
    def __init__(self, cnn_model_path: str):
        """
        Initialize the word recognizer with all necessary components.
        
        Args:
            cnn_model_path: Path to the trained CNN model
        """
        self.preprocessor = PreprocessingModule()
        self.segmenter = SegmentationEngine()
        self.recognizer = CNNRecognitionModel(cnn_model_path)
        self.corrector = LLMCorrectionModel()
    
    def recognize_word(self, image_path: str, visualize: bool = True) -> str:
        """
        Recognize a word from an image.
        
        Args:
            image_path: Path to the input image
            visualize: Whether to show visualization of the process
        Returns:
            Recognized word
        """
        try:
            # Load image
            image = cv2.imread(image_path)
            if image is None:
                raise ValueError("Could not load image")
            
            # Preprocess
            preprocessed = self.preprocessor.preprocess_image(image)
            
            # Segment letters
            letter_segments = self.segmenter.segment_letters(preprocessed)
            
            # Recognize letters
            predicted_word = ""
            confidences = []
            letter_results = []
            
            for letter_image, bbox in letter_segments:
                letter, confidence = self.recognizer.recognize_letter(letter_image)
                predicted_word += letter
                confidences.append(confidence)
                letter_results.append((letter_image, letter, confidence, bbox))
            
            # Correct word using LLM
            corrected_word = self.corrector.correct_word(predicted_word)
            
            # Visualize if requested
            if visualize:
                self._visualize_results(
                    image, letter_results, predicted_word, corrected_word
                )
            
            return corrected_word
            
        except Exception as e:
            logger.error(f"Error in word recognition: {str(e)}")
            raise
    
    def _visualize_results(
        self,
        original_image: np.ndarray,
        letter_results: List[Tuple],
        predicted_word: str,
        corrected_word: str
    ) -> None:
        """Visualize the recognition results."""
        # Create a copy for visualization
        viz_image = original_image.copy()
        
        # Draw bounding boxes and predictions
        for _, letter, confidence, (x, y, w, h) in letter_results:
            cv2.rectangle(viz_image, (x, y), (x + w, y + h), (0, 255, 0), 2)
            cv2.putText(
                viz_image,
                f"{letter} ({confidence:.2f})",
                (x, y - 10),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (0, 255, 0),
                1
            )
        
        # Display results
        plt.figure(figsize=(15, 5))
        
        # Original image with annotations
        plt.subplot(1, 2, 1)
        plt.imshow(cv2.cvtColor(viz_image, cv2.COLOR_BGR2RGB))
        plt.title("Detected Letters")
        plt.axis('off')
        
        # Individual letters
        plt.subplot(1, 2, 2)
        plt.text(0.5, 0.7, f"Predicted: {predicted_word}", 
                ha='center', va='center', fontsize=12)
        plt.text(0.5, 0.3, f"Corrected: {corrected_word}",
                ha='center', va='center', fontsize=12)
        plt.axis('off')
        
        plt.tight_layout()
        plt.show()

In [70]:
recognizer = KyrgyzWordRecognizer("../results/final_model.pth")
result = recognizer.recognize_word("../data/raw/cyrilic_words/combined_word.png")
print(f"Recognition result: {result}")

  state_dict = torch.load(model_path, map_location=self.device)
INFO:__main__:Input shape: torch.Size([1, 3, 112, 112])
INFO:__main__:Features output shape: torch.Size([1, 16, 28, 28])
INFO:__main__:Flattened size: 12544
ERROR:__main__:Error loading model: Error(s) in loading state_dict for KyrgyzCNN:
	size mismatch for features.6.weight: copying a param with shape torch.Size([64, 32, 3, 3]) from checkpoint, the shape in current model is torch.Size([16, 32, 3, 3]).
	size mismatch for features.6.bias: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([16]).


RuntimeError: Error(s) in loading state_dict for KyrgyzCNN:
	size mismatch for features.6.weight: copying a param with shape torch.Size([64, 32, 3, 3]) from checkpoint, the shape in current model is torch.Size([16, 32, 3, 3]).
	size mismatch for features.6.bias: copying a param with shape torch.Size([64]) from checkpoint, the shape in current model is torch.Size([16]).