In [1]:
import os
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [5]:
# Paths to Kyrgyz letters and Russian words.
# Every location below is a relative path (each starts with '../'), so it is
# resolved against the working directory of the notebook kernel.

# Kyrgyz letters: root directories handed to load_kyrgyz_letters, which uses
# the name of each sub-directory as the class label.
kyrgyz_train_dir = '../data/raw/handwritten_kyrgyz_letters/train'
kyrgyz_test_dir = '../data/raw/handwritten_kyrgyz_letters/test'
# Russian words: each split has a tab-separated index file (filename, word)
# plus the directory that holds the image files named in that index.
# NOTE(review): the raw folder is spelled 'cyrilic_words' here, while the
# processed output paths used when saving are spelled 'cyrillic_words' --
# confirm which spelling exists on disk.
russian_train_tsv = '../data/raw/cyrilic_words/train.tsv'
russian_train_dir = '../data/raw/cyrilic_words/train'
russian_test_tsv = '../data/raw/cyrilic_words/test.tsv'
russian_test_dir = '../data/raw/cyrilic_words/test'


In [6]:
# Function to load images and labels from Kyrgyz letters dataset
def load_kyrgyz_letters(data_dir, image_size=(32, 32)):
    """Load a folder-per-letter image dataset into NumPy arrays.

    Every sub-directory of ``data_dir`` is treated as one class: its name is
    used as the label of each file found directly inside it. Each image is
    converted to 8-bit grayscale and resized to ``image_size``.

    Args:
        data_dir: Directory whose sub-directories each hold the images of one
            letter. Entries that are not directories are ignored.
        image_size: Target ``(width, height)`` handed to ``Image.resize``.
            Defaults to ``(32, 32)``, the size used throughout this notebook.

    Returns:
        Tuple ``(images, labels)``. ``images`` has shape
        ``(n, height, width)`` and ``labels`` has shape ``(n,)`` holding the
        folder names. When nothing could be loaded, ``images`` is an empty
        array of shape ``(0, height, width)`` so downstream shape handling
        still works.

    Files that cannot be opened or decoded are reported with ``print`` and
    skipped; a single bad file does not abort the load.
    """
    images = []
    labels = []

    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of the returned samples reproducible from run to run.
    for letter_folder in sorted(os.listdir(data_dir)):
        letter_path = os.path.join(data_dir, letter_folder)
        if not os.path.isdir(letter_path):
            continue

        for img_name in sorted(os.listdir(letter_path)):
            img_path = os.path.join(letter_path, img_name)
            try:
                # The context manager guarantees the file handle is released
                # even when decoding fails part-way through.
                with Image.open(img_path) as img:
                    pixels = np.array(img.convert('L').resize(image_size))
            except Exception as e:
                print(f"Error loading image {img_path}: {e}")
                continue
            images.append(pixels)
            labels.append(letter_folder)  # folder name is the label

    if not images:
        # np.array([]) would collapse to shape (0,); keep the image rank.
        width, height = image_size
        return np.empty((0, height, width), dtype=np.uint8), np.empty((0,), dtype=str)

    return np.array(images), np.array(labels)

# Read the Kyrgyz letter images for both splits from their raw directories.
X_kyrgyz_train, y_kyrgyz_train = load_kyrgyz_letters(kyrgyz_train_dir)
X_kyrgyz_test, y_kyrgyz_test = load_kyrgyz_letters(kyrgyz_test_dir)

# Report image/label array shapes as a quick sanity check (one line per split).
print(
    f"Kyrgyz training set: {X_kyrgyz_train.shape}, Labels: {y_kyrgyz_train.shape}",
    f"Kyrgyz test set: {X_kyrgyz_test.shape}, Labels: {y_kyrgyz_test.shape}",
    sep="\n",
)


Kyrgyz training set: (62301, 32, 32), Labels: (62301,)
Kyrgyz test set: (17838, 32, 32), Labels: (17838,)


In [7]:
# Function to load Russian word data from TSV and image directory
def load_russian_words(tsv_file, img_dir, image_size=(128, 32)):
    """Load word images and their transcriptions listed in a TSV index.

    The index is read as a headerless, tab-separated file with two columns:
    the image file name and the word shown in that image. Each image is
    converted to 8-bit grayscale and resized to ``image_size``.

    Args:
        tsv_file: Path to the tab-separated index file.
        img_dir: Directory containing the image files named in the index.
        image_size: Target ``(width, height)`` handed to ``Image.resize``.
            Defaults to ``(128, 32)``, the size used throughout this notebook.

    Returns:
        Tuple ``(images, labels)``. ``images`` has shape
        ``(n, height, width)`` and ``labels`` has shape ``(n,)`` holding the
        words. When nothing could be loaded, ``images`` is an empty array of
        shape ``(0, height, width)``.

    Images that cannot be opened or decoded are reported with ``print`` and
    skipped; a single bad file does not abort the load.
    """
    index = pd.read_csv(
        tsv_file,
        delimiter='\t',
        header=None,
        names=['filename', 'word'],
        # Labels are text: without these two options pandas would turn words
        # such as "NA" or "null" into float NaN and could parse purely
        # numeric words as numbers.
        dtype=str,
        keep_default_na=False,
    )

    images = []
    labels = []
    # Iterating the two columns directly avoids building a Series per row,
    # which is what DataFrame.iterrows does.
    for filename, word in zip(index['filename'], index['word']):
        img_path = os.path.join(img_dir, filename)
        try:
            # The context manager guarantees the file handle is released
            # even when decoding fails part-way through.
            with Image.open(img_path) as img:
                pixels = np.array(img.convert('L').resize(image_size))
        except Exception as e:
            print(f"Error loading image {img_path}: {e}")
            continue
        images.append(pixels)
        labels.append(word)  # the word is the label

    if not images:
        # np.array([]) would collapse to shape (0,); keep the image rank.
        width, height = image_size
        return np.empty((0, height, width), dtype=np.uint8), np.empty((0,), dtype=str)

    return np.array(images), np.array(labels)

# Read the Russian word images for both splits using their TSV indexes.
X_russian_train, y_russian_train = load_russian_words(russian_train_tsv, russian_train_dir)
X_russian_test, y_russian_test = load_russian_words(russian_test_tsv, russian_test_dir)

# Report image/label array shapes as a quick sanity check (one line per split).
print(
    f"Russian training set: {X_russian_train.shape}, Labels: {y_russian_train.shape}",
    f"Russian test set: {X_russian_test.shape}, Labels: {y_russian_test.shape}",
    sep="\n",
)


Russian training set: (72286, 32, 128), Labels: (72286,)
Russian test set: (1544, 32, 128), Labels: (1544,)


In [8]:
# Normalize the datasets
def _scale_to_unit_interval(images):
    """Return ``images`` as float32 with pixel values scaled from 0-255 to 0-1.

    Arrays that already have a floating dtype are returned unchanged, so
    re-running this cell does not divide by 255 a second time.
    NOTE(review): this assumes a floating dtype can only come from an earlier
    run of this cell (the loaders build arrays from 8-bit grayscale images).
    """
    if np.issubdtype(images.dtype, np.floating):
        return images
    # Cast first: integer_array / 255.0 would otherwise produce float64,
    # doubling the memory needed for the image arrays.
    return images.astype(np.float32) / 255.0


X_kyrgyz_train = _scale_to_unit_interval(X_kyrgyz_train)
X_kyrgyz_test = _scale_to_unit_interval(X_kyrgyz_test)

X_russian_train = _scale_to_unit_interval(X_russian_train)
X_russian_test = _scale_to_unit_interval(X_russian_test)


In [14]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Reshape the data to add the channel dimension: (n, 32, 32) -> (n, 32, 32, 1).
# Reshaping an array that already has this shape leaves it unchanged, so the
# cell can be re-run safely.
X_kyrgyz_train = X_kyrgyz_train.reshape(X_kyrgyz_train.shape[0], 32, 32, 1)
X_kyrgyz_test = X_kyrgyz_test.reshape(X_kyrgyz_test.shape[0], 32, 32, 1)

# Random geometric augmentation for the Kyrgyz letter images.
datagen = ImageDataGenerator(
    rotation_range=15,       # rotate by up to +/-15 degrees
    width_shift_range=0.1,   # horizontal shift, as a fraction of the width
    height_shift_range=0.1,  # vertical shift, as a fraction of the height
    shear_range=0.2,
    zoom_range=0.2
)

# No datagen.fit(...) call: fitting only computes dataset statistics for
# featurewise_center, featurewise_std_normalization and zca_whitening, none
# of which is enabled above. Calling it here would only copy the whole
# training array, and it would do so before the validation split is made.


In [16]:
# Hold out 20% of each training set for validation.
# Use train_test_split without stratify (kept as in the original cell).
# A fixed seed makes the train/validation membership identical on every run,
# so the arrays saved afterwards are reproducible.
SPLIT_SEED = 42

# NOTE: this cell rebinds X_*_train / y_*_train to the reduced training part.
# Re-running it without re-running the loading cells first would split the
# already-reduced training set again.
X_kyrgyz_train, X_kyrgyz_val, y_kyrgyz_train, y_kyrgyz_val = train_test_split(
    X_kyrgyz_train, y_kyrgyz_train, test_size=0.2, random_state=SPLIT_SEED
)

X_russian_train, X_russian_val, y_russian_train, y_russian_val = train_test_split(
    X_russian_train, y_russian_train, test_size=0.2, random_state=SPLIT_SEED
)


In [19]:
def _save_arrays(out_dir, arrays):
    """Write every array in ``arrays`` to ``<out_dir>/<name>.npy``.

    Args:
        out_dir: Destination directory; created (with parents) when missing,
            because ``np.save`` does not create directories itself.
        arrays: Mapping from file stem (without ``.npy``) to the array to save.
    """
    os.makedirs(out_dir, exist_ok=True)
    for name, array in arrays.items():
        np.save(f"{out_dir}/{name}.npy", array)


# Save preprocessed Kyrgyz letters data
_save_arrays('../data/processed/handwritten_kyrgyz_letters', {
    'X_kyrgyz_train': X_kyrgyz_train,
    'y_kyrgyz_train': y_kyrgyz_train,
    'X_kyrgyz_val': X_kyrgyz_val,
    'y_kyrgyz_val': y_kyrgyz_val,
    'X_kyrgyz_test': X_kyrgyz_test,
    'y_kyrgyz_test': y_kyrgyz_test,
})

# Save preprocessed Russian words data
_save_arrays('../data/processed/cyrillic_words', {
    'X_russian_train': X_russian_train,
    'y_russian_train': y_russian_train,
    'X_russian_val': X_russian_val,
    'y_russian_val': y_russian_val,
    'X_russian_test': X_russian_test,
    'y_russian_test': y_russian_test,
})