#  Load and Preprocess the Data

In [13]:
import pandas as pd
import cv2
import numpy as np
import os

csv_path = r"D:\New folder\english.csv"
image_dir = r"D:\New folder\Img"

df = pd.read_csv(csv_path)
print(df.head()) # Take a look at the structure

                image label
0  Img/img001-001.png     0
1  Img/img001-002.png     0
2  Img/img001-003.png     0
3  Img/img001-004.png     0
4  Img/img001-005.png     0


# Extract image names and labels

In [14]:
image_files = df['image'].tolist() # Assuming a column named 'image_name'
labels = df['label'].tolist()     # Assuming a column named 'label_code'

# Load and preprocess images

In [15]:
image_list = []
for img_file in image_files:
    # Check if the filename starts with "Img/" and remove it if it does
    if img_file.startswith("Img/"):
        base_filename = img_file[len("Img/"):]
    else:
        base_filename = img_file

    # Now join the base filename with the image directory
    img_path = os.path.join(image_dir, base_filename)
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) # Load as grayscale
    if img is not None:
        # Resize the image (adjust dimensions as needed for your model)
        resized_img = cv2.resize(img, (64, 64))
        # Normalize pixel values
        normalized_img = resized_img / 255.0
        image_list.append(normalized_img)
    else:
        print(f"Error loading image: {img_path}")

# Convert the list of images to a NumPy array
X = np.array(image_list)

# Prepare labels

In [16]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels) # Convert string labels to numerical
# If your model requires one-hot encoding:
# from tensorflow.keras.utils import to_categorical
# y = to_categorical(y)

# Split the Data

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and Train Your Deep Learning Model

In [18]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Reshape the input data for CNN (assuming grayscale images)
X_train = X_train.reshape(-1, 64, 64, 1)
X_test = X_test.reshape(-1, 64, 64, 1)

num_classes = len(np.unique(y)) # Determine the number of unique classes

model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax') # Softmax for multi-class classification
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', # Or 'categorical_crossentropy' if using one-hot encoding
              metrics=['accuracy'])

# Train the model
epochs = 10 # Adjust as needed
model.fit(X_train, y_train, epochs=epochs, validation_split=0.1)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 33ms/step - accuracy: 0.0385 - loss: 4.0751 - val_accuracy: 0.1465 - val_loss: 3.5015
Epoch 2/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - accuracy: 0.3161 - loss: 2.7202 - val_accuracy: 0.3590 - val_loss: 2.5778
Epoch 3/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.5845 - loss: 1.5860 - val_accuracy: 0.4322 - val_loss: 2.3242
Epoch 4/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.7441 - loss: 0.9440 - val_accuracy: 0.5092 - val_loss: 2.1381
Epoch 5/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 34ms/step - accuracy: 0.8448 - loss: 0.5545 - val_accuracy: 0.5385 - val_loss: 2.1371
Epoch 6/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.9230 - loss: 0.3178 - val_accuracy: 0.5385 - val_loss: 2.2648
Epoch 7/10
[1m77/77[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x28ca0967580>

# Evaluate the Model

In [19]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.5996 - loss: 2.8216
Test loss: 2.6705
Test accuracy: 0.6041


# Model Testing

In [21]:
pip install pytesseract pillow


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pillow
  Downloading pillow-11.2.1-cp310-cp310-win_amd64.whl.metadata (9.1 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pillow-11.2.1-cp310-cp310-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   --- ------------------------------------ 0.3/2.7 MB ? eta -:--:--
   --- ------------------------------------ 0.3/2.7 MB ? eta -:--:--
   ------- -------------------------------- 0.5/2.7 MB 621.2 kB/s eta 0:00:04
   ------- -------------------------------- 0.5/2.7 MB 621.2 kB/s eta 0:00:04
   ----------- ---------------------------- 0.8/2.7 MB 657.8 kB/s eta 0:00:03
   --------------- ------------------------ 1.0/2.7 MB 739.8 kB/s eta 0:00:03
   ----------------------- ---------------- 1.6/2.7 MB 975.2 kB/s eta 0:00:02
   --------------------------- ------



In [None]:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


In [28]:
import pytesseract
from PIL import Image

# For Windows: specify path to tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # <- Update this path if different

# Set the path to the image
image_path = "Img\img062-055.png"

# Load the image
image = Image.open(image_path)

# Use pytesseract to extract text
predicted_text = pytesseract.image_to_string(image, config='--psm 8')

# Clean and print the result
predicted_text = predicted_text.strip()
print("Predicted word:", predicted_text)


Predicted word: 3)


# WER and CER test sets

In [6]:
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
from jiwer import wer, cer
from PIL import Image
import torch
from tqdm import tqdm
import os

# Load model and processor
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Set the path to the folder containing test images
image_folder = r"D:\New folder\Img"
image_files = sorted([
    f for f in os.listdir(image_folder)
    if f.endswith(('.png', '.jpg', '.jpeg'))
])

predictions = []
ground_truths = []  # You must manually fill this or load from file

for img_file in tqdm(image_files, desc="Evaluating"):
    img_path = os.path.join(image_folder, img_file)

    # Load and preprocess image
    image = Image.open(img_path).convert("RGB")
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

    # Generate prediction
    with torch.no_grad():
        generated_ids = model.generate(pixel_values)
        pred_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    predictions.append(pred_text)

    # ⚠️ Replace with actual ground truth for real evaluation
    ground_truths.append("actual ground truth text here")

# Compute final WER and CER
final_wer = wer(ground_truths, predictions)
final_cer = cer(ground_truths, predictions)

print("\n--- Evaluation Results ---")
print(f"Final WER on test set: {final_wer:.4f}")
print(f"Final CER on test set: {final_cer:.4f}")


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod


--- Evaluation Results ---
Final WER on test set: 1.0249
Final CER on test set: 0.9590
