In [None]:
# Colab-ready complete script (Option B: OpenCV-based weather augmentations)
# Run in Google Colab with GPU runtime.
# -----------------------------------------------------------------------------
# 1) Environment setup (keep minimal; avoid albumentations/imgaug)
# -----------------------------------------------------------------------------
# Show the attached GPU; "|| true" keeps the cell going when nvidia-smi fails
# (e.g. on a CPU-only runtime).
!nvidia-smi || true
# Install the extra packages imported below.
# NOTE(review): the output recorded at the end of this file shows a NumPy
# 1.x/2.x binary-compatibility import failure. Presumably one of the compiled
# packages in this environment was built against NumPy 1.x — confirm which one
# and pin a compatible version before relying on this cell.
!pip install --quiet editdistance wget matplotlib opencv-python

# -----------------------------------------------------------------------------
# 2) Imports
# -----------------------------------------------------------------------------
import os, sys, time, math, random, shutil, zipfile, glob
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import editdistance
import wget

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

# -----------------------------------------------------------------------------
# 3) Utilities
# -----------------------------------------------------------------------------
def mkdir(path):
    """Create directory *path* together with any missing parents.

    An already existing directory is accepted silently.
    """
    os.makedirs(name=path, exist_ok=True)

def download_and_extract(url, out_dir, filename=None):
    """Fetch *url* into *out_dir* and unpack it when it is a ``.zip`` archive.

    Args:
        url: Address of the file to download.
        out_dir: Destination directory; created if missing.
        filename: Name for the local file. Defaults to the last path
            component of *url*.

    Returns:
        The local file path, or ``None`` when the download raised. A failed
        extraction is only reported; the path is still returned.
    """
    mkdir(out_dir)
    local_name = os.path.basename(url) if filename is None else filename
    out_path = os.path.join(out_dir, local_name)

    if os.path.exists(out_path):
        # Re-use whatever an earlier run left behind.
        print("Already downloaded:", out_path)
    else:
        print("Downloading", url)
        try:
            wget.download(url, out=out_path)
            print()
        except Exception as e:
            print("Download failed:", e)
            return None

    # Only zip archives are unpacked; anything else is returned untouched.
    if not out_path.endswith(".zip"):
        return out_path
    try:
        print("Extracting", out_path)
        with zipfile.ZipFile(out_path, 'r') as z:
            z.extractall(out_dir)
    except Exception as e:
        print("Extraction error (maybe tar.gz?)", e)
    return out_path

# -----------------------------------------------------------------------------
# 4) Attempt to download IIIT5K and SVT (optional). If download fails, upload dataset to DATA_ROOT.
# -----------------------------------------------------------------------------
DATA_ROOT = "/content/text_rec_data"
mkdir(DATA_ROOT)

iiit_url = "https://cvit.iiit.ac.in/research/projects/cvit-projects/scene-text-recognition/data/IIIT5KWord_V3.zip"
svt_url  = "https://vision.ucsd.edu/~kai/svt/SVT.zip"

# Best-effort downloads: a broken link only prints a message, and the user can
# upload the datasets into DATA_ROOT by hand instead.
for _failure_msg, _dataset_url in (
    ("IIIT download attempt failed:", iiit_url),
    ("SVT download attempt failed:", svt_url),
):
    try:
        download_and_extract(_dataset_url, DATA_ROOT)
    except Exception as e:
        print(_failure_msg, e)

# -----------------------------------------------------------------------------
# 5) Locate image-label pairs (flexible parser)
# -----------------------------------------------------------------------------
def _locate_image(ann_dir, root, imgname):
    """Return the first regular file matching *imgname* near *ann_dir*/*root*, else None."""
    candidates = [
        ann_dir / imgname,
        root / imgname,
        ann_dir / (imgname + ".jpg"),
        root / (imgname + ".jpg"),
        ann_dir / (imgname + ".png"),
        root / (imgname + ".png"),
        ann_dir / (imgname + ".jpeg"),
        ann_dir / (imgname + ".JPG"),
    ]
    for cand in candidates:
        # isfile (not exists) so a directory that happens to share the token's
        # name is never reported as an image; it also returns False instead of
        # raising on unusable paths.
        if os.path.isfile(cand):
            return cand
    return None


def find_images_and_labels(root):
    """Collect ``(image_path, word)`` pairs found under *root*.

    Two strategies are tried in order:

    1. Every ``*.txt`` file whose lower-cased name contains ``train``,
       ``test``, ``gt`` or ``label`` is read as an annotation list. Each
       non-empty line is split on whitespace; the first token is the image
       name and the second the word. The image is searched for next to the
       annotation file and directly under *root*, with and without an added
       ``.jpg`` / ``.png`` / ``.jpeg`` / ``.JPG`` extension.
    2. If that yields nothing, every ``.jpg`` / ``.png`` / ``.jpeg`` file that
       has a same-stem ``.txt`` sidecar is paired with the sidecar's first
       line.

    Args:
        root: Directory (str or Path) to search recursively.

    Returns:
        A list of ``(str path, str word)`` tuples; empty when nothing matches.
    """
    root = Path(root)
    pairs = []
    keywords = ("train", "test", "gt", "label")

    for tf in root.rglob("*.txt"):
        name = tf.name.lower()
        if not any(key in name for key in keywords):
            continue
        try:
            with open(tf, "r", encoding="utf8", errors="ignore") as f:
                lines = [l.strip() for l in f if l.strip()]
        except OSError as e:
            # Best-effort scan: report the unreadable file and keep going.
            print("Skipping unreadable annotation file:", tf, e)
            continue
        for line in lines:
            parts = line.split()
            if len(parts) < 2:
                continue
            imgname, word = parts[0], parts[1]
            hit = _locate_image(tf.parent, root, imgname)
            if hit is not None:
                pairs.append((str(hit), word))

    # Fallback: image files with a .txt sidecar holding the label.
    if not pairs:
        for img in root.rglob("*.*"):
            if img.suffix.lower() not in (".jpg", ".png", ".jpeg"):
                continue
            if not os.path.isfile(img):
                continue
            lab = img.with_suffix(".txt")
            if not os.path.isfile(lab):
                continue
            try:
                text = lab.read_text(encoding="utf8", errors="ignore")
            except OSError:
                continue
            label_lines = text.strip().splitlines()
            if label_lines:
                pairs.append((str(img), label_lines[0].strip()))
    return pairs

pairs = find_images_and_labels(DATA_ROOT)
print("Found annotation pairs:", len(pairs))
if not pairs:
    print("No pairs found automatically. Upload datasets (IIIT5K / SVT) into /content/text_rec_data and re-run.")

# Cap the dataset so an epoch stays affordable on Colab.
MAX_SAMPLES = 20000
if len(pairs) > MAX_SAMPLES:
    random.shuffle(pairs)
    pairs = pairs[:MAX_SAMPLES]
    print("Sampling down to", len(pairs), "pairs.")

# 80 / 10 / 10 split after shuffling; the test split takes the remainder.
random.shuffle(pairs)
n = len(pairs)
ntrain = int(0.8 * n)
nval = int(0.1 * n)
_val_end = ntrain + nval
train_pairs, val_pairs, test_pairs = (
    pairs[:ntrain],
    pairs[ntrain:_val_end],
    pairs[_val_end:],
)
print("Train/Val/Test:", len(train_pairs), len(val_pairs), len(test_pairs))

# -----------------------------------------------------------------------------
# 6) Character vocabulary
# -----------------------------------------------------------------------------
alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!@#$%^&*()-_+=/:;.,?'\" "
# Class indices start at 1; index 0 is reserved for the CTC blank.
char_to_idx = {}
idx_to_char = {}
for _idx, _ch in enumerate(alphabet, start=1):
    char_to_idx[_ch] = _idx
    idx_to_char[_idx] = _ch

def encode_str_to_tensor(s):
    """Map string *s* to a LongTensor of class indices.

    A character is looked up as-is first and then lower-cased; characters
    found neither way are dropped. If nothing survives, the result is a single
    index: that of the space character (1 if space is not in the table).
    """
    codes = []
    for ch in s:
        key = ch if ch in char_to_idx else ch.lower()
        code = char_to_idx.get(key)
        if code is not None:
            codes.append(code)
    if not codes:
        codes = [char_to_idx.get(" ",1)]
    return torch.LongTensor(codes)

def decode_seq(indices):
    """Turn a sequence of class indices into text, CTC style.

    Index 0 (blank) is dropped and separates repeats; consecutive identical
    non-blank indices collapse to one character. Indices missing from
    ``idx_to_char`` contribute an empty string.
    """
    chars = []
    last = None
    for idx in indices:
        if idx == 0:
            # A blank resets the repeat tracker so "a, blank, a" yields "aa".
            last = None
        elif idx != last:
            last = idx
            chars.append(idx_to_char.get(int(idx), ""))
    return "".join(chars)

# -----------------------------------------------------------------------------
# 7) OpenCV-based weather augmentation functions (no external libs)
# -----------------------------------------------------------------------------
def add_snow(img, snow_strength=0.04):
    """Overlay random white speckles on *img* and return a uint8 image.

    Each pixel becomes a flake with probability *snow_strength*; the flake
    mask is softened with a 3x3 box blur and added to every channel.
    """
    height, width = img.shape[:2]
    flakes = np.random.rand(height, width) < snow_strength
    mask = cv2.blur(flakes.astype(np.uint8) * 255, (3,3))
    # Add in float32 so the sum can exceed 255 before clipping.
    snowy = img.astype(np.float32) + np.expand_dims(mask, 2)
    return np.clip(snowy, 0, 255).astype(np.uint8)

def add_rain(img, drop_length=15, drop_width=1, drop_count=400, slant=10):
    """Draw random rain streaks over *img* and return a uint8 image.

    *drop_count* line segments are drawn at random positions, each offset by
    (*slant*, *drop_length*) and clamped to the image; the streak layer is
    box-blurred, added to the image, and the result is filtered with a
    randomly sized horizontal averaging kernel.
    """
    height, width = img.shape[:2]
    streaks = np.zeros((height, width), dtype=np.uint8)
    for _ in range(drop_count):
        x0 = random.randint(0, width-1)
        y0 = random.randint(0, height-1)
        end = (min(width-1, x0 + slant), min(height-1, y0 + drop_length))
        cv2.line(streaks, (x0, y0), end, 255, drop_width)
    streaks = np.expand_dims(cv2.blur(streaks, (3,3)), 2)
    rainy = np.clip(img.astype(np.float32) + streaks, 0, 255)
    # motion blur to emulate streaks: only the middle kernel row is non-zero
    ksize = random.choice([3,5,7])
    kernel = np.zeros((ksize, ksize))
    kernel[ksize // 2, :] = 1.0 / ksize
    rainy = cv2.filter2D(rainy, -1, kernel)
    return rainy.astype(np.uint8)

def add_fog(img, fog_strength=0.06):
    """Blend *img* towards white by a random amount and return a uint8 image.

    The blend weight is drawn uniformly between 0.02 and *fog_strength*.
    """
    height, width = img.shape[:2]
    white = np.full((height, width, 3), 255, dtype=np.uint8).astype(np.float32)
    weight = np.random.uniform(0.02, fog_strength)
    foggy = cv2.addWeighted(img.astype(np.float32), 1-weight, white, weight, 0)
    return foggy.astype(np.uint8)

def random_weather_augment(img, img_h=32, img_w=128):
    """Apply a random mix of degradations to *img* and resize it.

    At most one weather effect is chosen (rain 18%, snow 12%, fog 14%,
    otherwise none). Independently, contrast/brightness jitter (50%), Gaussian
    blur (30%) and a JPEG encode/decode round trip (25%) may follow. The
    result is resized to (*img_w*, *img_h*).

    Args:
        img: Input image; the caller in this file passes RGB uint8.
        img_h: Output height in pixels.
        img_w: Output width in pixels.
    """
    out = img.copy()
    roll = random.random()
    if roll < 0.18:
        length = random.randint(10,25)
        n_drops = random.randint(150,700)
        tilt = random.randint(-10,10)
        out = add_rain(out, drop_length=length, drop_count=n_drops, slant=tilt)
    elif roll < 0.30:
        out = add_snow(out, snow_strength=random.uniform(0.01, 0.06))
    elif roll < 0.44:
        out = add_fog(out, fog_strength=random.uniform(0.04, 0.18))

    # brightness / contrast jitter
    if random.random() < 0.5:
        contrast = random.uniform(0.7, 1.4)
        brightness = random.uniform(-30, 30)
        out = cv2.convertScaleAbs(out, alpha=contrast, beta=brightness)

    # occasional blur
    if random.random() < 0.3:
        k = random.choice([3,5,7])
        out = cv2.GaussianBlur(out, (k,k), 0)

    # JPEG compression artifacts via an in-memory encode/decode round trip
    if random.random() < 0.25:
        quality = random.randint(30,90)
        _, buf = cv2.imencode('.jpg', out, [int(cv2.IMWRITE_JPEG_QUALITY), quality])
        out = cv2.imdecode(buf, cv2.IMREAD_COLOR)

    return cv2.resize(out, (img_w, img_h))

# -----------------------------------------------------------------------------
# 8) Dataset and collate
# -----------------------------------------------------------------------------
class WordDataset(Dataset):
    """Dataset of word images paired with their transcriptions.

    Each item is ``(img_t, label_enc, label_len, label)``:
      * ``img_t``: float tensor of shape (1, img_h, img_w), grayscale,
        scaled to [-1, 1].
      * ``label_enc``: LongTensor of class indices from ``encode_str_to_tensor``.
      * ``label_len``: number of elements in ``label_enc``.
      * ``label``: the raw label string.

    Args:
        pairs: Sequence of ``(image_path, label)`` tuples.
        img_h: Target image height.
        img_w: Target image width.
        augment: When true, images go through ``random_weather_augment``.
    """

    def __init__(self, pairs, img_h=32, img_w=128, augment=False):
        self.pairs = pairs
        self.img_h = img_h
        self.img_w = img_w
        self.augment = augment

    def __len__(self):
        return len(self.pairs)

    def _plain_resize(self, img):
        """Resize *img* to the target size without augmentation."""
        return cv2.resize(img, (self.img_w, self.img_h)).astype(np.uint8)

    def __getitem__(self, idx):
        img_path, label = self.pairs[idx]
        img = cv2.imread(img_path, cv2.IMREAD_COLOR)
        if img is None:
            # Unreadable image: substitute a black image so the batch still forms.
            img = np.zeros((self.img_h, self.img_w, 3), dtype=np.uint8)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        if self.augment:
            try:
                img = random_weather_augment(img, self.img_h, self.img_w)
            except Exception:
                # Deliberate best-effort: a failing augmentation must not kill
                # a training run, so fall back to the un-augmented image.
                img = self._plain_resize(img)
        else:
            img = self._plain_resize(img)
        # grayscale and normalize to [-1,1]
        img_gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY).astype(np.float32) / 255.0
        img_gray = np.expand_dims(img_gray, axis=0)  # C,H,W
        img_t = (torch.FloatTensor(img_gray) - 0.5) / 0.5
        # Use the shared encoder so training labels follow the same character
        # mapping as the rest of the file.
        label_enc = encode_str_to_tensor(label)
        label_len = label_enc.numel()
        return img_t, label_enc, label_len, label

def collate_fn(batch):
    """Merge dataset items into one batch.

    Returns ``(imgs, targets, target_lens, raw_labels)``: the stacked image
    tensors, all encoded labels concatenated into one 1-D tensor, a LongTensor
    of the per-sample label lengths, and the list of raw label strings.
    """
    imgs, labels, lengths, raw_labels = [], [], [], []
    for item in batch:
        imgs.append(item[0])
        labels.append(item[1])
        lengths.append(item[2])
        raw_labels.append(item[3])
    label_lens = torch.LongTensor(lengths)
    imgs_t = torch.stack(imgs)
    if labels:
        concat = torch.cat(labels)
    else:
        concat = torch.LongTensor([])
    return imgs_t, concat, label_lens, raw_labels

# -----------------------------------------------------------------------------
# 9) CRNN model (CNN -> BiLSTM -> CTC)
# -----------------------------------------------------------------------------
class BidirectionalLSTM(nn.Module):
    """Bidirectional LSTM followed by a per-timestep linear projection.

    Input is (T, batch, nIn); output is (T, batch, nOut).
    """

    def __init__(self, nIn, nHidden, nOut):
        super().__init__()
        self.rnn = nn.LSTM(nIn, nHidden, bidirectional=True)
        # Forward and backward states are concatenated, hence 2 * nHidden.
        self.embedding = nn.Linear(nHidden*2, nOut)

    def forward(self, input):
        recurrent, _ = self.rnn(input)
        steps, batch, feat = recurrent.size()
        # Fold time and batch together for the linear layer, then unfold.
        projected = self.embedding(recurrent.view(steps * batch, feat))
        return projected.view(steps, batch, -1)

class CRNN(nn.Module):
    """Convolutional-recurrent recogniser: CNN features -> two BiLSTMs.

    Args:
        imgH: Input image height; must be a multiple of 16.
        nc: Number of input channels.
        nclass: Number of output classes per timestep.
        nh: Hidden size of the LSTM layers.

    ``forward`` takes a (batch, nc, H, W) tensor and returns per-timestep class
    scores of shape (T, batch, nclass), where T is the width of the final
    feature map. The convolutional stack must reduce the height to exactly 1
    (it does for the 32-pixel-high inputs used in this file).
    """

    def __init__(self, imgH, nc, nclass, nh):
        super(CRNN, self).__init__()
        assert imgH % 16 == 0, "imgH has to be a multiple of 16"
        ks = [3,3,3,3,3,3,2]
        ps = [1,1,1,1,1,1,0]
        ss = [1,1,1,1,1,1,1]
        nm = [64,128,256,256,512,512,512]
        cnn = nn.Sequential()

        def convRelu(i, batch_norm=False):
            """Append conv layer *i* (optionally batch-normed) plus ReLU to ``cnn``."""
            nIn = nc if i==0 else nm[i-1]
            nOut = nm[i]
            cnn.add_module('conv{0}'.format(i), nn.Conv2d(nIn, nOut, ks[i], ss[i], ps[i]))
            if batch_norm:
                cnn.add_module('bn{0}'.format(i), nn.BatchNorm2d(nOut))
            cnn.add_module('relu{0}'.format(i), nn.ReLU(True))

        convRelu(0)
        cnn.add_module('pooling0', nn.MaxPool2d(2,2))
        convRelu(1)
        cnn.add_module('pooling1', nn.MaxPool2d(2,2))
        convRelu(2, batch_norm=True)
        convRelu(3)
        # The last two pools halve only the height, keeping width (time) resolution.
        cnn.add_module('pooling2', nn.MaxPool2d((2,1),(2,1),(0,0)))
        convRelu(4, batch_norm=True)
        convRelu(5)
        cnn.add_module('pooling3', nn.MaxPool2d((2,1),(2,1),(0,0)))
        convRelu(6, batch_norm=True)
        self.cnn = cnn
        self.rnn = nn.Sequential(
            BidirectionalLSTM(512, nh, nh),
            BidirectionalLSTM(nh, nh, nclass)
        )

    def forward(self, x):
        conv = self.cnn(x)
        height = conv.size(2)
        if height != 1:
            # squeeze(2) would silently do nothing and the permute below would
            # fail with an unrelated-looking error, so report the cause here.
            raise ValueError(
                "CNN output height must be 1 but got {0}; "
                "use an input height the network reduces to 1".format(height)
            )
        conv = conv.squeeze(2)  # B, C, W
        conv = conv.permute(2,0,1)  # W, B, C
        return self.rnn(conv)

# -----------------------------------------------------------------------------
# 10) Training and evaluation helpers
# -----------------------------------------------------------------------------
def train_epoch(model, dataloader, criterion, optimizer):
    """Run one optimisation pass over *dataloader*.

    Returns the mean of the per-batch losses (0.0 when there are no batches).
    Gradients are clipped to a total norm of 5 before each optimizer step.
    """
    model.train()
    loss_sum = 0.0
    n_batches = 0
    for imgs, labels_concat, label_lens, _raw in dataloader:
        imgs = imgs.to(device)
        labels_concat = labels_concat.to(device)
        label_lens = label_lens.to(device)

        preds = model(imgs)  # T, B, C
        log_probs = F.log_softmax(preds, dim=2)
        # Every sample uses the full output length T as its input length.
        preds_len = torch.IntTensor([preds.size(0)] * imgs.size(0))
        loss = criterion(log_probs, labels_concat, preds_len, label_lens)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()

        loss_sum += loss.item()
        n_batches += 1
    return loss_sum / max(n_batches,1)

def greedy_decode(preds):
    """Best-path CTC decoding of a (T, B, C) score tensor.

    Takes the highest-scoring class at each timestep, collapses consecutive
    repeats and drops blanks (class 0). Returns one string per batch element.
    """
    best = preds.max(2)[1].transpose(1,0).cpu().numpy()  # B, T
    texts = []
    for row in best:
        # Pair each step with its predecessor; -1 stands in before the first.
        chars = [
            idx_to_char.get(int(cur), '')
            for before, cur in zip([-1, *row[:-1]], row)
            if cur != before and cur != 0
        ]
        texts.append("".join(chars))
    return texts

def evaluate(model, dataloader):
    """Score *model* on *dataloader* with greedy decoding.

    Returns ``(word_acc, avg_cer)``. A word counts as correct when prediction
    and raw label match after stripping and lower-casing. CER is the edit
    distance between the unmodified strings divided by the label length (at
    least 1), averaged over samples. Both are 0.0 for an empty loader.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    cer_sum = 0.0
    with torch.no_grad():
        for imgs, _targets, _target_lens, raw_labels in dataloader:
            scores = model(imgs.to(device))
            decoded = greedy_decode(F.log_softmax(scores, dim=2))
            for p, g in zip(decoded, raw_labels):
                n_seen += 1
                if p.strip().lower() == g.strip().lower():
                    n_correct += 1
                cer_sum += editdistance.eval(p, g) / max(1, len(g))
    if n_seen > 0:
        return n_correct / n_seen, cer_sum / n_seen
    return 0.0, 0.0

# -----------------------------------------------------------------------------
# 11) Create datasets / loaders
# -----------------------------------------------------------------------------
IMG_H = 32
IMG_W = 128
BATCH_SIZE = 64

# All three splits share the image size; only training is augmented.
_size_cfg = dict(img_h=IMG_H, img_w=IMG_W)
train_ds = WordDataset(train_pairs, augment=True, **_size_cfg)
val_ds   = WordDataset(val_pairs, augment=False, **_size_cfg)
test_ds  = WordDataset(test_pairs, augment=False, **_size_cfg)

# Loader settings common to every split; only the training loader shuffles.
_loader_cfg = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn, num_workers=2)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_cfg)
val_loader = DataLoader(val_ds, shuffle=False, **_loader_cfg)
test_loader = DataLoader(test_ds, shuffle=False, **_loader_cfg)

print("Train/Val/Test dataset sizes:", len(train_ds), len(val_ds), len(test_ds))

# -----------------------------------------------------------------------------
# 12) Instantiate model, loss, optimizer
# -----------------------------------------------------------------------------
nh = 256
# One class per alphabet character plus the CTC blank at index 0.
nclass = len(alphabet) + 1
model = CRNN(imgH=IMG_H, nc=1, nclass=nclass, nh=nh).to(device)
criterion = nn.CTCLoss(zero_infinity=True, reduction='mean', blank=0)
optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5, lr=3e-4)
# Halve the learning rate every 10 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.5, step_size=10)
print(model)

# -----------------------------------------------------------------------------
# 13) Training loop
# -----------------------------------------------------------------------------
EPOCHS = 20
best_val_acc = 0.0
history = {"train_loss": [], "val_acc": [], "val_cer": []}


def _save_checkpoint(path):
    """Write the current model weights and the alphabet to *path*."""
    torch.save({"model_state": model.state_dict(), "alphabet": alphabet}, path)


for epoch in range(1, EPOCHS+1):
    t0 = time.time()
    train_loss = train_epoch(model, train_loader, criterion, optimizer)
    val_acc, val_cer = evaluate(model, val_loader)
    for _key, _value in (("train_loss", train_loss), ("val_acc", val_acc), ("val_cer", val_cer)):
        history[_key].append(_value)
    scheduler.step()
    t1 = time.time()
    print(f"Epoch {epoch}/{EPOCHS}  train_loss: {train_loss:.4f}  val_acc: {val_acc*100:.2f}%  val_cer: {val_cer:.3f}  time: {t1-t0:.1f}s")
    # Keep the weights of the epoch with the highest validation accuracy.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        _save_checkpoint("/content/best_crnn.pth")
        print("Saved best model.")

_save_checkpoint("/content/final_crnn.pth")

# -----------------------------------------------------------------------------
# 14) Plot validation accuracy vs epochs and training loss
# -----------------------------------------------------------------------------
def _plot_curve(values, title, ylabel):
    """Plot *values* against 1-based epoch numbers; does nothing when empty."""
    if len(values) == 0:
        return
    plt.figure(figsize=(8,4))
    plt.plot(range(1, 1+len(values)), values, marker='o')
    plt.title(title)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()


# Accuracy is stored as a fraction; show it as a percentage.
_plot_curve([x*100 for x in history["val_acc"]],
            "Validation Word Accuracy vs Epochs",
            "Val Word Accuracy (%)")
_plot_curve(history["train_loss"], "Train Loss vs Epochs", "Train Loss")

# -----------------------------------------------------------------------------
# 15) Final evaluation and sample predictions
# -----------------------------------------------------------------------------
# Evaluate with the best checkpoint when one was written.
_best_path = "/content/best_crnn.pth"
if os.path.exists(_best_path):
    best = torch.load(_best_path, map_location=device)
    model.load_state_dict(best["model_state"])

test_acc, test_cer = evaluate(model, test_loader)
print(f"Final Test word accuracy: {test_acc*100:.2f}%, Test CER: {test_cer:.3f}")

# Show sample predictions in a 3x4 grid
model.eval()
samples = 12
count = 0
plt.figure(figsize=(12,6))
for imgs, labels_concat, label_lens, raw_labels in test_loader:
    imgs_cpu = imgs
    with torch.no_grad():
        preds = model(imgs.to(device))
        decoded = greedy_decode(F.log_softmax(preds, dim=2))
    for i, pred_text in enumerate(decoded):
        if count >= samples:
            break
        # Undo the [-1, 1] normalisation and drop the channel axis for display.
        img = (imgs_cpu[i].cpu().numpy().transpose(1,2,0) * 0.5) + 0.5
        img = img.squeeze()
        count += 1
        plt.subplot(3,4,count)
        plt.imshow(img, cmap='gray')
        plt.title(f"P: {pred_text}\nG: {raw_labels[i]}")
        plt.axis('off')
    if count >= samples:
        break
plt.tight_layout()
plt.show()

for _line in (
    "Best model saved to: /content/best_crnn.pth",
    "Final model saved to: /content/final_crnn.pth",
    "\nNotes:",
    "- This version uses OpenCV-only augmentations to simulate weather (rain, snow, fog) and related artifacts.",
    "- If dataset downloads failed, upload IIIT5K / SVT (or any scene text dataset) into /content/text_rec_data",
    "- For better accuracy, pretrain on large synthetic datasets (MJ, SynthText) and use attention decoders or beam search.",
):
    print(_line)


Mon Nov 24 18:04:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   65C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.12/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.12/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.12/dist-package

AttributeError: _ARRAY_API not found

ImportError: numpy.core.multiarray failed to import