OCR utilizando SVM y Sklearn para entrenamiento
------------------------------
https://scikit-learn.org/stable/modules/svm.html

In [None]:
import os, glob, sys
import numpy as np
import cv2
from joblib import dump, load

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# ======== CONFIG ========
# Module-level settings read by the loader and by main().
DATA_DIR = r"..\..\Data\dataset\text_arial"  # folder containing one subdirectory per digit class (0..9)
IMG_SIZE = 28            # images are 28x28 pixels
RANDOM_STATE = 42        # seed shared by the train/test split, the CV folds and the SVC
SAVE_PATH = "svm_ocr_arialv3.pkl"  # file the fitted pipeline is dumped to with joblib
N_JOBS = -1              # use all cores for GridSearchCV

In [None]:
# ======== UTILS ========
def ensure_binary_01(img):
    """Binarize a grayscale image with Otsu's method.

    Input that is not already uint8 is cast to uint8 first. The result is
    a float32 array of the same shape holding only 0.0 and 1.0.
    """
    gray = img if img.dtype == np.uint8 else img.astype(np.uint8)
    # With THRESH_OTSU the threshold argument (0) is ignored and computed
    # from the image histogram instead.
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    thresholded = cv2.threshold(gray, 0, 255, otsu_flags)[1]
    return (thresholded > 0).astype(np.float32)

def maybe_invert(bin01):
    """Normalize polarity so the foreground is 1 and the background is 0.

    The background is assumed to cover most of the image. When more than
    half of the pixels are 1 the image is treated as inverted (white
    background, dark glyph) and its complement is returned; otherwise the
    input array is returned unchanged.
    """
    ones_fraction = bin01.mean()
    return 1.0 - bin01 if ones_fraction > 0.5 else bin01

def load_dataset_from_folders(data_dir, img_size=28):
    """Load a digit image dataset laid out as one folder per class.

    Expected directory structure:
      data_dir/
        0/*.png
        1/*.png
        ...
        9/*.png

    Each PNG is read as grayscale, resized to (img_size, img_size) when its
    shape differs, binarized to 0/1, polarity-normalized so the foreground
    is 1, and flattened into a feature vector.

    Missing class directories and unreadable files are reported with a
    warning and skipped.

    Args:
        data_dir: Root folder containing the class subdirectories "0".."9".
        img_size: Side length in pixels of the square images fed to the model.

    Returns:
        (X, y) where X is a float32 array of shape
        (n_samples, img_size * img_size) and y is an int64 array of shape
        (n_samples,). When nothing could be loaded X still has shape
        (0, img_size * img_size).
    """
    X, y = [], []

    for digit in range(10):
        cls_dir = os.path.join(data_dir, str(digit))
        if not os.path.isdir(cls_dir):
            print(f"[WARN] Missing class dir: {cls_dir}")
            continue

        # glob returns matches in arbitrary, filesystem-dependent order.
        # Sorting keeps the sample order stable so that seeded splits and
        # CV folds are reproducible across machines and runs.
        for p in sorted(glob.glob(os.path.join(cls_dir, "*.png"))):
            img = cv2.imread(p, cv2.IMREAD_GRAYSCALE)
            if img is None:
                print(f"[WARN] Skipping unreadable: {p}")
                continue
            # Resize just in case the file is not already img_size x img_size
            if img.shape != (img_size, img_size):
                img = cv2.resize(img, (img_size, img_size), interpolation=cv2.INTER_NEAREST)

            bin01 = maybe_invert(ensure_binary_01(img))  # ensure foreground=1
            # Raw pixels flattened, values in {0, 1}
            X.append(bin01.flatten())
            y.append(digit)

    # The explicit reshape is a no-op for a non-empty dataset and gives an
    # empty one a well-formed 2-D shape instead of (0,).
    X = np.asarray(X, dtype=np.float32).reshape(len(y), img_size * img_size)
    y = np.asarray(y, dtype=np.int64)
    print(f"[INFO] Loaded {len(y)} samples. X shape: {X.shape}, y shape: {y.shape}")
    return X, y

In [None]:
# ======== MAIN TRAINING ========
def main():
    """Train, evaluate and save the SVM digit classifier.

    Steps:
      1. Load the dataset from DATA_DIR.
      2. Make a stratified 80/20 train/test split.
      3. Grid-search C and gamma for an RBF SVC with stratified 5-fold CV.
      4. Print train/test accuracy, a classification report and the
         confusion matrix for the test split.
      5. Dump the best pipeline to SAVE_PATH.
      6. If a command-line argument is present, treat it as an image path
         and print the predicted class and class probabilities. An
         argument that cannot be read as an image is reported and skipped.
    """
    print("[INFO] Loading dataset...")
    X, y = load_dataset_from_folders(DATA_DIR, IMG_SIZE)

    # Split (stratified)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=RANDOM_STATE,
        stratify=y
    )

    # Features are already 0/1, so the scaler step is a no-op placeholder.
    scaler = "passthrough"

    pipe = Pipeline([
        ("scaler", scaler),
        ("svc", SVC(kernel="rbf", probability=True, class_weight=None, random_state=RANDOM_STATE))
    ])

    # Reasonable grid
    # NOTE(review): for 28x28 inputs 1.0/784 looks identical to gamma="auto"
    # (1 / n_features), so that grid point is probably evaluated twice —
    # confirm before removing it.
    param_grid = {
        "svc__C": [0.5, 1, 2, 4, 8, 16, 32],
        "svc__gamma": ["scale", "auto", 1.0/784, 0.01, 0.02, 0.05, 0.1]
    }

    print("[INFO] Running stratified 5-fold GridSearchCV...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    grid = GridSearchCV(
        pipe, param_grid=param_grid, cv=cv,
        scoring="accuracy", n_jobs=N_JOBS, verbose=1
    )
    grid.fit(X_train, y_train)

    print(f"[INFO] Best params: {grid.best_params_}")
    print(f"[INFO] Best CV accuracy: {grid.best_score_:.4f}")

    best = grid.best_estimator_

    # Evaluation
    print("[INFO] Evaluating on TRAIN/TEST...")
    y_pred_train = best.predict(X_train)
    y_pred_test = best.predict(X_test)

    print(f"[RESULT] Train accuracy: {accuracy_score(y_train, y_pred_train):.4f}")
    print(f"[RESULT] Test  accuracy: {accuracy_score(y_test,  y_pred_test):.4f}")

    print("\n[REPORT] Classification report (TEST):")
    print(classification_report(y_test, y_pred_test, digits=4))

    print("[REPORT] Confusion matrix (TEST):")
    print(confusion_matrix(y_test, y_pred_test))

    # Save model
    dump(best, SAVE_PATH)
    print(f"[INFO] Model saved to: {SAVE_PATH}")

    # Demo: single prediction
    if len(sys.argv) > 1:
        test_img_path = sys.argv[1]
        print(f"[DEMO] Predicting {test_img_path}")
        img = cv2.imread(test_img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None instead of raising. Inside a notebook
            # kernel sys.argv[1] is usually a launcher flag rather than an
            # image path, so skip the demo instead of crashing in resize.
            print(f"[WARN] Skipping unreadable: {test_img_path}")
            return

        # Same preprocessing as the training loader.
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_NEAREST)
        bin01 = ensure_binary_01(img)
        bin01 = maybe_invert(bin01)

        feat = bin01.flatten()

        X_one = np.asarray([feat], dtype=np.float32)
        # Use pipeline directly (handles scaler if present)
        pred = best.predict(X_one)[0]
        proba = best.predict_proba(X_one)[0]
        print(f"[DEMO] Predicted: {pred}, probs: {np.round(proba, 4)}")


In [None]:
# Run the full pipeline (load, train, evaluate, save) when this cell executes.
main()