<a href="https://colab.research.google.com/github/asheta66/Machine-Learning-2024/blob/main/Egyptian_Hieroglyphs/Egyptian_Hieroglyphs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ===========================
# Egyptian Hieroglyphs CNN: end-to-end (NO validation split)
# - Auto-detect annotations CSVs and images
# - Train/test split only
# - Simple CNN
# - Metrics: Accuracy, Precision, Recall, F1 (macro)
# - Plots: Training curves, Confusion Matrices (train/test), ROC curves (train/test)
# ===========================

import os
import re
import glob
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay,
                             accuracy_score, precision_score, recall_score, f1_score,
                             roc_curve, auc)

import tensorflow as tf
from tensorflow.keras import layers, models

# ---------------------------
# Config
# ---------------------------
BASE_PATH = "/kaggle/input/egyptian-hieroglyphs"
IMG_SIZE = 128      # 64 or 128 are fine for a starter model
BATCH_SIZE = 32
EPOCHS = 15         # start simple; adjust as needed
SEED = 42
RANDOM_STATE = 42

# ---------------------------
# Utility helpers
# ---------------------------

def list_files(patterns):
    out = []
    for p in patterns:
        out.extend(glob.glob(p, recursive=True))
    return out

def find_annotation_csvs(base_path):
    """
    Search for likely annotation CSVs in root and subfolders.
    We prioritize names containing 'train', 'valid/val', 'test'. If not found,
    we will still try to parse any '*annotation*.csv' files we see.
    """
    candidates = []
    for root, _, files in os.walk(base_path):
        for f in files:
            if f.lower().endswith(".csv") and "annot" in f.lower():
                candidates.append(os.path.join(root, f))

    # Also some datasets use train_annotations.csv / valid_annotations.csv etc.
    # Add those if present.
    for f in ("train_annotations.csv", "valid_annotations.csv", "val_annotations.csv", "test_annotations.csv"):
        fp = os.path.join(base_path, f)
        if os.path.isfile(fp) and fp not in candidates:
            candidates.append(fp)

    # As a last resort, some exports use exactly "_annotations.csv" under split folders
    for split in ("train", "valid", "val", "test"):
        fp = os.path.join(base_path, split, "_annotations.csv")
        if os.path.isfile(fp) and fp not in candidates:
            candidates.append(fp)

    return sorted(set(candidates))

def parse_annotations_csv(path):
    """
    Attempt to parse a CSV into a (filepath, label) DataFrame for classification.
    Supports common schema variants:
      - columns like ['file','filename','image','image_path','path']
      - label columns like ['label','class','category','name']
    If the CSV looks like detection/segmentation (multiple rows per image),
    we take the first class occurrence per image for a single-label classification baseline.
    """
    df_raw = pd.read_csv(path)
    # find image path column
    cand_img_cols = [c for c in df_raw.columns if c.lower() in
                     ["file", "filename", "image", "image_path", "path", "img_path", "imagefile"]]
    if not cand_img_cols:
        # try to guess by file-ish content
        for c in df_raw.columns:
            if df_raw[c].astype(str).str.contains(r"\.(jpg|jpeg|png|bmp|gif)$", case=False, na=False).any():
                cand_img_cols.append(c)
                break
    if not cand_img_cols:
        raise ValueError(f"Could not find an image path column in {path}")
    img_col = cand_img_cols[0]

    # find label column
    cand_label_cols = [c for c in df_raw.columns if c.lower() in
                       ["label", "class", "category", "name"]]
    if not cand_label_cols:
        # some annotation formats store 'class' in another column name; try to heuristically detect
        # e.g., 'class_name', 'category_name'
        for c in df_raw.columns:
            if "class" in c.lower() or "label" in c.lower() or "category" in c.lower():
                cand_label_cols.append(c)
                break
    if not cand_label_cols:
        raise ValueError(f"Could not find a label/class column in {path}")
    label_col = cand_label_cols[0]

    df = df_raw[[img_col, label_col]].copy()
    df.columns = ["image", "label"]

    # If duplicated rows per image (e.g., detection), reduce to single label per image (first)
    df = df.groupby("image", as_index=False).first()

    return df

def resolve_image_path(base_path, rel_or_abs):
    """
    Turn an annotation 'image' entry into an actual file on disk.
    - If absolute and exists, return as-is.
    - If relative, try base_path/<that>, base_path/train/<that>, base_path/valid/<that>, base_path/test/<that>
    - Also try common 'images' subfolder insertions.
    """
    p = str(rel_or_abs)
    if os.path.isabs(p) and os.path.isfile(p):
        return p

    # direct join
    cand = os.path.join(base_path, p)
    if os.path.isfile(cand):
        return cand

    # Try common split folders and images subfolders
    trials = []
    for split in ("train", "valid", "val", "test"):
        trials.append(os.path.join(base_path, split, p))
        trials.append(os.path.join(base_path, split, "images", p))

    # Sometimes p already contains 'train/images/...'
    trials.append(os.path.join(base_path, p.lstrip("/")))

    for t in trials:
        if os.path.isfile(t):
            return t

    # As a fallback, if p is a basename, search for it anywhere under base_path
    base = os.path.basename(p)
    hits = list_files([os.path.join(base_path, "**", base)])
    if hits:
        return hits[0]

    return None

def dataframe_from_annotations(base_path, csv_paths):
    frames = []
    for csvp in csv_paths:
        try:
            df = parse_annotations_csv(csvp)
        except Exception as e:
            print(f"Skipping {csvp}: {e}")
            continue

        # Resolve image paths
        df["image"] = df["image"].apply(lambda s: resolve_image_path(base_path, s))
        before = len(df)
        df = df[df["image"].notnull() & df["image"].apply(os.path.isfile)]
        dropped = before - len(df)
        if dropped > 0:
            print(f"[{os.path.basename(csvp)}] Dropped {dropped} rows with unresolved image paths.")
        frames.append(df)

    if not frames:
        raise RuntimeError("No valid annotation CSVs could be parsed.")

    df_all = pd.concat(frames, ignore_index=True).drop_duplicates(subset=["image"])
    return df_all

def dataframe_from_dirs(base_path):
    """
    Fallback: infer (image, label) pairs from directory structure:
      base_path/{train,valid,val,test,images}/*class*/image.*
    If no split dirs, will scan all subdirs for class folders.
    """
    patterns = []
    for split in ("train", "valid", "val", "test"):
        # Common: base_path/split/<class>/*.jpg
        patterns.append(os.path.join(base_path, split, "*", "*.*"))
        # Alternate: base_path/split/images/<class>/*.jpg
        patterns.append(os.path.join(base_path, split, "images", "*", "*.*"))
    # Global fallback: base_path/*class*/*.*
    patterns.append(os.path.join(base_path, "*", "*.*"))
    paths = list_files(patterns)

    rows = []
    for p in paths:
        if re.search(r"\.(jpg|jpeg|png|bmp|gif)$", p, flags=re.I):
            # label = parent folder name (one above), but if 'images' present, take the folder above it
            parts = os.path.normpath(p).split(os.sep)
            if "images" in parts:
                idx = parts.index("images")
                if idx + 2 <= len(parts) - 1:
                    label = parts[idx + 1]
                else:
                    continue
            else:
                # class folder is parent directory
                label = parts[-2]
            rows.append((p, label))
    if not rows:
        raise RuntimeError("Could not infer (image,label) pairs from directories.")
    df = pd.DataFrame(rows, columns=["image", "label"]).drop_duplicates(subset=["image"])
    return df

def load_image_tf(path, img_size=IMG_SIZE):
    img = tf.io.read_file(path)
    # Try decode: default 3 channels RGB
    img = tf.image.decode_image(img, channels=3, expand_animations=False)
    img = tf.image.convert_image_dtype(img, tf.float32)  # to [0,1]
    img = tf.image.resize(img, (img_size, img_size), antialias=True)
    return img

def make_tf_dataset(paths, labels, batch_size=BATCH_SIZE, shuffle=False):
    ds_paths = tf.data.Dataset.from_tensor_slices(np.array(paths))
    ds_imgs = ds_paths.map(lambda p: load_image_tf(p), num_parallel_calls=tf.data.AUTOTUNE)
    ds_labels = tf.data.Dataset.from_tensor_slices(labels)
    ds = tf.data.Dataset.zip((ds_imgs, ds_labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(paths), seed=SEED, reshuffle_each_iteration=True)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

def build_simple_cnn(num_classes):
    model = models.Sequential([
        layers.Conv2D(32, (3,3), activation='relu', padding="same", input_shape=(IMG_SIZE, IMG_SIZE, 3)),
        layers.MaxPooling2D(2),

        layers.Conv2D(64, (3,3), activation='relu', padding="same"),
        layers.MaxPooling2D(2),

        layers.Conv2D(128, (3,3), activation='relu', padding="same"),
        layers.MaxPooling2D(2),

        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

def plot_training_curves(history):
    fig1 = plt.figure(figsize=(6,4))
    plt.plot(history.history['loss'], label='Train Loss')
    plt.title('Training Loss'); plt.xlabel('Epoch'); plt.ylabel('Loss'); plt.legend(); plt.grid(True)
    plt.tight_layout()
    plt.show()

    fig2 = plt.figure(figsize=(6,4))
    plt.plot(history.history['accuracy'], label='Train Acc')
    plt.title('Training Accuracy'); plt.xlabel('Epoch'); plt.ylabel('Accuracy'); plt.legend(); plt.grid(True)
    plt.tight_layout()
    plt.show()

def evaluate_and_report(name, y_true, y_prob, class_names):
    """
    Compute metrics and plot confusion matrix + ROC curves.
    Returns dict of metrics.
    """
    y_pred = np.argmax(y_prob, axis=1)
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro', zero_division=0)
    rec = recall_score(y_true, y_pred, average='macro', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

    print(f"\n=== {name} Metrics ===")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f} (macro)")
    print(f"Recall   : {rec:.4f} (macro)")
    print(f"F1       : {f1:.4f} (macro)")

    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred, labels=range(len(class_names)))
    fig_cm, ax = plt.subplots(1, 1, figsize=(5.5, 4.5))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(ax=ax, colorbar=False, cmap="Greens")
    ax.set_title(f"{name} Confusion Matrix")
    plt.tight_layout()
    plt.show()

    # ROC Curves (one-vs-rest)
    y_true_bin = label_binarize(y_true, classes=range(len(class_names)))
    # Some classes may not appear in split; guard against degenerate curves
    fpr = dict(); tpr = dict(); roc_auc = dict()
    for i in range(len(class_names)):
        if y_true_bin[:, i].max() == 0:
            # class i not present in ground truth -> skip
            continue
        fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_prob[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # micro-average
    if y_true_bin.sum() > 0:
        fpr["micro"], tpr["micro"], _ = roc_curve(y_true_bin.ravel(), y_prob.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

    # macro-average (average of AUCs over classes that exist)
    valid_aucs = [roc_auc[i] for i in roc_auc.keys() if isinstance(i, int)]
    macro_auc = np.mean(valid_aucs) if valid_aucs else np.nan

    fig_roc = plt.figure(figsize=(6,5))
    # plot each class curve (only those with positives)
    for i in range(len(class_names)):
        if i in roc_auc:
            plt.plot(fpr[i], tpr[i], label=f"Class {class_names[i]} (AUC={roc_auc[i]:.3f})")
    # optional micro
    if "micro" in roc_auc:
        plt.plot(fpr["micro"], tpr["micro"], linestyle="--", label=f"Micro-average (AUC={roc_auc['micro']:.3f})")

    plt.plot([0,1], [0,1], linestyle=':', label='Chance')
    plt.title(f"{name} ROC Curves (macro AUC={macro_auc:.3f})")
    plt.xlabel("False Positive Rate"); plt.ylabel("True Positive Rate")
    plt.legend(fontsize=8, loc="lower right")
    plt.grid(True, linestyle=":")
    plt.tight_layout()
    plt.show()

    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "macro_auc": macro_auc}

# ---------------------------
# 1) Discover annotations and build (image,label) table
# ---------------------------
print("🔎 Searching for annotation CSVs...")
csvs = find_annotation_csvs(BASE_PATH)
for c in csvs:
    print(" -", c)

df_all = None
if csvs:
    try:
        df_all = dataframe_from_annotations(BASE_PATH, csvs)
    except Exception as e:
        print("⚠️ Failed to parse annotation CSVs robustly. Falling back to folder-based discovery. Reason:", e)

if df_all is None:
    print("📂 Falling back: inferring labels from directory structure.")
    df_all = dataframe_from_dirs(BASE_PATH)

# Keep only existing image files
df_all = df_all[df_all["image"].apply(lambda p: isinstance(p, str) and os.path.isfile(p))].copy()
df_all = df_all.drop_duplicates(subset=["image"]).reset_index(drop=True)

print(f"\nTotal unique images found: {len(df_all)}")
print("Example rows:")
display(df_all.sample(min(5, len(df_all)), random_state=SEED))

# ---------------------------
# 2) Encode labels and split into train/test (NO validation)
# ---------------------------
le = LabelEncoder()
df_all["label_enc"] = le.fit_transform(df_all["label"].astype(str))
class_names = list(le.classes_)
num_classes = len(class_names)
print(f"\nClasses ({num_classes}): {class_names}")

# Stratified split
train_df, test_df = train_test_split(
    df_all[["image", "label_enc"]],
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=df_all["label_enc"]
)

print(f"\nTrain size: {len(train_df)}")
print(f"Test  size: {len(test_df)}")

# ---------------------------
# 3) Build tf.data pipelines
# ---------------------------
train_paths = train_df["image"].tolist()
train_labels = train_df["label_enc"].values

test_paths = test_df["image"].tolist()
test_labels = test_df["label_enc"].values

train_ds = make_tf_dataset(train_paths, train_labels, shuffle=True)
test_ds  = make_tf_dataset(test_paths,  test_labels,  shuffle=False)

# ---------------------------
# 4) Build and train model (NO validation)
# ---------------------------
model = build_simple_cnn(num_classes)
model.summary()

history = model.fit(
    train_ds,
    epochs=EPOCHS,
    verbose=1
)

# Training curves (loss + accuracy)
plot_training_curves(history)

# ---------------------------
# 5) Evaluate on Train and Test (metrics + confusion matrices + ROC)
# ---------------------------
# Get probabilities
train_probs = model.predict(train_ds, verbose=0)
test_probs  = model.predict(test_ds,  verbose=0)

# True labels for batches (preserve original order)
y_train_true = np.concatenate([y.numpy() for _, y in train_ds.unbatch()])
y_test_true  = np.concatenate([y.numpy() for _, y in test_ds.unbatch()])

# Metrics + Plots for Train
train_metrics = evaluate_and_report("Train", y_train_true, train_probs, class_names)

# Metrics + Plots for Test
test_metrics = evaluate_and_report("Test", y_test_true, test_probs, class_names)

# ---------------------------
# 6) Print a compact summary table
# ---------------------------
summary = pd.DataFrame([
    {"Split": "Train", **{k.capitalize(): v for k, v in train_metrics.items()}},
    {"Split": "Test",  **{k.capitalize(): v for k, v in test_metrics.items()}}
])
print("\n=== Summary (macro metrics) ===")
display(summary)

# ---------------------------
# 7) Show a few sample predictions (optional)
# ---------------------------
def show_samples(paths, y_true, y_prob, class_names, k=6, title="Samples"):
    idxs = np.random.default_rng(SEED).choice(len(paths), size=min(k, len(paths)), replace=False)
    fig = plt.figure(figsize=(10, 6))
    fig.suptitle(title)
    for i, idx in enumerate(idxs, 1):
        img = plt.imread(paths[idx])
        pred = np.argmax(y_prob[idx])
        tt = class_names[y_true[idx]]
        pp = class_names[pred]
        plt.subplot(2, (k+1)//2, i)
        try:
            plt.imshow(img)
        except:
            plt.imshow(img, cmap="gray")
        plt.axis("off")
        plt.title(f"T:{tt}\nP:{pp}", fontsize=9)
    plt.tight_layout()
    plt.show()

show_samples(test_paths, y_test_true, test_probs, class_names, k=6, title="Test Samples (T=true, P=pred)")


🔎 Searching for annotation CSVs...
📂 Falling back: inferring labels from directory structure.


RuntimeError: Could not infer (image,label) pairs from directories.