In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import shutil

from PIL import Image, ImageOps
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
#   Clonar el dataset en local

!git clone https://huggingface.co/datasets/HamdiJr/Egyptian_hieroglyphs

Cloning into 'Egyptian_hieroglyphs'...
remote: Enumerating objects: 4386, done.[K
remote: Total 4386 (delta 0), reused 0 (delta 0), pack-reused 4386 (from 1)[K
Receiving objects: 100% (4386/4386), 10.99 MiB | 18.48 MiB/s, done.
Resolving deltas: 100% (2/2), done.


---

In [3]:
#   Hacer una copia local del las carpetas

def copy_cloned_folder(source_folder, destination_folder):
    """
    Copia una carpeta clonada a un destino local.

    :param source_folder: Ruta de la carpeta clonada.
    :param destination_folder: Ruta de destino donde se copiará la carpeta.
    """
    if not os.path.exists(source_folder):
        print(f"La carpeta fuente no existe: {source_folder}")
        return

    if os.path.exists(destination_folder):
        print(f"La carpeta destino ya existe: {destination_folder}")
        return

    try:
        # Copiar la carpeta completa
        shutil.copytree(source_folder, destination_folder)
        print(f"Carpeta copiada exitosamente de '{source_folder}' a '{destination_folder}'")
    except Exception as e:
        print(f"Error al copiar la carpeta: {e}")

In [4]:
source = "Egyptian_hieroglyphs/Dataset"
destination = "data"

In [5]:
copy_cloned_folder(source, destination)

Carpeta copiada exitosamente de 'Egyptian_hieroglyphs/Dataset' a 'data'


---

In [6]:
def move_images_to_parent_and_delete_subfolders(parent_folder):
    """
    Mueve todas las imágenes de las subcarpetas a la carpeta madre y elimina las subcarpetas.

    :param parent_folder: Ruta de la carpeta madre que contiene las subcarpetas.
    """
    for root, subdirs, files in os.walk(parent_folder, topdown=False):  # Recorre de abajo hacia arriba
        for file in files:
            # Verificar si es una imagen válida
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
                source_path = os.path.join(root, file)
                destination_path = os.path.join(parent_folder, file)
                
                # Mover el archivo, renombrar si hay conflicto
                if os.path.exists(destination_path):
                    base, ext = os.path.splitext(file)
                    new_name = f"{base}_copy{ext}"
                    destination_path = os.path.join(parent_folder, new_name)
                
                shutil.move(source_path, destination_path)
                print(f"Movido: {source_path} -> {destination_path}")

        # Eliminar la subcarpeta si está vacía
        if root != parent_folder:  # No eliminar la carpeta madre
            try:
                os.rmdir(root)
                print(f"Eliminada carpeta: {root}")
            except OSError:
                print(f"No se pudo eliminar {root} (posiblemente no está vacía)")


In [7]:
parent_folder = "data/train/"
move_images_to_parent_and_delete_subfolders(parent_folder)

Movido: data/train/Y2/200323_Y2.png -> data/train/200323_Y2.png
Movido: data/train/Y2/030171_Y2.png -> data/train/030171_Y2.png
Movido: data/train/Y2/230203_Y2.png -> data/train/230203_Y2.png
Movido: data/train/Y2/050198_Y2.png -> data/train/050198_Y2.png
Movido: data/train/Y2/050194_Y2.png -> data/train/050194_Y2.png
Movido: data/train/Y2/200072_Y2.png -> data/train/200072_Y2.png
Movido: data/train/Y2/030334_Y2.png -> data/train/030334_Y2.png
Eliminada carpeta: data/train/Y2
Movido: data/train/V6/220176_V6.png -> data/train/220176_V6.png
Movido: data/train/V6/410329_V6.png -> data/train/410329_V6.png
Eliminada carpeta: data/train/V6
Movido: data/train/Y5/200047_Y5.png -> data/train/200047_Y5.png
Movido: data/train/Y5/220422_Y5.png -> data/train/220422_Y5.png
Movido: data/train/Y5/200027_Y5.png -> data/train/200027_Y5.png
Movido: data/train/Y5/220402_Y5.png -> data/train/220402_Y5.png
Eliminada carpeta: data/train/Y5
Movido: data/train/F18/410320_F18.png -> data/train/410320_F18.png
Mo

---

In [8]:
#   Convertir las imágenes .png en formato .jpg

def convert_png_to_jpg(input_dir, output_dir):
    """
    Convierte todas las imágenes PNG en una carpeta (y sus subcarpetas) a formato JPG.

    :param input_dir: Carpeta raíz con imágenes PNG.
    :param output_dir: Carpeta donde se guardarán las imágenes JPG convertidas.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for root, _, files in os.walk(input_dir):
        # Crear la misma estructura de carpetas en el directorio de salida
        relative_path = os.path.relpath(root, input_dir)
        save_path = os.path.join(output_dir, relative_path)
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        for file in files:
            if file.lower().endswith(".png"):  # Solo procesar archivos PNG
                file_path = os.path.join(root, file)
                try:
                    # Abrir imagen PNG
                    with Image.open(file_path) as img:
                        # Convertir a modo RGB (necesario para JPG)
                        rgb_img = img.convert("RGB")
                        # Guardar como JPG
                        jpg_path = os.path.join(save_path, os.path.splitext(file)[0] + ".jpg")
                        rgb_img.save(jpg_path, "JPEG")
                        print(f"Convertido: {file_path} -> {jpg_path}")
                except Exception as e:
                    print(f"Error al procesar {file_path}: {e}")

In [9]:
input_directory = "data"
output_directory = "data_jpg"

In [10]:
convert_png_to_jpg(input_directory, output_directory)

Convertido: data/test/070202_I9.png -> data_jpg/test/070202_I9.jpg
Convertido: data/test/070275_X1.png -> data_jpg/test/070275_X1.jpg
Convertido: data/test/070372_F31.png -> data_jpg/test/070372_F31.jpg
Convertido: data/test/070151_O28.png -> data_jpg/test/070151_O28.jpg
Convertido: data/test/070068_V31.png -> data_jpg/test/070068_V31.jpg
Convertido: data/test/070212_G7.png -> data_jpg/test/070212_G7.jpg
Convertido: data/test/070216_X1.png -> data_jpg/test/070216_X1.jpg
Convertido: data/test/070062_O50.png -> data_jpg/test/070062_O50.jpg
Convertido: data/test/070235_Z1.png -> data_jpg/test/070235_Z1.jpg
Convertido: data/test/070315_N35.png -> data_jpg/test/070315_N35.jpg
Convertido: data/test/070177_R8.png -> data_jpg/test/070177_R8.jpg
Convertido: data/test/070446_S29.png -> data_jpg/test/070446_S29.jpg
Convertido: data/test/070172_V31.png -> data_jpg/test/070172_V31.jpg
Convertido: data/test/070311_D36.png -> data_jpg/test/070311_D36.jpg
Convertido: data/test/070097_N35.png -> data_j

---

In [11]:
#   Borrar las carpetas inútiles
def delete_folder(folder_path):
    """
    Elimina una carpeta y todas sus subcarpetas y archivos.

    :param folder_path: Ruta de la carpeta a eliminar.
    """
    if os.path.exists(folder_path):
        try:
            shutil.rmtree(folder_path)
            print(f"Carpeta eliminada exitosamente: {folder_path}")
        except Exception as e:
            print(f"Error al eliminar la carpeta: {e}")
    else:
        print(f"La carpeta no existe: {folder_path}")


In [12]:
folder_to_delete01 = "data"
folder_to_delete02 = "Egyptian_hieroglyphs"

In [13]:
delete_folder(folder_to_delete01)

Carpeta eliminada exitosamente: data


In [14]:
delete_folder(folder_to_delete02)

Carpeta eliminada exitosamente: Egyptian_hieroglyphs


---

In [15]:
parent_folder = "data_jpg"
move_images_to_parent_and_delete_subfolders(parent_folder)

Movido: data_jpg/test/070216_X1.jpg -> data_jpg/070216_X1.jpg
Movido: data_jpg/test/070062_O50.jpg -> data_jpg/070062_O50.jpg
Movido: data_jpg/test/070212_G7.jpg -> data_jpg/070212_G7.jpg
Movido: data_jpg/test/070068_V31.jpg -> data_jpg/070068_V31.jpg
Movido: data_jpg/test/070151_O28.jpg -> data_jpg/070151_O28.jpg
Movido: data_jpg/test/070275_X1.jpg -> data_jpg/070275_X1.jpg
Movido: data_jpg/test/070372_F31.jpg -> data_jpg/070372_F31.jpg
Movido: data_jpg/test/070202_I9.jpg -> data_jpg/070202_I9.jpg
Movido: data_jpg/test/070311_D36.jpg -> data_jpg/070311_D36.jpg
Movido: data_jpg/test/070446_S29.jpg -> data_jpg/070446_S29.jpg
Movido: data_jpg/test/070315_N35.jpg -> data_jpg/070315_N35.jpg
Movido: data_jpg/test/070177_R8.jpg -> data_jpg/070177_R8.jpg
Movido: data_jpg/test/070172_V31.jpg -> data_jpg/070172_V31.jpg
Movido: data_jpg/test/070235_Z1.jpg -> data_jpg/070235_Z1.jpg
Movido: data_jpg/test/070146_N35.jpg -> data_jpg/070146_N35.jpg
Movido: data_jpg/test/070336_S29.jpg -> data_jpg/070

---

In [16]:
def process_images_to_dataframe(image_folder):
    """
    Crea un DataFrame a partir de las imágenes en una carpeta.
    Cada fila contiene los valores de píxeles de una imagen, la clase derivada del nombre del archivo,
    y usa la parte inicial del nombre del archivo como índice.

    :param image_folder: Ruta a la carpeta con imágenes.
    :return: DataFrame con valores de píxeles, clase e índice personalizado.
    """
    data = []

    for file in os.listdir(image_folder):
        if file.lower().endswith(('.png', '.jpg', '.jpeg')):
            # Ruta completa del archivo
            file_path = os.path.join(image_folder, file)
            
            # Cargar la imagen y convertirla a escala de grises
            with Image.open(file_path) as img:
                img = img.convert('L')  # Convertir a escala de grises
                img_array = np.array(img).flatten()  # Aplanar la imagen
            
            # Extraer clase y nombre del índice
            try:
                name_part = file.split("_")[0]
                class_part = file.split("_")[1].split(".")[0]
            except IndexError:
                print(f"Formato de nombre inesperado: {file}")
                continue

            # Crear fila con valores de píxeles y clase
            data.append([name_part, class_part, *img_array])

    # Crear el DataFrame
    column_names = ['indice', 'clase'] + [f'pixel_{i}' for i in range(len(img_array))]
    df = pd.DataFrame(data, columns=column_names)

    # Establecer la columna 'indice' como índice del DataFrame
    df.set_index('indice', inplace=True)

    return df

In [17]:
image_folder = "data_jpg"

In [18]:
df = process_images_to_dataframe(image_folder)

In [19]:
df

Unnamed: 0_level_0,clase,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,...,pixel_3740,pixel_3741,pixel_3742,pixel_3743,pixel_3744,pixel_3745,pixel_3746,pixel_3747,pixel_3748,pixel_3749
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
410275,V13,222,223,225,226,225,223,221,219,225,...,221,221,222,223,223,224,225,225,224,225
220139,N35,209,209,209,209,209,209,209,209,208,...,209,209,209,209,209,209,209,209,206,207
050146,X1,204,203,202,202,201,200,199,199,198,...,203,203,203,203,203,203,203,203,202,202
030018,M17,176,177,179,181,180,179,177,175,177,...,184,183,181,181,183,183,181,180,184,183
220041,N35,201,201,201,200,200,199,199,199,198,...,199,199,198,197,197,196,195,195,196,196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
050096,N35,200,200,199,199,199,200,201,201,199,...,196,195,195,194,194,195,195,196,197,197
030434,M17,199,199,200,201,202,202,203,203,197,...,198,199,200,200,198,198,199,200,199,199
390349,S29,213,210,208,210,214,217,217,215,209,...,221,225,228,226,222,219,219,220,221,221
410081,G43,207,210,209,207,208,214,216,215,216,...,210,216,219,215,212,213,215,214,217,215


In [20]:
df.to_csv("eg_hiero_raw.csv", index=True)

In [21]:
target = "clase"

In [22]:
conteo = df['clase'].value_counts()
print(conteo)

clase
N35    448
M17    364
S29    265
X1     232
G43    197
      ... 
N16      1
F30      1
G10      1
F32      1
N19      1
Name: count, Length: 172, dtype: int64


---

In [23]:
def augment_and_expand_dataset(df, image_folder, min_instances=20):
    """
    Aumenta el número de imágenes para las clases con menos de 'min_instances' instancias,
    y las integra al DataFrame original.
    
    :param df: DataFrame original con columnas ['clase', 'pixel_0', ..., 'pixel_n'].
    :param image_folder: Ruta a la carpeta donde se almacenan las imágenes originales.
    :param min_instances: Número mínimo deseado de instancias por clase.
    :return: DataFrame actualizado con las nuevas imágenes.
    """
    new_data = []
    class_counts = df['clase'].value_counts()

    for clase, count in class_counts.items():
        if count < min_instances:
            # Número de imágenes adicionales necesarias
            num_to_add = min_instances - count

            # Filtrar imágenes de la clase actual
            class_images = df[df['clase'] == clase].index.tolist()
            
            for _ in range(num_to_add):
                # Seleccionar una imagen aleatoria de la clase
                random_index = random.choice(class_images)
                random_image_name = random_index + f"_{clase}.jpg"
                random_image_path = os.path.join(image_folder, random_image_name)
                
                # Cargar la imagen original
                with Image.open(random_image_path) as img:
                    # Aplicar transformación sencilla para simular una imagen nueva (ejemplo: voltear horizontalmente)
                    augmented_image = ImageOps.mirror(img)

                    # Crear un nuevo nombre para la imagen
                    random_digits = ''.join(random.choices("0123456789", k=6))
                    new_image_name = f"{random_digits}_{clase}.jpg"
                    new_image_path = os.path.join(image_folder, new_image_name)

                    # Guardar la nueva imagen
                    augmented_image.save(new_image_path)
                
                # Convertir la imagen aumentada a un vector de píxeles
                img_array = np.array(augmented_image.convert('L')).flatten()

                # Crear nueva fila para el DataFrame
                new_row = [random_digits, clase, *img_array]
                new_data.append(new_row)

    # Crear DataFrame con los nuevos datos
    if new_data:
        column_names = ['indice', 'clase'] + [f'pixel_{i}' for i in range(len(new_data[0]) - 2)]
        new_df = pd.DataFrame(new_data, columns=column_names)
        new_df.set_index('indice', inplace=True)
        df = pd.concat([df, new_df])

    return df

In [24]:
image_folder = "data_jpg"

In [25]:
df = process_images_to_dataframe(image_folder)

In [26]:
df_updated = augment_and_expand_dataset(df, image_folder, min_instances=20)

In [27]:
df.to_csv("eg_hiero_updated.csv", index=True)

In [28]:
print(df_updated['clase'].value_counts())

clase
N35    448
M17    364
S29    265
X1     232
G43    197
      ... 
F13     20
F9      20
D28     20
G37     20
N19     20
Name: count, Length: 172, dtype: int64


In [29]:
df_updated

Unnamed: 0_level_0,clase,pixel_0,pixel_1,pixel_2,pixel_3,pixel_4,pixel_5,pixel_6,pixel_7,pixel_8,...,pixel_3740,pixel_3741,pixel_3742,pixel_3743,pixel_3744,pixel_3745,pixel_3746,pixel_3747,pixel_3748,pixel_3749
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
410275,V13,222,223,225,226,225,223,221,219,225,...,221,221,222,223,223,224,225,225,224,225
220139,N35,209,209,209,209,209,209,209,209,208,...,209,209,209,209,209,209,209,209,206,207
050146,X1,204,203,202,202,201,200,199,199,198,...,203,203,203,203,203,203,203,203,202,202
030018,M17,176,177,179,181,180,179,177,175,177,...,184,183,181,181,183,183,181,180,184,183
220041,N35,201,201,201,200,200,199,199,199,198,...,199,199,198,197,197,196,195,195,196,196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255702,N19,198,198,198,198,198,198,198,198,198,...,198,197,198,197,196,196,198,198,197,196
975452,N19,198,198,198,198,198,198,198,198,198,...,198,197,198,197,196,196,198,198,197,196
030031,N19,198,198,198,198,198,198,198,198,198,...,198,197,198,197,196,196,198,198,197,196
025919,N19,198,198,198,198,198,198,198,198,198,...,198,197,198,197,196,196,198,198,197,196


---

In [30]:
X = df_updated.drop(columns=['clase'])
y = df_updated['clase']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [32]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [33]:
pca = PCA(n_components=0.95)  # Retener el 95% de la varianza
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [34]:
print(f"Componentes principales retenidos: {pca.n_components_}")

Componentes principales retenidos: 426


---

In [35]:
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train_pca, y_train)

In [36]:
y_pred = rf_clf.predict(X_test_pca)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         A55       1.00      0.75      0.86         4
        Aa15       1.00      1.00      1.00         4
        Aa26       1.00      1.00      1.00         4
        Aa27       1.00      1.00      1.00         4
        Aa28       1.00      0.75      0.86         4
          D1       1.00      0.50      0.67         4
         D10       1.00      0.75      0.86         4
        D156       1.00      0.75      0.86         4
         D19       1.00      0.75      0.86         4
          D2       0.00      0.00      0.00         5
         D21       0.64      0.97      0.77        37
         D28       1.00      0.25      0.40         4
         D34       1.00      1.00      1.00         4
         D35       0.92      1.00      0.96        11
         D36       0.73      0.92      0.81        12
         D39       1.00      0.75      0.86         4
          D4       1.00      0.43      0.60         7
         D46       1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
# Crear pipeline con PCA y Random Forest
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [38]:
# Rango de búsqueda aleatoria
param_dist = {
    'pca__n_components': [0.8, 0.9, 0.95],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5, 10]
}

In [39]:
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,  # Número de combinaciones aleatorias a probar
    cv=3,       # Validación cruzada
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
    random_state=42
)

In [40]:
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=50, pca__n_components=0.8; total time= 5.3min
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=50, pca__n_components=0.8; total time= 5.3min
[CV] END classifier__max_depth=10, classifier__min_samples_split=2, classifier__n_estimators=100, pca__n_components=0.8; total time= 5.3min
[CV] END classifier__max_depth=10, classifier__min_samples_split=2, classifier__n_estimators=100, pca__n_components=0.8; total time= 5.3min
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=50, pca__n_components=0.8; total time= 5.4min
[CV] END classifier__max_depth=10, classifier__min_samples_split=2, classifier__n_estimators=100, pca__n_components=0.8; total time= 5.4min
[CV] END classifier__max_depth=10, classifier__min_samples_split=2, classifier__n_estimators=100

In [41]:
print(f"Mejores hiperparámetros: {random_search.best_params_}")

Mejores hiperparámetros: {'pca__n_components': 0.8, 'classifier__n_estimators': 100, 'classifier__min_samples_split': 5, 'classifier__max_depth': None}


In [42]:
best_model = random_search.best_estimator_

In [43]:
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         A55       1.00      0.75      0.86         4
        Aa15       1.00      1.00      1.00         4
        Aa26       1.00      1.00      1.00         4
        Aa27       1.00      1.00      1.00         4
        Aa28       1.00      1.00      1.00         4
          D1       1.00      0.50      0.67         4
         D10       1.00      0.75      0.86         4
        D156       1.00      0.75      0.86         4
         D19       1.00      0.75      0.86         4
          D2       0.33      0.20      0.25         5
         D21       0.70      0.95      0.80        37
         D28       0.33      0.25      0.29         4
         D34       1.00      1.00      1.00         4
         D35       0.92      1.00      0.96        11
         D36       0.75      1.00      0.86        12
         D39       1.00      0.75      0.86         4
          D4       1.00      0.57      0.73         7
         D46       0.80    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [45]:
joblib.dump(best_model, "best_random_search.pkl", compress=('gzip', 3))

['best_random_search.pkl']