In [1]:
# Notebook setup: install dependencies, import libraries, and mount Google Drive

# Install necessary libraries
!pip install pymupdf
!pip install python-docx
!pip install tensorflow

# Import necessary libraries
import fitz  # PyMuPDF
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dense, LSTM, Bidirectional, Reshape
from tensorflow.keras.models import Model
from docx import Document
import cv2
import io
from PIL import Image
import fitz  # PyMuPDF for handling PDF files
import numpy as np
import cv2  # OpenCV for image processing
from docx import Document  # python-docx for handling DOCX files
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from google.colab import drive


# Mount Google Drive at /content/drive (Colab only); the PDF and DOCX paths
# used further down live under /content/drive/MyDrive.
drive.mount('/content/drive')

#Extracts images from a PDF file and resizes them.
def extract_images_from_pdf(pdf_path, target_size=(128, 128)):
    """Render every page of a PDF as a fixed-size grayscale image.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        target_size: ``(height, width)`` in pixels of each returned image.

    Returns:
        A ``uint8`` array of shape ``(num_pages, height, width, 1)``. A
        document without pages yields shape ``(0, height, width, 1)``.
    """
    height, width = target_size
    images = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Render straight to single-channel grayscale. The default render
            # is 3-channel RGB, which the final reshape would otherwise split
            # into three scrambled pseudo-images per page.
            pix = page.get_pixmap(colorspace=fitz.csGRAY, alpha=False)
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w)
            # cv2.resize expects dsize as (width, height), i.e. the reverse of
            # the (rows, cols) order used by the array and the reshape below.
            images.append(cv2.resize(img, (width, height)))
    # Add the trailing channel axis expected by Conv2D-style inputs.
    return np.array(images, dtype=np.uint8).reshape(-1, height, width, 1)

#Extracts text from a DOCX file.
def extract_text(docx_path):
    """Return the text of all body paragraphs of a DOCX file as one string.

    Args:
        docx_path: Path of the DOCX file to open with python-docx.

    Returns:
        The ``text`` of every entry in the document's ``paragraphs``,
        joined with single spaces.
    """
    paragraphs = Document(docx_path).paragraphs
    return ' '.join(paragraph.text for paragraph in paragraphs)

#Encodes text into sequences for neural network input
def encode_text(text, max_length=100):
    """Encode a string as one padded sequence of character indices.

    A fresh character-level Keras ``Tokenizer`` is fitted on ``text`` on
    every call, so the vocabulary is specific to that string.

    Args:
        text: The string to encode.
        max_length: Length of the returned sequence. Shorter input is
            zero-padded at the end. Longer input is truncated by
            ``pad_sequences``' default rule, because no ``truncating``
            argument is passed (Keras documents that default as ``'pre'``,
            i.e. leading characters are dropped).

    Returns:
        A tuple ``(padded, word_index, vocab_size)``: the padded index array
        for the single input sequence, the tokenizer's token-to-index
        mapping, and ``len(word_index) + 1`` (the extra slot is index 0,
        used for padding).
    """
    char_tokenizer = tf.keras.preprocessing.text.Tokenizer(
        char_level=True,
        oov_token='UNK',
    )
    char_tokenizer.fit_on_texts([text])

    encoded = char_tokenizer.texts_to_sequences([text])
    first_sequence = encoded[0]
    padded_batch = tf.keras.preprocessing.sequence.pad_sequences(
        [first_sequence],
        maxlen=max_length,
        padding='post',
    )

    vocabulary = char_tokenizer.word_index
    return padded_batch, vocabulary, len(vocabulary) + 1  # +1 for padding token

# Builds a Convolutional Recurrent Neural Network model
def build_crnn(input_shape, num_classes):
    """Build and compile a small convolutional image classifier.

    Despite the name, the network assembled here contains no recurrent
    layers: it is two Conv2D + MaxPooling2D stages (32 then 64 filters),
    a Flatten, a 1024-unit ReLU Dense layer and a softmax output.

    Args:
        input_shape: Shape of one input sample, passed to ``Input(shape=...)``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer, categorical
        cross-entropy loss and the accuracy metric.
    """
    inputs = Input(shape=input_shape)

    # Two identical conv/pool stages that differ only in filter count.
    features = inputs
    for n_filters in (32, 64):
        features = Conv2D(n_filters, (3, 3), activation='relu', padding='same')(features)
        features = MaxPooling2D((2, 2))(features)

    flat = Flatten()(features)
    hidden = Dense(1024, activation='relu')(flat)
    predictions = Dense(num_classes, activation='softmax')(hidden)

    classifier = Model(inputs=inputs, outputs=predictions)
    classifier.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

# Input files on the mounted Google Drive: the PDF to render and the DOCX
# passed to extract_text (named as its transcription).
pdf_path = '/content/drive/MyDrive/gs/Padilla - Nobleza virtuosa_testExtract.pdf'
docx_path = '/content/drive/MyDrive/gs/Padilla - 1 Nobleza virtuosa_testTranscription.docx'

# Process the data
images = extract_images_from_pdf(pdf_path)
text = extract_text(docx_path)

# Encode the text at character level. num_classes is the vocabulary size
# returned by encode_text (number of distinct tokens + 1 for padding index 0).
encoded_text, word_index, num_classes = encode_text(text)
preprocessed_images = images / 255.0  # Scale uint8 pixel values into [0, 1]

# Build the classifier. The hard-coded input shape must match the default
# target_size of extract_images_from_pdf.
model = build_crnn((128, 128, 1), num_classes)

# Prepare the training data and labels
# Simplified demonstration: a single (image, label) pair
X_train = preprocessed_images[:1]  # First image; the slice keeps the leading batch axis
# to_categorical one-hot encodes every position of the padded sequence; the
# trailing [0] keeps only the row for the first position, so the label is a
# single character class, not a whole sequence.
y_train = to_categorical(encoded_text[0], num_classes=num_classes)[0]  # No need for additional reshaping

# Train the model; np.newaxis adds the batch axis so the label batch has
# one row of num_classes entries.
model.fit(X_train, y_train[np.newaxis, :], epochs=10, batch_size=1)

print("Training complete.")

Collecting pymupdf
  Downloading PyMuPDF-1.24.0-cp310-none-manylinux2014_x86_64.whl (3.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.9/3.9 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDFb==1.24.0 (from pymupdf)
  Downloading PyMuPDFb-1.24.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.8/30.8 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.0 pymupdf-1.24.0
Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.0
Mounted at /content/drive
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
E