In [None]:
# Install and import necessary libraries
!pip install pymupdf tensorflow python-docx
import fitz  # PyMuPDF for PDF processing
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import docx
from google.colab import drive

# Mount Google Drive at /content/drive (uses google.colab, so this cell is meant
# to run inside Colab). The data paths used later in this cell live under this
# mount point.
drive.mount('/content/drive')

# Function to render each PDF page as a resized, normalized RGB image
def extract_images_from_pdf(pdf_path, target_size=(224, 224)):
    """Render every page of a PDF into a batch of normalized RGB images.

    Each page is rasterized with PyMuPDF's default pixmap settings, resized
    to ``target_size`` with ``tf.image.resize`` and scaled to [0, 1].

    Args:
        pdf_path: Path of the PDF file to open.
        target_size: ``(height, width)`` every page is resized to. Defaults
            to ``(224, 224)``.

    Returns:
        A float array of shape ``(num_pages, height, width, 3)`` with values
        in [0, 1]. A document without pages yields an empty batch of shape
        ``(0, height, width, 3)``.
    """
    images = []
    # The context manager closes the document even if rendering a page fails;
    # previously the file handle was never released.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap()
            # pix.samples is a flat byte buffer; view it as (rows, cols, channels).
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
            img = tf.image.resize(img, target_size).numpy()  # Resize for model input
            images.append(img[..., :3])  # Drop any channel beyond RGB (e.g. alpha)

    if not images:
        # Keep a well-formed 4-D shape so callers can still read .shape[0].
        return np.zeros((0, *target_size, 3), dtype=np.float32)
    return np.stack(images) / 255.0  # Normalize pixel values to [0, 1]

# Function to turn the text of a DOCX file into padded token-id rows
def extract_and_preprocess_text(docx_path, num_samples, max_length=50, vocab_size=10000):
    """Tokenize the text of a DOCX file and return ``num_samples`` rows of ids.

    All paragraphs are joined with single spaces into one string, a Keras
    ``Tokenizer`` is fitted on that string, and the resulting id sequence is
    padded (at the end) or truncated to ``max_length`` by ``pad_sequences``.

    NOTE(review): because the document is tokenized as a single text, only
    one real sequence is produced; when ``num_samples`` is larger, the
    remaining rows are filled with zeros. Confirm this is the intended
    pairing between text and images.

    Args:
        docx_path: Path of the DOCX file to read.
        num_samples: Number of rows the returned array should have.
        max_length: Length of every returned row.
        vocab_size: ``num_words`` limit passed to the tokenizer.

    Returns:
        An integer array with at most ``num_samples`` rows and ``max_length``
        columns.
    """
    document = docx.Document(docx_path)
    joined_text = " ".join(paragraph.text for paragraph in document.paragraphs)

    # The tokenizer API works on a list of texts; here the list has one entry.
    corpus = [joined_text]
    word_indexer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    word_indexer.fit_on_texts(corpus)
    token_ids = word_indexer.texts_to_sequences(corpus)
    token_matrix = tf.keras.preprocessing.sequence.pad_sequences(
        token_ids, maxlen=max_length, padding='post'
    )

    # Append all-zero rows until there is one row per image.
    missing_rows = num_samples - len(token_matrix)
    if missing_rows > 0:
        token_matrix = np.pad(token_matrix, [(0, missing_rows), (0, 0)], mode='constant')
    return token_matrix[:num_samples]

# Build the two-input model: frozen CNN image features plus averaged token embeddings
def build_model(image_shape, vocab_size, max_length):
    """Create and compile a model that classifies over the vocabulary.

    The image branch runs a frozen, ImageNet-initialized EfficientNetB0
    (without its classification head) followed by global average pooling.
    The text branch embeds token ids into 512-dimensional vectors and
    averages them over the sequence; no attention/Transformer layers are
    involved. Both feature vectors are concatenated, passed through a
    256-unit ReLU layer and a softmax over ``vocab_size`` classes.

    Args:
        image_shape: Shape of one input image, e.g. ``(224, 224, 3)``.
        vocab_size: Embedding input dimension and number of output classes.
        max_length: Length of each token-id input row.

    Returns:
        A compiled Keras model taking ``[images, token_ids]`` as inputs,
        using the Adam optimizer and sparse categorical cross-entropy.
    """
    # Image branch: pretrained backbone kept frozen (also run in inference mode).
    backbone = tf.keras.applications.EfficientNetB0(
        input_shape=image_shape,
        include_top=False,
        weights='imagenet',
    )
    backbone.trainable = False
    image_input = layers.Input(shape=image_shape)
    image_features = layers.GlobalAveragePooling2D()(backbone(image_input, training=False))

    # Text branch: embed the token ids and average over the sequence axis.
    transformer_input = layers.Input(shape=(max_length,))
    token_embeddings = layers.Embedding(input_dim=vocab_size, output_dim=512)(transformer_input)
    text_features = layers.GlobalAveragePooling1D()(token_embeddings)

    # Fusion head: concatenate both feature vectors and classify.
    fused = layers.Concatenate()([image_features, text_features])
    hidden = layers.Dense(256, activation='relu')(fused)
    outputs = layers.Dense(vocab_size, activation='softmax')(hidden)

    model = models.Model(inputs=[image_input, transformer_input], outputs=outputs)
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

# Paths to your data files
pdf_path = '/content/drive/MyDrive/gs/Padilla - Nobleza virtuosa_testExtract.pdf'
docx_path = '/content/drive/MyDrive/gs/Padilla - 1 Nobleza virtuosa_testTranscription.docx'

# Model parameters, defined once so preprocessing and the model cannot drift apart
image_shape = (224, 224, 3)
vocab_size = 10000  # This should be adjusted based on the tokenizer's vocabulary size
max_length = 50  # Adjust based on your processed text data

# Extract and preprocess data
images = extract_images_from_pdf(pdf_path)
num_samples = images.shape[0]  # Number of images extracted
if num_samples == 0:
    # Fail early with a clear message instead of an opaque error inside fit().
    raise ValueError(f"No pages could be extracted from {pdf_path!r}")
text_data = extract_and_preprocess_text(
    docx_path, num_samples, max_length=max_length, vocab_size=vocab_size
)

# Keras EfficientNet models rescale their input internally and expect pixel
# values in [0, 255]; `images` is in [0, 1], so scale it back for the model.
model_images = images * 255.0

# WARNING: placeholder targets only. Random integers carry no signal, so the
# model cannot learn anything useful until real labels are supplied here.
labels = np.random.randint(0, vocab_size, size=(num_samples, 1))

# Build and train the model
model = build_model(image_shape, vocab_size, max_length)
model.fit([model_images, text_data], labels, epochs=10, batch_size=32)


Collecting pymupdf
  Downloading PyMuPDF-1.24.0-cp310-none-manylinux2014_x86_64.whl (3.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.9/3.9 MB 11.0 MB/s eta 0:00:00
Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 239.6/239.6 kB 13.6 MB/s eta 0:00:00
Collecting PyMuPDFb==1.24.0 (from pymupdf)
  Downloading PyMuPDFb-1.24.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 30.8/30.8 MB 9.6 MB/s eta 0:00:00
Installing collected packages: python-docx, PyMuPDFb, pymupdf
Successfully installed PyMuPDFb-1.24.0 pymupdf-1.24.0 python-docx-1.1.0
Mounted at /content/drive
Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
Epoch 1/10
