In [2]:
!pip install pymupdf tensorflow transformers
!pip install python-docx
# Install necessary libraries
!pip install pymupdf tensorflow transformers python-docx

import fitz  # PyMuPDF
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D, LayerNormalization, Embedding
from tensorflow.keras.models import Model
from transformers import BertTokenizer, TFBertModel
import docx
from google.colab import drive

# Mount Google Drive at /content/drive; the PDF path used later in this cell
# points below this mount point, so this must run before the file is opened.
drive.mount('/content/drive')

def extract_images_from_pdf(pdf_path, target_size=(224, 224)):
    """Render every page of a PDF into one normalized image batch.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).
        target_size: ``(height, width)`` each rendered page is resized to.
            Defaults to ``(224, 224)``.

    Returns:
        A tensor holding one resized image per page, restricted to the first
        three channels and divided by 255.
    """
    images = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap()
            # pix.samples is a flat byte buffer; view it as (height, width, channels).
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
            img = tf.image.resize(img, target_size)
            # Keep only the first three channels so an alpha channel, if any, is dropped.
            images.append(img[..., :3])
    # Stack the per-page images and scale the 0-255 byte values down by 255.
    return tf.convert_to_tensor(images) / 255.0

def extract_text_from_docx(docx_path):
    """Return the text of every paragraph in a .docx file, joined by single spaces."""
    document = docx.Document(docx_path)
    return " ".join(paragraph.text for paragraph in document.paragraphs)

# Simple self-supervised learning example: the pretext task is image reconstruction (an autoencoder).
def build_autoencoder(image_shape=(224, 224, 3)):
    """Build and compile a small convolutional autoencoder.

    The encoder is two Conv2D + MaxPooling2D stages (16, then 8 filters); the
    decoder mirrors them with Conv2D + UpSampling2D stages and ends in a
    3-filter sigmoid Conv2D.

    Args:
        image_shape: Shape of a single input image, ``(height, width, channels)``.

    Returns:
        The Keras ``Model``, compiled with the Adam optimizer and binary
        cross-entropy loss.
    """
    layers = tf.keras.layers
    inputs = Input(shape=image_shape)
    relu_same = {"activation": "relu", "padding": "same"}

    # Layers are listed in the order they are applied to the input.
    stack = [
        # Encoder: each pooling stage halves the spatial size.
        layers.Conv2D(16, (3, 3), **relu_same),
        layers.MaxPooling2D((2, 2), padding="same"),
        layers.Conv2D(8, (3, 3), **relu_same),
        layers.MaxPooling2D((2, 2), padding="same"),
        # Decoder: each upsampling stage doubles the spatial size.
        layers.Conv2D(8, (3, 3), **relu_same),
        layers.UpSampling2D((2, 2)),
        layers.Conv2D(16, (3, 3), **relu_same),
        layers.UpSampling2D((2, 2)),
        # Sigmoid keeps the reconstruction in the same 0-1 range as the inputs.
        layers.Conv2D(3, (3, 3), activation="sigmoid", padding="same"),
    ]

    tensor = inputs
    for layer in stack:
        tensor = layer(tensor)

    model = Model(inputs, tensor)
    model.compile(optimizer="adam", loss="binary_crossentropy")
    return model

pdf_path = '/content/drive/MyDrive/gs/Padilla - Nobleza virtuosa_testExtract.pdf'
images = extract_images_from_pdf(pdf_path)

# Build and train the autoencoder.
autoencoder = build_autoencoder()
# The page images are both input and target: the model learns to reconstruct them.
# Keep the History returned by fit() so the per-epoch loss can be inspected later,
# instead of only echoing the object's repr as the cell output.
history = autoencoder.fit(images, images, epochs=10, batch_size=32)

# This autoencoder is a simplistic example of a self-supervised learning model focusing on image data.


Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx
Successfully installed python-docx-1.1.0
Mounted at /content/drive
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78e4c85d7250>

In [3]:
!pip install pymupdf tensorflow transformers python-docx

import fitz  # For PDF processing
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape, Conv2D, MaxPooling2D, UpSampling2D
from tensorflow.keras.models import Model
from transformers import BertTokenizer, TFBertModel
import docx

# Input files on Google Drive. This cell does not mount Drive itself: it relies on
# drive.mount('/content/drive') having already been run earlier in the session.
# PDF whose pages are rendered to images by extract_images_from_pdf below.
pdf_path = '/content/drive/MyDrive/gs/Padilla - Nobleza virtuosa_testExtract.pdf'
# NOTE(review): docx_path is defined but never read in this cell; presumably it is
# meant as the input of extract_text_from_docx — confirm or remove.
docx_path = '/content/drive/MyDrive/gs/Padilla - 1 Nobleza virtuosa_testTranscription.docx'

def extract_images_from_pdf(pdf_path, target_size=(128, 128)):
    """Render every page of a PDF into one normalized NumPy image batch.

    Args:
        pdf_path: Path of the PDF file, opened with PyMuPDF (``fitz``).
        target_size: ``(height, width)`` each rendered page is resized to.
            Defaults to ``(128, 128)``.

    Returns:
        A NumPy array holding one resized image per page, divided by 255.
        Four-channel (RGBA) renderings are reduced to their first three channels.
    """
    images = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap()
            # pix.samples is a flat byte buffer; view it as (height, width, channels).
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
            # Resize with TensorFlow, then go back to a NumPy array.
            img = tf.image.resize(img, target_size).numpy()
            if pix.n == 4:  # Convert RGBA to RGB by dropping the alpha channel
                img = img[..., :3]
            images.append(img)
    # Stack the per-page images and scale the 0-255 byte values down by 255.
    return np.array(images) / 255.0

def extract_text_from_docx(docx_path):
    """Read a .docx file and return its paragraph texts joined with single spaces."""
    pieces = []
    for paragraph in docx.Document(docx_path).paragraphs:
        pieces.append(paragraph.text)
    return " ".join(pieces)

# Simple autoencoder architecture for demonstration
def build_autoencoder(image_shape=(128, 128, 3)):
    """Build and compile a small convolutional autoencoder.

    The encoder is two Conv2D + MaxPooling2D stages (16, then 8 filters); the
    decoder mirrors them with Conv2D + UpSampling2D stages and ends in a
    sigmoid Conv2D with as many filters as the input has channels.

    Args:
        image_shape: ``(height, width, channels)`` of one input image.
            Defaults to ``(128, 128, 3)``. Height and width must be positive
            multiples of 4: the two 2x poolings followed by two 2x upsamplings
            only reproduce the input size in that case.

    Returns:
        The Keras ``Model``, compiled with the Adam optimizer and binary
        cross-entropy loss.

    Raises:
        ValueError: If ``image_shape`` does not have three entries, or its
            height or width is not a positive multiple of 4.
    """
    if len(image_shape) != 3:
        raise ValueError(
            f"image_shape must be (height, width, channels), got {image_shape!r}"
        )
    height, width, channels = image_shape
    for axis_name, size in (("height", height), ("width", width)):
        # A None dimension is left unchecked (Keras treats it as unspecified).
        if size is not None and (size <= 0 or size % 4 != 0):
            raise ValueError(
                f"image {axis_name} must be a positive multiple of 4, got {size!r}"
            )

    input_img = Input(shape=image_shape)
    # Encoder: each pooling stage halves the spatial size.
    x = Conv2D(16, (3, 3), activation='relu', padding='same')(input_img)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Conv2D(8, (3, 3), activation='relu', padding='same')(x)
    encoded = MaxPooling2D((2, 2), padding='same')(x)

    # Decoder: each upsampling stage doubles the spatial size.
    x = Conv2D(8, (3, 3), activation='relu', padding='same')(encoded)
    x = UpSampling2D((2, 2))(x)
    x = Conv2D(16, (3, 3), activation='relu', padding='same')(x)
    x = UpSampling2D((2, 2))(x)
    # The output has as many channels as the input so the input can be the target.
    decoded = Conv2D(channels, (3, 3), activation='sigmoid', padding='same')(x)

    autoencoder = Model(input_img, decoded)
    autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
    return autoencoder

images = extract_images_from_pdf(pdf_path)
autoencoder = build_autoencoder()
# The page images are both input and target: the model learns to reconstruct them.
# Keep the History returned by fit() so the per-epoch loss can be inspected later,
# instead of only echoing the object's repr as the cell output.
history = autoencoder.fit(images, images, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x78e4c96a2170>