In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.model_selection import train_test_split
from PIL import Image
import fitz  # PyMuPDF
import io

In [2]:
# Function to load and preprocess images
def load_image(file_path, target_size=(128, 128)):
    img = load_img(file_path, target_size=target_size)
    img = img_to_array(img)
    img = img / 255.0  # Normalize pixel values to [0, 1]
    return img

# Function to load text files
def load_text(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    except UnicodeDecodeError:
        with open(file_path, 'r', encoding='latin-1') as file:
            text = file.read()
    return text

# Function to encode text data to fixed-size vector
def encode_text(text, max_length=500):
    # Simple character-level encoding for demonstration
    text = text[:max_length].ljust(max_length)
    encoded = [ord(char) for char in text]
    return np.array(encoded)

In [3]:
# Function to build the CNN model
def build_image_model():
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(2, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model
    

In [4]:
# Function to build the text model
def build_text_model(input_shape):
    model = Sequential([
        Dense(64, activation='relu', input_shape=input_shape),
        Dropout(0.5),
        Dense(2, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [5]:
# Function to prepare data from the directories
def prepare_data(image_dir, text_dir):
    image_paths = [os.path.join(image_dir, filename) for filename in os.listdir(image_dir) if filename.endswith(('.png', '.jpg', '.jpeg'))]
    text_paths = [os.path.join(text_dir, filename) for filename in os.listdir(text_dir) if filename.endswith('.txt')]

    images = np.array([load_image(path) for path in image_paths])
    texts = np.array([encode_text(load_text(path)) for path in text_paths])

    # Create labels: 0 for images, 1 for texts
    image_labels = np.zeros(len(images))
    text_labels = np.ones(len(texts))

    return images, image_labels, texts, text_labels

In [6]:
# Paths to your data directories
base_dir = r'C:\Users\MSI\classify-img-txt\data'
image_dir = os.path.join(base_dir, 'images')
text_dir = os.path.join(base_dir, 'text')


In [7]:
# Prepare data
images, image_labels, texts, text_labels = prepare_data(image_dir, text_dir)




In [8]:
# Train image model
x_train_img, x_test_img, y_train_img, y_test_img = train_test_split(images, image_labels, test_size=0.2, random_state=42)
image_model = build_image_model()
image_model.fit(x_train_img, y_train_img, epochs=10, validation_split=0.2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 225ms/step - accuracy: 0.6810 - loss: 0.3169 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 2/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 141ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 3/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 140ms/step - accuracy: 1.0000 - loss: 1.5979e-08 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 4/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 140ms/step - accuracy: 1.0000 - loss: 2.5774e-09 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 5/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 133ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 6/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 133ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e

<keras.src.callbacks.history.History at 0x2407707c1c0>

In [9]:
# Train image model
x_train_img, x_test_img, y_train_img, y_test_img = train_test_split(images, image_labels, test_size=0.2, random_state=42)
image_model = build_image_model()
image_model.fit(x_train_img, y_train_img, epochs=10, validation_split=0.2)

Epoch 1/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 187ms/step - accuracy: 0.9003 - loss: 0.2066 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 2/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 149ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 3/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 149ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 4/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 158ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 5/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 143ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 6/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 133ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e

<keras.src.callbacks.history.History at 0x2407a46d960>

In [10]:
# Train text model
x_train_txt, x_test_txt, y_train_txt, y_test_txt = train_test_split(texts, text_labels, test_size=0.2, random_state=42)
text_model = build_text_model((texts.shape[1],))
text_model.fit(x_train_txt, y_train_txt, epochs=10, validation_split=0.2)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9036 - loss: 42.2014 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 2/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9945 - loss: 1.2880 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 3/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 4/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 5/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9961 - loss: 0.0964 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 6/10
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 7/10
[1

<keras.src.callbacks.history.History at 0x2407707d150>

In [11]:
# Function to encode image data to binary
def encode_image_to_binary(data):
    data = (data * 255).astype(np.uint8)  # Convert to uint8
    binary_data = ''.join(format(byte, '08b') for byte in data.flatten())
    return binary_data

In [12]:
# Function to encode text data to binary
def encode_text_to_binary(data):
    binary_data = ''.join(format(byte, '08b') for byte in data)
    return binary_data

In [13]:
def extract_from_pdf(file_path):
    pdf_document = fitz.open(file_path)
    images = []
    texts = []
    
    for page_number in range(len(pdf_document)):
        page = pdf_document.load_page(page_number)
        
        # Extract text
        text = page.get_text()
        if text:
            texts.append(text)
        
        # Extract images
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))
            image = image.convert('RGB')  # Ensure image is in RGB format
            images.append(image)
    
    return images, texts


In [16]:
# Function to classify and encode content from a PDF
def classify_and_encode_pdf(file_path):
    images, texts = extract_from_pdf(file_path)
    binary_data = ""
    
    for image in images:
        image = image.resize((128, 128))
        data = img_to_array(image) / 255.0
        data = np.expand_dims(data, axis=0)
        label = np.argmax(image_model.predict(data), axis=1)[0]
        binary_data += encode_image_to_binary(data)
    
    for text in texts:
        encoded_data = encode_text(text)
        data = np.expand_dims(encoded_data, axis=0)
        label = np.argmax(text_model.predict(data), axis=1)[0]
        binary_data += encode_text_to_binary(encoded_data)
    
    return binary_data

In [21]:
# Function to save binary data to a text file
def save_binary_data_to_file(file_path, binary_data):
    # Generate a valid file name based on the PDF file name
    file_name = os.path.basename(file_path)
    base_name, _ = os.path.splitext(file_name)
    binary_file_name = base_name + '.txt'  # Use original file name with .txt extension
    output_dir = r'C:\Users\MSI\classify-img-txt\output'
    os.makedirs(output_dir, exist_ok=True)
    output_file_path = os.path.join(output_dir, binary_file_name)
    
    try:
        with open(output_file_path, 'w') as f:
            f.write(binary_data)
        print(f"Binary data saved to: {output_file_path}")
    except Exception as e:
        print(f"Error saving binary data to file: {e}")

In [23]:
# Example usage
file_path = r'C:\Users\MSI\Downloads\meiser_DNA_storage_2019 (1).pdf'  # Change to the path of the PDF you want to process
binary_data = classify_and_encode_pdf(file_path)
save_binary_data_to_file(file_path, binary_data)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15