In [11]:
from PIL import Image
import io
import os

# Convert image to binary data
def image_to_binary(image_path):
    """Return the raw bytes stored in the file at ``image_path``.

    The file is opened in binary mode and read in full; it is closed before
    the function returns, whether or not the read succeeds.
    """
    source = open(image_path, 'rb')
    try:
        return source.read()
    finally:
        source.close()

# Save binary data to a file
def save_binary_data(binary_data, output_path):
    """Write ``binary_data`` to ``output_path``, creating parent directories as needed.

    Args:
        binary_data: Bytes-like object to write.
        output_path: Destination file path. May be a bare file name, in which
            case the file is written relative to the current working directory.
    """
    output_directory = os.path.dirname(output_path)

    # dirname() is '' for a bare file name and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when there is one.
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the same directory.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    with open(output_path, 'wb') as file:
        file.write(binary_data)

# Reconstruct image from binary data
def reconstruct_image(binary_data, output_path):
    """Open ``binary_data`` as an image with PIL and save it to ``output_path``.

    Args:
        binary_data: Encoded image bytes, wrapped in an in-memory buffer
            before being handed to ``Image.open``.
        output_path: File path passed to the image's ``save`` method.
    """
    buffer = io.BytesIO(binary_data)
    decoded = Image.open(buffer)
    decoded.save(output_path)

# Specify the image file path
image_path = "/home/axat/personal/pySpark/data/Screenshot_20231122_151744.png"

# Read the image file's raw bytes into memory
binary_data = image_to_binary(image_path)

# Save the same bytes to a .bin file (parent directory is created if missing)
binary_file_path = "/home/axat/personal/pySpark/output/image_binary.bin"
save_binary_data(binary_data, binary_file_path)

# Reconstruct the image from the in-memory bytes (binary_data); the .bin file
# written above is not read back here
reconstructed_image_path = "/home/axat/personal/pySpark/output/reconstructed_image.png"
reconstruct_image(binary_data, reconstructed_image_path)


In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BinaryType

# Create a Spark session
spark = SparkSession.builder.appName("ImageBinaryProcessing").getOrCreate()

# Specify the image file path
image_path = "/home/axat/personal/pySpark/data/Screenshot_20231122_151744.png"

# Schema with FileName/ImageData fields.
# NOTE(review): it is never passed to the reader below, so it has no effect on
# image_df in this cell; the printed schema is path/modificationTime/length/content.
schema = StructType([StructField("FileName", StringType(), True),
                     StructField("ImageData", BinaryType(), True)])

# Read the file through Spark's "binaryFile" data source; pathGlobFilter
# restricts the load to paths matching *.png
image_df = spark.read.format("binaryFile").option("pathGlobFilter", "*.png").load(image_path)

# Print the column names and types, then the rows (long values are truncated
# in the default show() output)
image_df.printSchema()
image_df.show()


# Stop the Spark session
spark.stop()


root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)



                                                                                

+--------------------+--------------------+-------+--------------------+
|                path|    modificationTime| length|             content|
+--------------------+--------------------+-------+--------------------+
|file:/home/axat/p...|2023-11-22 15:17:...|2215342|[89 50 4E 47 0D 0...|
+--------------------+--------------------+-------+--------------------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BinaryType
from PIL import Image
import io
import base64

# Create a Spark session
spark = SparkSession.builder.appName("ImageBinaryProcessing").getOrCreate()

# Specify the image file path
image_path = "/home/axat/personal/pySpark/data/Screenshot_20231122_151744.png"

# Schema with FileName/ImageData fields, kept for reference only: it is not
# passed to the reader below, so it does not shape image_df.
schema = StructType([StructField("FileName", StringType(), True),
                     StructField("ImageData", BinaryType(), True)])

# Read the file through the "binaryFile" data source and keep only the file
# path (renamed to FileName) and the raw bytes (content).
image_df = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.png")
    .load(image_path)
    .withColumnRenamed("path", "FileName")
    .select("FileName", "content")
)

# Preview the DataFrame. Show the byte count rather than the bytes themselves:
# show(truncate=False) on the content column would print every byte of the
# image as hex text in the cell output.
image_df.selectExpr("FileName", "length(content) AS content_bytes").show(truncate=False)

# Decode a string of hexadecimal digits into raw bytes
def hex_to_binary(hex_string):
    """Return the bytes encoded by ``hex_string``.

    Thin wrapper around ``bytes.fromhex``; any error it raises for malformed
    input propagates to the caller unchanged.
    """
    decoded = bytes.fromhex(hex_string)
    return decoded

# Write the bytes held in one image_df row back out as an image file
def reconstruct_image(row, output_dir="/home/axat/personal/pySpark/output"):
    """Write one row's image bytes to ``<output_dir>/reconstructed_<file name>``.

    Note: this definition shares its name with the two-argument
    ``reconstruct_image(binary_data, output_path)`` helper defined earlier in
    the notebook and replaces it once this cell has run.

    Args:
        row: Mapping-like record with a ``"FileName"`` entry (source path of
            the image, e.g. ``file:/dir/pic.png``) and a ``"content"`` entry
            holding the image data, either as a bytes-like object or as a
            hexadecimal string.
        output_dir: Directory that receives the reconstructed file. It is
            created if it does not exist.

    Returns:
        A ``(source_path, output_path)`` tuple, so the function can be used
        with ``rdd.map(...).toDF()``, which needs a non-None record per row.
    """
    # Local import: this cell's import list does not include os.
    import os

    source_path = row["FileName"]
    content = row["content"]

    # The content column is binary, so it is normally bytes-like already;
    # bytes.fromhex() only accepts str and is kept for hex-string input.
    if isinstance(content, str):
        binary_data = bytes.fromhex(content)
    else:
        binary_data = bytes(content)

    # FileName is a full path such as "file:/dir/pic.png"; only its last
    # component is usable as part of an output file name.
    base_name = os.path.basename(source_path)
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"reconstructed_{base_name}")

    with open(output_path, "wb") as file:
        file.write(binary_data)

    return (source_path, output_path)

# Run reconstruct_image over every row with map (instead of foreach) and turn
# the mapped results into a DataFrame.
# NOTE(review): toDF() has to build a schema from whatever reconstruct_image
# returns, so that function must return a tuple/Row for each input row — confirm.
reconstructed_images_df = image_df.rdd.map(reconstruct_image).toDF()

# Write reconstructed_images_df (the map results) as Parquet, overwriting any
# previous output; the image files themselves are written inside reconstruct_image
reconstructed_images_df.write.mode("overwrite").parquet("/home/axat/personal/pySpark/output/reconstructed_images")

# Stop the Spark session
spark.stop()
