## Prerequisites

In [51]:
!pip install pyspark py4j pillow



### Extract metadata

In [None]:
import csv
import json
import os

import requests

# Service key sent to the API as `serviceKey`. It is read from the
# TOUR_API_KEY environment variable when that is set; otherwise the
# placeholder below is used and must be replaced by hand.
key = os.environ.get('TOUR_API_KEY', 'TOUR_API_KEY')
num_of_rows = '15'
fname_metadata = 'tour_photo.csv'

# Request parameters
base_url = 'http://apis.data.go.kr/B551011/PhotoGalleryService1/galleryList1'
params = {
    'numOfRows': num_of_rows,
    'pageNo': '1',
    'MobileOS' : 'ETC',
    'MobileApp' : 'AppTest',
    'arrange' : 'A',
    '_type' : 'json',
    'serviceKey' : key}

# Get request. Fail loudly on an HTTP error and never hang forever on a
# stalled connection.
http_response = requests.get(base_url, params=params, timeout=30)
http_response.raise_for_status()
response = http_response.json()['response']

# The photo records are nested under response -> body -> items -> item.
items = response['body']['items']['item']

# CSV header: union of the keys of all records in first-seen order, so a
# record carrying an extra field cannot make DictWriter raise ValueError.
fieldnames = list(dict.fromkeys(field for item in items for field in item))

# Save as csv file
with open(fname_metadata, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(items)


In [None]:
import csv
import os

import requests

save_folder = "tour_photos"

# open(..., 'wb') does not create missing directories, so make sure the
# target folder exists before the first image is written.
os.makedirs(save_folder, exist_ok=True)

with open(fname_metadata, 'r', newline='', encoding='utf-8') as file:
    # DictReader consumes the header row itself and yields one dict per
    # record, which is what allows the column lookup by name below.
    csv_reader = csv.DictReader(file)

    for row in csv_reader:
        img_url = row['galWebImageUrl']  # Parse image url
        img_name = img_url.split('/')[-1]  # Extract filename

        try:
            # The timeout keeps one stalled download from blocking the loop.
            response = requests.get(img_url, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            # Best effort: report the failure and move on to the next image.
            print(f"Error downloading image from {img_url}: {e}")
            continue

        # Save
        with open(f"{save_folder}/{img_name}", 'wb') as img_file:
            img_file.write(response.content)


## Exploration

In [None]:
import pandas as pd

# Load the saved metadata CSV into pandas and preview the first rows.
df = pd.read_csv(fname_metadata)
df.head()

In [None]:
# Summarize column dtypes and non-null counts of the metadata.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5525 entries, 0 to 5524
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   galContentId            5525 non-null   int64  
 1   galContentTypeId        5525 non-null   int64  
 2   galTitle                5525 non-null   object 
 3   galWebImageUrl          5525 non-null   object 
 4   galCreatedtime          5525 non-null   int64  
 5   galModifiedtime         5525 non-null   int64  
 6   galPhotographyMonth     5524 non-null   float64
 7   galPhotographyLocation  5510 non-null   object 
 8   galPhotographer         5524 non-null   object 
 9   galSearchKeyword        5523 non-null   object 
dtypes: float64(1), int64(4), object(5)
memory usage: 431.8+ KB


## Preprocessing

### Imports

In [93]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import BinaryType, IntegerType, StringType, BooleanType
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.functions import min as sql_min
from PIL import Image
import numpy as np
import io

### Create Spark Session

In [4]:
# Build (or reuse) a Spark session with a local master, named 'PhotoAnalyzer'.
spark = SparkSession.builder.master('local[*]').appName('PhotoAnalyzer').getOrCreate()

### Load images

In [6]:
# Read the files under save_folder through Spark's "image" data source.
df_img = spark.read.load(save_folder, format='image')

In [None]:
# Print the schema of the loaded image dataframe.
df_img.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)



### Validate images

In [91]:
# Image verifying UDF using PIL.Image
@udf(returnType=BooleanType())
def validate_image(image_path):
    """Return True when the file behind an image origin URI passes PIL verification.

    Args:
        image_path: The image's ``origin`` value, a ``file:`` URI string.

    Returns:
        True if PIL can open and verify the file, False otherwise. Rows that
        yield False are dropped by the filter that uses this UDF.
    """
    # Function-scope import so the UDF carries its own dependency.
    from urllib.parse import unquote, urlparse

    try:
        # Take the path component of the URI instead of slicing off a fixed
        # 7-character "file://" prefix, and undo percent-encoding (e.g. %20)
        # so the result is a real filesystem path.
        local_path = unquote(urlparse(image_path).path)
        # The context manager closes the file handle even when verify() fails.
        with Image.open(local_path) as img:
            # An error will occur if it's broken.
            img.verify()
        return True
    except Exception:
        # Any failure marks the image as invalid. Unlike a bare `except:`,
        # KeyboardInterrupt and SystemExit still propagate.
        return False

# Remember the row count before filtering so both sizes can be reported.
previous_size = df_img.count()

# Keep only the rows whose source file passes validation.
df_img = df_img.where(validate_image(col('image.origin')))
print(f"Previous dataframe's size : {previous_size}")
print(f"Filtered dataframe's size : {df_img.count()}")

Previous dataframe's size : 15
Filtered dataframe's size : 8


### Resize

In [92]:
@udf(returnType=IntegerType())
def get_min(image : Row):
    """Return the smaller of the image struct's width and height."""
    width, height = image['width'], image['height']
    return width if width <= height else height

# Shorter side of every image, then the smallest of those over the whole dataframe.
shortest_sides = df_img.select(get_min("image").alias("_"))
minimum_length = shortest_sides.select(sql_min('_')).first()[0]

print(minimum_length)

532


In [94]:
def bgr2rgb(img_array : np.ndarray) -> np.ndarray:
    """Reverse the last axis of ``img_array``, turning B,G,R order into R,G,B.

    The result is a view of the input, not a copy.
    """
    return np.flip(img_array, axis=-1)

def rgb2bgr(img_array: np.ndarray) -> np.ndarray:
    """Reverse the last axis of ``img_array``, turning R,G,B order into B,G,R.

    The result is a view of the input, not a copy.
    """
    return np.flip(img_array, axis=-1)

def data2arr(data : bytearray, width : int, height : int, channels : int = 3) -> np.ndarray:
    """Interpret a flat byte buffer as a ``(height, width, channels)`` uint8 array.

    Args:
        data: Raw pixel bytes, one byte per channel value.
        width: Image width in pixels.
        height: Image height in pixels.
        channels: Number of values per pixel. Defaults to 3, the value that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        A uint8 array of shape ``(height, width, channels)``.

    Raises:
        ValueError: If ``len(data)`` does not equal ``height * width * channels``.
    """
    # bytes(data) takes an immutable snapshot of the buffer for frombuffer.
    flat = np.frombuffer(bytes(data), np.uint8)
    return flat.reshape((height, width, channels))

def arr2data(nparr: np.ndarray) -> bytearray:
    """Serialize an array to a ``bytearray`` of its elements in row-major order."""
    raw = nparr.tobytes()
    return bytearray(raw)

@udf(returnType=StringType())
def parse_filename(image : Row):
    """Return the file name of the image's origin without directory or extension.

    Args:
        image: Image struct whose ``origin`` field is a '/'-separated path or URI.

    Returns:
        The last path segment with its extension removed, whatever the
        extension's length. A segment without an extension is returned unchanged.
    """
    # posixpath, because the origin always uses '/' as separator.
    import posixpath

    base_name = posixpath.basename(image['origin'])
    # splitext instead of a fixed [:-4] slice, which is only correct for
    # three-character extensions such as ".jpg".
    return posixpath.splitext(base_name)[0]

@udf(returnType=BinaryType())
def crop(row : Row, x : int = minimum_length) -> bytearray:
    """Center-crop an image struct to an ``x`` by ``x`` square and return its raw bytes.

    Args:
        row: Image struct providing ``width``, ``height`` and ``data``; the
            data is handled as 3-channel pixels in B,G,R order.
        x: Side length of the square crop. The default is the value
            ``minimum_length`` had when this function was defined.

    Returns:
        The cropped pixels as a bytearray, again in B,G,R order.
    """
    w, h = row['width'], row['height']
    # Calculate the coordinates to trim. Integer offsets are used on purpose:
    # a float box is rounded per coordinate by PIL, which for an odd margin
    # can yield a crop of x + 1 pixels instead of x.
    l, t = (w - x) // 2, (h - x) // 2
    r, b = l + x, t + x
    # Read bytearray as numpy array
    nparr = data2arr(row['data'], w, h)
    # Change the order of the color channels for PIL
    nparr = bgr2rgb(nparr)
    # Load image
    image = Image.fromarray(nparr, mode='RGB')
    # Crop
    image_cropped = image.crop((l, t, r, b))
    # Encode the image as bytearray again, restoring the B,G,R channel order
    nparr_c = np.array(image_cropped)
    nparr_c = rgb2bgr(nparr_c)
    return arr2data(nparr_c)
# Columns of the cropped-image dataframe: file id, cropped bytes,
# square side length and channel order.
resized_columns = [
    parse_filename('image').alias('id'),
    crop('image').alias('data'),
    lit(minimum_length).alias('size'),
    lit('BGR').alias('mode'),
]
df_resized = df_img.select(*resized_columns)
df_resized.show()



+-------+--------------------+----+----+
|     id|                data|size|mode|
+-------+--------------------+----+----+
|3106626|[CE CC 8B BF BD 7...| 532| BGR|
|3108458|[ED E5 DE EE E6 D...| 532| BGR|
|3107008|[68 51 CB 8E 77 E...| 532| BGR|
|3106911|[02 3D 7B 01 3E 7...| 532| BGR|
|3106847|[5B 2F 00 5D 31 0...| 532| BGR|
|3106888|[02 05 49 02 05 4...| 532| BGR|
|3108341|[E7 E4 DC E7 E4 D...| 532| BGR|
|3106632|[4F 01 E8 50 01 E...| 532| BGR|
+-------+--------------------+----+----+



### Conversion into gray scale