# SceneXplain and geospatial data

## 1. Download data

1. Download the [UC Merced Land Use dataset](http://weegee.vision.ucmerced.edu/datasets/landuse.html) and extract it into the `data` directory
2. Convert the TIF files to JPG format (`mogrify -format jpg */*.tif`)

## 2. Set options below

In [3]:
# SceneXplain options
import os  # imported here because this cell runs before the notebook's import cell

# API secret sent in the "x-api-key" header. Set the SCENEX_SECRET environment
# variable to supply your own key without editing the notebook.
# NOTE(review): the fallback literal is a credential committed to source; it
# should be rotated and removed once the environment variable is in use.
SCENEX_SECRET = os.environ.get(
    "SCENEX_SECRET",
    'p85wODuVWnHRULoOar75:ac55a397a4360e312fcf4a3c94704ff27a2dcc0eee97b963d42c01494713e28d',
)
# Default value of the "features" field sent with each image.
FEATURES = ['json', 'high_quality']

# Dataset options
# Root of the extracted dataset; images are read from f'{DATASET}/Images'.
DATASET = "./data/UCMerced_LandUse/UCMerced_LandUse"
# Number of images randomly sampled from the dataset for evaluation.
MAX_COUNT = 20

In [4]:
# from pprint import pprint
import base64
import glob
import http
import http.client
import json
import os

In [14]:
from random import sample

# Images are expected at <DATASET>/Images/<category>/<name>.jpg. Without
# recursive=True, "**" matches exactly one directory level, like "*".
image_folder = f'{DATASET}/Images'
image_files = glob.glob(f'{image_folder}/**/*.jpg')
if not image_files:
    # Fail early with an actionable message instead of sample()'s generic
    # "sample larger than population" ValueError.
    raise FileNotFoundError(
        f"No .jpg images found under {image_folder}; "
        "download the dataset and convert the TIF files to JPG first"
    )

# sample() raises ValueError when asked for more items than are available,
# so cap the request at the number of images actually found.
shuffled_files = sample(image_files, min(MAX_COUNT, len(image_files)))

In [15]:
# HTTP headers sent with SceneXplain requests: the API key (prefixed with
# "token ") and a JSON content type.
headers = {}
headers["x-api-key"] = f"token {SCENEX_SECRET}"
headers["content-type"] = "application/json"

def image_to_data_uri(file_path):
    """Return the contents of *file_path* as a base64 ``data:`` URI.

    The file is read in binary mode and the MIME type is always written as
    ``image/jpeg``, whatever the file actually contains.
    """
    with open(file_path, "rb") as handle:
        raw_bytes = handle.read()

    payload = base64.b64encode(raw_bytes).decode("utf-8")
    return "data:image/jpeg;base64," + payload


def process_image(filename, schema, features=FEATURES):
    """Send one image to the SceneXplain describe endpoint and return the reply.

    Args:
        filename: Path of the image file; it is embedded in the request as a
            base64 data URI.
        schema: JSON-serialisable schema, sent as a JSON string in the
            ``json_schema`` field.
        features: Value of the ``features`` field; defaults to ``FEATURES``.

    Returns:
        The response body parsed from JSON.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    print(f'Processing {filename}')
    data = {
        "data": [
            {"image": image_to_data_uri(filename), "features": features, "json_schema": json.dumps(schema)},
        ]
    }
    body = json.dumps(data)

    connection = http.client.HTTPSConnection("api.scenex.jina.ai")
    try:
        # `headers` is the module-level dict carrying the API key.
        connection.request("POST", "/v1/describe", body, headers)
        response = connection.getresponse()
        response_data = response.read().decode("utf-8")
    finally:
        # Release the socket even when the request or the read fails.
        connection.close()

    return json.loads(response_data)

In [8]:
def batch_process(image_list, schema, features=FEATURES):
    """Send several images to the SceneXplain describe endpoint in one request.

    Args:
        image_list: Iterable of image file paths; each file is embedded in
            the request as a base64 data URI.
        schema: JSON-serialisable schema, sent as a JSON string in every
            item's ``json_schema`` field.
        features: Value of every item's ``features`` field; defaults to
            ``FEATURES``.

    Returns:
        The response body parsed from JSON.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    # The schema is identical for every image, so serialise it once.
    schema_json = json.dumps(schema)

    items = []
    for image_filename in image_list:
        print(f"Processing {image_filename}")
        items.append(
            {"image": image_to_data_uri(image_filename), "features": features, "json_schema": schema_json}
        )
    data = {"data": items}

    headers = {
        "x-api-key": f"token {SCENEX_SECRET}",
        "content-type": "application/json",
    }

    print("Sending payload")
    connection = http.client.HTTPSConnection("api.scenex.jina.ai")
    try:
        connection.request("POST", "/v1/describe", json.dumps(data), headers)
        response = connection.getresponse()
        response_data = response.read().decode("utf-8")
    finally:
        # Release the socket even when the request or the read fails.
        connection.close()

    return json.loads(response_data)

In [9]:
# Create JSON schema - since each dataset may have different tags, we have to create it dynamically

# Only sub-directories of the image folder are categories; stray files in that
# folder are skipped, and the names are sorted so the enum order is reproducible.
category_dirs = sorted(
    entry
    for entry in os.listdir(image_folder)
    if os.path.isdir(os.path.join(image_folder, entry))
)

with open('base_schema.json', encoding='utf-8') as file:
    schema = json.load(file)

# Restrict the schema's "category" property to the categories found on disk.
schema['properties']['category']['enum'] = category_dirs

In [None]:
# Classify every sampled image and sort the outcomes into three lists:
#   successes - SceneXplain's category equals the image's directory name
#   fails     - SceneXplain returned a different category
#   errors    - no category could be read from the response
successes = []
fails = []
errors = []

for filename in shuffled_files:
    # The expected category is the name of the directory holding the image.
    category_name = os.path.basename(os.path.dirname(filename))
    image_data = process_image(filename=filename, schema=schema)
    try:
        # The English result is itself a JSON string; its "category" entry is
        # indexed to take the first element.
        scenex_category = json.loads(image_data['result'][0]['i18n']['en'])['category'][0]
    except (KeyError, IndexError, TypeError, ValueError):
        # Missing keys, empty lists, unexpected types or invalid JSON
        # (json.JSONDecodeError is a ValueError). Anything else, including
        # KeyboardInterrupt, is allowed to propagate.
        scenex_category = "error processing"

    data = {
        "image_path": filename,
        "target_category": category_name,
        "scenex_category": scenex_category
    }

    if data['target_category'] == data['scenex_category']:
        data['match'] = True
        successes.append(data)
        print("\t✅ Successful match!")
    elif data['scenex_category'] == 'error processing':
        errors.append(data)
        print("\t😭 Error")
    else:
        data['match'] = False
        fails.append(data)
        print(f"\t❌ Failed match! (Identified {data['target_category']} as {data['scenex_category']})")

Processing ./data/UCMerced_LandUse/UCMerced_LandUse/Images/freeway/freeway74.jpg
	✅ Successful match!
Processing ./data/UCMerced_LandUse/UCMerced_LandUse/Images/tenniscourt/tenniscourt48.jpg
	✅ Successful match!
Processing ./data/UCMerced_LandUse/UCMerced_LandUse/Images/mobilehomepark/mobilehomepark89.jpg
	❌ Failed match! (Identified mobilehomepark as sparseresidential)
Processing ./data/UCMerced_LandUse/UCMerced_LandUse/Images/buildings/buildings98.jpg
	✅ Successful match!
Processing ./data/UCMerced_LandUse/UCMerced_LandUse/Images/sparseresidential/sparseresidential97.jpg
	❌ Failed match! (Identified sparseresidential as forest)
Processing ./data/UCMerced_LandUse/UCMerced_LandUse/Images/sparseresidential/sparseresidential69.jpg
	❌ Failed match! (Identified sparseresidential as forest)
Processing ./data/UCMerced_LandUse/UCMerced_LandUse/Images/harbor/harbor56.jpg
	✅ Successful match!
Processing ./data/UCMerced_LandUse/UCMerced_LandUse/Images/forest/forest75.jpg
	😭 Error
Processing ./da

In [12]:
# What fraction (0-1) of the classified images did we get right?
# Images that ended up in `errors` are not counted in the denominator.
classified_total = len(successes) + len(fails)
# Report 0.0 instead of raising ZeroDivisionError when nothing was classified.
len(successes) / classified_total if classified_total else 0.0

0.8