In [1]:
import json
import os
import urllib
import urllib.parse
import urllib.request
from pathlib import Path
from pprint import pprint
from typing import Optional

import dotenv
from bs4 import BeautifulSoup
from tqdm import tqdm

## OpenAI Vision API setup

In [2]:
# Load the environment variables (including OPENAI_API_KEY) from the .env file.
dotenv.load_dotenv()

# Never echo the key itself: cell outputs are saved inside the notebook file and
# get shared/committed together with it. Only confirm that the key is present.
if not os.environ.get('OPENAI_API_KEY'):
    raise RuntimeError("OPENAI_API_KEY is not set; add it to the .env file")
print("OPENAI_API_KEY is set")

[REDACTED: API key removed from the saved output; the exposed key should be revoked and rotated]


In [3]:
from openai import OpenAI

# Shared API client used by image_understanding() below.
# NOTE(review): presumably authenticates with the OPENAI_API_KEY loaded from .env above; confirm.
client = OpenAI()

def image_understanding(
    url: str,
    model: str = "gpt-4-vision-preview",
    max_tokens: int = 200,
) -> Optional[str]:
    """Ask the vision model for a short description of a single image.

    Args:
        url: URL of the image; it is passed to the API as an ``image_url``
            content part.
        model: Name of the chat-completions model to call.
            NOTE(review): the default is a preview model; confirm it is still
            available before a long run.
        max_tokens: Upper bound on the length of the generated reply.

    Returns:
        The text of the first choice returned by the API, or ``None`` when the
        request fails. The call is deliberately best-effort so that one bad
        image does not abort a long batch run.
    """
    prompt = (
        "The image is from the website of National Park Service (NPS) in the United States. "
        "What’s in this image? Please describe it within 200 characters."
    )
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": url}},
                    ],
                }
            ],
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content
    except Exception:
        # Catch Exception rather than using a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit still stop a multi-hour loop.
        return None

    

In [4]:
# Smoke-test image_understanding() on a single image URL before running it in bulk.
url = "https://www.nps.gov/common/uploads/grid_builder/grca/crop16_9/47171AFE-A3D5-61C7-1C10631E42EFDF8F.jpg?width=640&quality=90&mode=crop"
img_text  = image_understanding(url)

In [5]:
# Show the description returned for the test image (None means the request failed).
img_text

'A campsite with a tent connected to a vehicle, surrounded by trees and rocks, indicating a blend of nature and modern camping amenities.'

In [6]:
# url = "https://www.nps.gov/grca/index.htm"

def get_images(url: str, img_formats: list=['jpg', 'jpeg']) -> Optional[dict]:
    """Fetch a web page and describe each matching image found on it.

    Args:
        url: Address of the page to download.
        img_formats: Substrings an image URL must contain (any one of them)
            to be sent for description. The list is only read, never mutated.

    Returns:
        A dict mapping each image URL to its description (newlines inside the
        description removed, one trailing newline appended). Images for which
        image_understanding() returns nothing are left out. Returns ``None``
        when the page cannot be fetched or parsed.
    """
    try:
        # The context manager closes the response even if read() raises.
        with urllib.request.urlopen(url) as response:
            page = response.read()

        # Name the parser explicitly so results do not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(page, "html.parser")

        imgs = {}
        for tag in soup.find_all('img'):
            src = tag.get('src')
            if not src:
                # <img> without a usable src: nothing to describe.
                continue

            # Resolve against the page URL so root-relative, relative and
            # already-absolute src values all become valid absolute URLs.
            img_link = urllib.parse.urljoin(url, src)
            if not img_link.startswith(("http://", "https://")):
                # Skip data: URIs and other non-web schemes.
                continue

            if any(x in img_link for x in img_formats):
                img_text = image_understanding(img_link)
                if img_text:
                    imgs[img_link] = img_text.replace("\n", "")+'\n'

        return imgs
    except Exception:
        # Best-effort: a page that fails to load is skipped by the caller.
        # Exception (not a bare except) lets KeyboardInterrupt stop the run.
        return None

In [7]:
# images = get_images(url)

## Data

In [8]:
# Path of the JSON file to load. The commented-out line is an alternative input
# (presumably a single-park subset; confirm).
# file_path='./output_grca_clean.json'
file_path='./output_all_parks_clean.json'

In [9]:
# Read the records as UTF-8 explicitly: the page text contains non-ASCII
# characters, and the platform default encoding is not UTF-8 everywhere.
data = json.loads(Path(file_path).read_text(encoding="utf-8"))

In [10]:
# Number of records loaded from the JSON file.
len(data)

99873

In [11]:
# Peek at the first record to see its structure.
data[:1]

[{'html': 'Opinion: Think about ways men and women on San Nicolas Island might have shared the division of labor. Do you agree or disagree with the custom that women in Karana’s tribe were not allowed to make weapons? Remember to consider what it might have been like to live in her particular society. Use reasons and information to support your point of view.\nInformative/explanatory: Describe the mating and breeding habits of northern elephant seals. Gather information from Voices from the Field to develop the topic. Draw evidence from informational text.\nNarrative: Imagine you are Karana watching a fight between two bull elephant seals. Write a narrative describing the battle. Include details about what you might see, hear, and smell.\nAn official form of the United States government. Provided by Touchpoints\nDownload the official NPS app before your next visit',
  'title': 'Teacher Resources: Chapter 13 - Island of the Blue Dolphins (U.S. National Park Service)',
  'url': 'https://

In [12]:
# Build `parks`, a dict mapping the text between the first pair of double quotes
# on each line of park_list.txt to the text between the following ">" and "</".
# NOTE(review): presumably each line is an HTML <option value="code">Name</option>
# entry; confirm against the file.
with open("./park_list.txt", "r", encoding="utf-8") as f:
    # The context manager closes the file even if parsing fails part-way.
    park_list = f.readlines()

parks = {}
for p in park_list:
    park_split = p.split("\"")
    if len(park_split) < 3:
        # Blank or malformed line (no quoted value): skip it instead of crashing.
        continue
    s = park_split[2]
    parks[park_split[1]] = s[s.find(">")+1:s.find("</")]

In [13]:
# Pretty-print the parsed mapping to check it by eye.
pprint(parks)

{'abli': 'Abraham Lincoln Birthplace National Historical Park',
 'acad': 'Acadia National Park',
 'adam': 'Adams National Historical Park',
 'afam': 'African American Civil War Memorial ',
 'afbg': 'African Burial Ground National Monument',
 'agfo': 'Agate Fossil Beds National Monument',
 'alag': 'Alagnak Wild River',
 'alca': 'Alcatraz Island ',
 'aleu': 'Aleutian Islands World War II National Historic Area',
 'alfl': 'Alibates Flint Quarries National Monument',
 'alka': 'Ala Kahakai National Historic Trail',
 'alpo': 'Allegheny Portage Railroad National Historic Site',
 'amch': 'Amache National Historic Site',
 'amis': 'Amistad National Recreation Area',
 'amme': 'American Memorial Park',
 'anac': 'Anacostia Park',
 'anch': 'Alaska Public Lands ',
 'ande': 'Andersonville National Historic Site',
 'ania': 'Aniakchak National Monument & Preserve',
 'anjo': 'Andrew Johnson National Historic Site',
 'anti': 'Antietam National Battlefield',
 'apco': 'Appomattox Court House National Histor

## Calling the OpenAI Vision API

In [14]:
from random import Random

# Only a random subset of the records is processed: every image costs one API
# call, so covering the full data set would take far too long.
N_PAGES = 2500      # number of records to sample
SAMPLE_SEED = 42    # fixed seed so a re-run picks the same records

# "/<key>/" path fragments, built once instead of once per record.
park_markers = tuple("/"+x+"/" for x in parks.keys())

# min() keeps sample() from raising ValueError on inputs smaller than N_PAGES.
page_indices = Random(SAMPLE_SEED).sample(range(len(data)), min(N_PAGES, len(data)))

for i in tqdm(page_indices):
    page_url = data[i]['url']
    # Skip records whose URL contains none of the park keys as a path segment.
    if not any(marker in page_url for marker in park_markers):
        continue

    images = get_images(page_url)
    if images:
        # Attach a copy of the {image URL: description} mapping to the record.
        data[i]['images'] = dict(images)
    

100%|██████████| 2500/2500 [2:45:17<00:00,  3.97s/it]  


In [15]:
# save data into a json file
file_path_img = file_path.replace(".json", "_img.json")

with open(file_path_img, "w") as fout:
    json.dump(data, fout, indent=4)