<a href="https://colab.research.google.com/github/a-donat/Useful_Templates/blob/main/Template_Compile_Image_URLs_and_Download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Template created by Annalisa Donat

This is a template that can be adapted to perform the following tasks:
1. Scrape the HTML of one or more websites for image URLs that match HTML-based criteria (this runs together with step 2).
2. Create a csv file containing the image url, an assigned id, and any other data pertaining to the image.
3. Download the images.



# Parts 1 and 2: Compile and Log Image URLs

### Import Libraries

In [22]:
from typing import *
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

In [None]:
# RUN THIS CELL IF LOGGING TIME PULLED
# (datetime.now() fills the "date_logged" column and the optional pull-time log)
from datetime import datetime


### Define Functions

In [17]:
def wrapped_find_first(pattern: str | re.Pattern, full_str: str) -> str:
  '''Returns first instance of pattern found if pattern found in full_str,
  else returns empty string'''
  try:
    return re.findall(pattern, full_str)[0]
  # If pattern is not found in full_str, it will raise an IndexError
  except IndexError:
    return ''


def wrapped_find_all(pattern: str | re.Pattern, full_str: str) -> str:
  '''Returns first instance of pattern found if pattern found in full_str,
  else returns empty list'''
  try:
    return re.findall(pattern, full_str)
  # If pattern is not found in full_str, it will raise an IndexError
  except IndexError:
    return []

**Notes on Regex**<br>
To view the HTML of a website in Chrome, right-click (or control-click on a Mac) anywhere on the page and select "View page source" from the menu that appears.

Tool for debugging regex: https://regex101.com/. Remember to select Python under the "Flavor" menu on the left.<br><br> Example regex: "first_part(.+?)last_part"

In [None]:
def placeholder_id_generator():
  """Template stub for producing image ids; returns None until implemented.

  parse_webpage() uses the return value as its list of photo ids.
  """
  # ADD YOUR CODE HERE
  return None

def parse_webpage(webpage_soup: BeautifulSoup | str) -> List[List[str]]:
  """Extract the image ids, image urls and two extra info fields from a page.

  webpage_soup is converted to its string form and searched with the
  placeholder regexes below. The result is a list of four entries in the
  order [photo_ids, photo_urls, info_1, info_2].
  """
  page_text = str(webpage_soup)

  photo_urls = wrapped_find_all('PHOTO_URL_REGEX_HERE', page_text)
  photo_ids = placeholder_id_generator()

  # OPTIONAL, PARSE OTHER INFO:
  info_1 = wrapped_find_all('INFO1_REGEX_HERE', page_text)
  info_2 = wrapped_find_all('INFO2_REGEX_HERE', page_text)

  # ADD CODE HERE TO CHECK THAT ALL LISTS ABOVE HAVE SAME LENGTH:
  parsed_fields = [photo_ids, photo_urls, info_1, info_2]
  return parsed_fields

In [None]:
def iterate_through_search_results(results_url: str, n_pages: int = 1,
                    pages_offset: int = 0,
                    max_n_pages_scrape: int = 100) -> pd.DataFrame:
  """
  Scrape consecutive search-result pages and collect the parsed image data
  into a single DataFrame.

  Inputs
  ------
  results_url: str
    url of the search results; each page url is built by replacing the
    substring "?tab=" with "?start=<offset>&tab=". If results_url does not
    contain "?tab=", the same url is requested for every page.
  n_pages >= 1
    number of pages requested
  pages_offset >= 0
    number of pages to skip before the first page pulled
  max_n_pages_scrape
    upper limit on the number of pages pulled;
    num pages pulled == min(n_pages, max_n_pages_scrape)

  Returns
  -------
  DataFrame with one row per image and the columns "image_ids",
  "image_urls", "info_1", "info_2" and "date_logged" (the time this function
  finished scraping). If no pages are pulled the DataFrame has no rows.
  """
  # Imported locally so the function also works when the optional
  # datetime import cell has not been run.
  from datetime import datetime

  # CAUTION: MUST BE SAME ORDER THAT parse_webpage() RETURNS IN
  results_vars = ["image_ids", "image_urls", "info_1", "info_2"]
  results_dict = {var_name: [] for var_name in results_vars}

  n_pages_to_pull = min(n_pages, max_n_pages_scrape)
  for i in range(n_pages_to_pull):
    print(".  ", i)
    # The start index advances by 30 per page -- presumably the number of
    # results the site shows per page; adjust for the site being scraped.
    offset = "?start=%d&tab=" % ((i+pages_offset) * 30)
    page_url = results_url.replace("?tab=", offset)
    page_html = requests.get(page_url)
    page_soup = BeautifulSoup(page_html.text, "html.parser")
    metadata = parse_webpage(page_soup)
    for var_ind, var_name in enumerate(results_vars):
      results_dict[var_name] += metadata[var_ind]

  search_results_df = pd.DataFrame.from_dict(results_dict)
  # could move to make more granular, or simply delete:
  search_results_df["date_logged"] = datetime.now()
  return search_results_df

In [5]:
# Fetch one sample page and parse it with the "html.parser" backend so the
# regexes can be tried out against real html (e.g. via str(temp_soup)).
# A "Mozilla/5.0" User-Agent header is sent with the request -- presumably
# because the site rejects the default requests User-Agent; TODO confirm.
temp_soup = BeautifulSoup(requests.get(
    "https://www.vivino.com/US-NY/en/batasiolo-barolo/w/77756?year=2012",
    headers={"User-Agent": "Mozilla/5.0"}).text, "html.parser")

### Compile and Log

Don't forget to save the resulting DataFrame with `.to_csv()`.

# Part 3: Download Photos

### Import Libraries
(assumes libraries needed for parts 1 and 2 have been imported)

In [19]:
from PIL import Image
from os import mkdir

### Define Functions

In [21]:
# Image url pattern read by download_image_opt1(); the substring "photo_id"
# in it is replaced with each image's id. Replace the placeholder before use.
url_template = "URL_TEMPLATE_HERE"


def download_image_opt1(image_id: str, folder_root: str):
  """
  Download one image whose url is built from the module-level url_template
  by replacing the substring "photo_id" with image_id, and save it as
  /content/<folder_root>/<folder_root>_<image_id>.jpg

  ex. url_template = 'https://s3-media0.fl.yelpcdn.com/bphoto/photo_id/258s.jpg'
  """
  image_url = url_template.replace("photo_id", image_id)
  # Fetch the bytes before opening the destination file.
  img_content = requests.get(image_url).content

  dest_path = "/content/%s/%s_%s.jpg" % (folder_root, folder_root, image_id)
  with open(dest_path, "wb") as out_file:
    out_file.write(img_content)


def download_image_opt2(image_url: str, image_id: str, folder_root: str):
  """
  Download the image at image_url and save it as
  /content/<folder_root>/<folder_root>_<image_id>.jpg
  """
  response = requests.get(image_url)
  # Read the bytes before opening the destination file.
  img_content = response.content

  dest_path = "/content/%s/%s_%s.jpg" % (folder_root, folder_root, image_id)
  with open(dest_path, "wb") as out_file:
    out_file.write(img_content)

In [20]:
def resize_image(photo_id: str, orig_root_folder: str, dest_root_folder: str,
                 new_width: int = 80, new_height: int = 80):
  """
  Open /content/<orig_root_folder>/<orig_root_folder>_<photo_id>.jpg,
  resize it to (new_width, new_height) and save the result as
  /content/<dest_root_folder>/<dest_root_folder>_<photo_id>.jpg
  """
  path_template = "/content/%s/%s_%s.jpg"
  source_path = path_template % (orig_root_folder, orig_root_folder, photo_id)

  target_size = (new_width, new_height)
  resized_img = Image.open(source_path).resize(target_size)

  dest_path = path_template % (dest_root_folder, dest_root_folder, photo_id)
  resized_img.save(dest_path)

### Download Images

In [None]:
# EXAMPLE
# The loop below is kept inside a string literal so it does not execute;
# copy it out of the quotes and adapt it before running.
# It calls datetime.now(), so datetime must be imported first.
# NOTE: iterate_through_search_results() names its id column "image_ids";
# adjust the my_df["image_id"] lookup to match your DataFrame's column name.
'''
image_pull_dates = {}
for x in my_df["image_id"]:
  download_image_opt1(x, "my_folder_root")
  image_pull_dates[x] = datetime.now()
  # IF LOGGING PULL TIMES, THEN SAVE THIS DATA SOMEWHERE
  print(x)'''

In [None]:
# EXAMPLE
!zip -r 'my_folder_root.zip' '/content/my_folder_root'
# DON'T FORGET TO DOWNLOAD TO DISK OR UPLOAD TO DRIVE