### This notebook shows an example of scraping images from websites. I use the images for academic research for my master's thesis.

### 1 - Import data and links

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the full dataset from the local CSV file.
data=pd.read_csv("all_data.csv")

In [3]:
# Keep every column except the final four, which hold text data.
data = data.iloc[:, :-4]

In [4]:
# Preview the first two rows to check the remaining columns.
data.head(2)

Unnamed: 0,ID,Year,Season,Collection Name,Designer ID,Designer Name,Collection link
0,1,2010,spring,spring-2010-ready-to-wear,1,alexander-mcqueen,https://www.vogue.com/fashion-shows/spring-201...
1,2,2011,spring,spring-2011-ready-to-wear,1,alexander-mcqueen,https://www.vogue.com/fashion-shows/spring-201...


#### Create template filenames for the images

In [5]:
# Build a filename prefix "<year>-<season>-<designer>-" for every row;
# the image number is appended to this prefix when a file is saved.
data['File_names'] = [
    '-'.join((str(year), season, designer)) + '-'
    for year, season, designer in zip(data['Year'], data['Season'], data['Designer Name'])
]

In [6]:
# Show the filename prefix generated for the second row.
data["File_names"].iloc[1]

'2011-spring-alexander-mcqueen-'

In [7]:
# Show the collection URL of the second row.
data["Collection link"].iloc[1]

'https://www.vogue.com/fashion-shows/spring-2011-ready-to-wear/alexander-mcqueen'

#### Create links that lead directly to the images

In [8]:
# Append the "#collection" fragment to every collection URL.
data["Images link"] = data["Collection link"].map(lambda link: "{}#collection".format(link))

In [9]:
# Show the image-gallery URL built for the second row.
data["Images link"].iloc[1]

'https://www.vogue.com/fashion-shows/spring-2011-ready-to-wear/alexander-mcqueen#collection'

In [10]:
# Save the table, including the new filename and image-link columns, without the index.
data.to_csv("image_links.csv",index=False)

#### Read the saved file back in (in a fresh kernel, re-run the import cell in section 1 first):

In [2]:
# Load the table of image links and filename prefixes from image_links.csv.
data=pd.read_csv("image_links.csv")

In [3]:
# Pull the page URLs and the filename prefixes out as two parallel lists.
list_links = data["Images link"].tolist()
list_names = data["File_names"].tolist()

In [4]:
# Sanity check: there must be exactly one filename prefix per link.
len(list_links)==len(list_names)

True

### 2 - Initialise Web Driver 

In [5]:
import time

In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options  # options for browser

In [7]:
# Run Chrome headless so that no browser window is shown while the code runs.
options = Options()
options.add_argument('headless')

In [8]:
# Start the Chrome WebDriver session with the options configured above.
# get_images() uses this module-level `driver`.
driver = webdriver.Chrome(options=options)

### 3 - Create functions

In [9]:
from PIL import Image
import io

In [10]:
from bs4 import BeautifulSoup

import requests
import urllib.request

In [11]:
def get_images(url_name):
    """Open a page in the module-level ``driver`` and return its gallery images.

    Args:
        url_name: URL of the page to load.

    Returns:
        A list of the elements that carry both the ``grid-item--image`` and
        ``__loaded`` classes (an empty list if there are none).
    """
    # Imported locally so this cell does not depend on another import cell.
    from selenium.webdriver.common.by import By

    driver.get(url_name)

    # find_elements_by_class_name was deprecated in Selenium 4 and removed in
    # 4.3. The compound class name is written as a CSS selector instead, which
    # works on both Selenium 3 and 4.
    return driver.find_elements(By.CSS_SELECTOR, ".grid-item--image.__loaded")

In [12]:
def get_image_urls(images):
    """Collect the ``srcset`` attribute values of the given image elements.

    Args:
        images: Iterable of objects exposing ``get_attribute(name)``
            (e.g. the elements returned by ``get_images``).

    Returns:
        A list of the non-empty ``srcset`` values that contain ``'http'``,
        in input order, with the last one removed.
    """
    image_urls = []

    for each_image in images:
        # Read the attribute once: on a Selenium element every get_attribute
        # call is a separate request to the browser.
        srcset = each_image.get_attribute('srcset')
        if srcset and 'http' in srcset:
            image_urls.append(srcset)

    # Drop the last image url because this is usually the designer waving at
    # the audience.
    return image_urls[:-1]

In [13]:
def change_urls_large(url_list):
    """Return a new list with "w_195" replaced by "w_500" in every URL.

    The substitution changes the image urls so that larger sized images
    are obtained. The input list is not modified.
    """
    return [small_url.replace("w_195", "w_500") for small_url in url_list]

In [14]:
def open_images(url_name):
    """Download the image at ``url_name`` and return it as a PIL image.

    Sleeps for one second before each request.

    Args:
        url_name: URL of the image to fetch.

    Returns:
        The image opened with ``PIL.Image.open`` from the downloaded bytes.

    Raises:
        requests.exceptions.Timeout: if the server does not answer within
            30 seconds.
        requests.exceptions.HTTPError: if the response status is 4xx/5xx.
    """
    time.sleep(1)

    # Without a timeout a stalled connection would block the whole run.
    response = requests.get(url_name, timeout=30)
    # Fail here with the HTTP status rather than letting PIL try to parse
    # an error page as an image.
    response.raise_for_status()

    return Image.open(io.BytesIO(response.content))


In [15]:
def download_images(image, i, file_name=None, output_dir="/Users/my_name/Images"):
    """Save ``image`` as ``<output_dir>/<file_name><i>.jpeg`` (JPEG, quality 85).

    Args:
        image: PIL image to save.
        i: Image number appended to the filename prefix.
        file_name: Filename prefix. When omitted, the module-level
            ``current_file_name`` (set by the run loop) is used.
        output_dir: Directory the file is written to; it must already exist.
    """
    if file_name is None:
        # Fall back to the prefix the run loop stores in the module namespace.
        file_name = current_file_name

    realname = file_name + str(i)
    file_path = "{}/{}.jpeg".format(output_dir, realname)

    # Pillow's JPEG writer only accepts these modes. Anything else (e.g. RGBA
    # or palette images) would raise on save, so convert it to RGB first.
    if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        image = image.convert("RGB")

    with open(file_path, 'wb') as f:
        image.save(f, "JPEG", quality=85)
            


### 4 - Put functions together

In [16]:
def process_links(url_name):
    """Return the enlarged image URLs found on the page at ``url_name``.

    Chains the three helpers: get_images -> get_image_urls -> change_urls_large.
    """
    return change_urls_large(get_image_urls(get_images(url_name)))


In [17]:
def process_images(image_url_list):
    """Fetch every URL in ``image_url_list`` and save it, numbered by its list position."""
    for position, image_url in enumerate(image_url_list):
        fetched_image = open_images(image_url)
        download_images(fetched_image, position)
        
         

### 5 - Run program

In [18]:
# Number of collection pages to process; the run loops below iterate up to N.
N=len(list_links)
N

628

In [None]:
### running the program all at once

for link_index in range(N):

    # pause between collection pages
    time.sleep(5)

    # download_images() reads current_file_name from the module namespace,
    # so it has to be assigned here before the images are processed
    current_url_name = list_links[link_index]
    current_file_name = list_names[link_index]

    image_urls = process_links(current_url_name)
    process_images(image_urls)
    
    

In [None]:
### running the program in batches

step=10
# N=628

# Derive the number of batches from N (ceiling division). A hard-coded 61
# batches of 10 stops at index 609 and silently skips the remaining collections.
n_batches = -(-N // step)

# this loop is for steps (jumps)
for j in range(0, n_batches):

    # clamp the upper bound so the final, shorter batch does not run past N
    for i in range(step*j, min(step*(j+1), N)):

        time.sleep(5)

        current_url_name=list_links[i]
        # download_images() reads current_file_name from the module namespace
        current_file_name=list_names[i]

        image_urls=process_links(current_url_name)
        process_images(image_urls)
    
    

### Close webdriver

In [44]:
# End the WebDriver session started in section 2.
driver.quit()