## Scraping Images from Dwell

We will now scrape images from Dwell, an interior design website. Note that these images are placed in separate folders and recorded in separate .csv files from those of the other websites.

In [None]:
from selenium import webdriver
import pandas as pd
import urllib.request
from pathlib import Path
import sys
import time

In [None]:
# Start a Chrome session and load Dwell's bedroom photo listing.
# The scraping functions below read this module-level `driver`.
driver = webdriver.Chrome()
driver.get("https://www.dwell.com/photos/bedroom")

In [None]:
def getAllImagesOnPage(df_typeOfSpace, file_typeOfSpace):
    """Download every matching image on the page currently loaded in ``driver``.

    Each image is saved as ``images/dwell/<file_typeOfSpace>/<name>.jpg``,
    where ``<name>`` is derived from the image's ``alt`` text, and one row per
    image (name, typeOfSpace, imageLink) is appended to
    ``data/dwell_<file_typeOfSpace>.csv``. The CSV is appended without a
    header row and with the DataFrame index as its first column; that index
    restarts at 0 on every call.

    Relies on the module-level ``driver`` (a Selenium WebDriver).

    Args:
        df_typeOfSpace: Label stored in the ``typeOfSpace`` column of every row.
        file_typeOfSpace: Name of the image sub-folder and suffix of the CSV
            file name.

    Returns:
        None.

    Raises:
        Any exception raised by ``urllib.request.urlretrieve`` when a download
        fails; rows collected so far for this page are not written then.
    """
    maxNameLength = 250  # keep the file name from getting too long
    imageDir = "images/dwell/" + file_typeOfSpace + "/"
    csvFile = "data/dwell_" + file_typeOfSpace + ".csv"

    # Create the output folders up front so the first download/CSV write does
    # not fail on a fresh checkout.
    Path(imageDir).mkdir(parents=True, exist_ok=True)
    Path(csvFile).parent.mkdir(parents=True, exist_ok=True)

    # find_elements(by, value) exists in both Selenium 3 and 4, unlike the
    # find_elements_by_* helpers; "css selector" is the value of By.CSS_SELECTOR.
    images = driver.find_elements(
        "css selector",
        "._3Mfjz-rsF9nA1eAreF6V0E ._1hsigH-Inte6wc2OsrF32b img",
    )

    rows = []
    for image in images:
        src = image.get_attribute("src")
        if not src:
            # No src attribute on this element: there is nothing to download.
            continue

        # get_attribute returns None when the attribute is absent.
        baseName = (image.get_attribute("alt") or "")[:maxNameLength]
        # Do not mistake slashes for directories.
        baseName = baseName.replace("/", "-")

        # There may be overlapping file names: try name, name1, name2, ...
        # until one is free. The suffix is always appended to the untouched
        # base name, so no character of the base name is ever dropped.
        name = baseName
        fullName = imageDir + name + ".jpg"
        suffix = 0
        while Path(fullName).is_file():
            suffix += 1
            name = baseName + str(suffix)
            fullName = imageDir + name + ".jpg"

        # Get image link (change small or medium thumbnail to large by
        # replacing the first occurrence of each size word in the URL).
        URL = src.replace("small", "large", 1)
        URL = URL.replace("medium", "large", 1)
        URL = URL.replace("thumbnail", "large", 1)
        print(URL)
        urllib.request.urlretrieve(URL, fullName)

        rows.append({
            "name": name,
            "typeOfSpace": df_typeOfSpace,  # Need to query df later for photos that don't match this type
            "imageLink": URL,
        })

    # Build the frame once instead of growing it row by row.
    df = pd.DataFrame(rows, columns=["name", "typeOfSpace", "imageLink"])
    df.to_csv(csvFile, mode='a', header=False)

We now write code to get the images on every page.

In [None]:
def getImagesOnAllPages(df_typeOfSpace, file_typeOfSpace):
    """Scrape the current page and every following page of the listing.

    Starting from the page already loaded in the module-level ``driver``,
    calls ``getAllImagesOnPage`` for the page, then follows the link whose
    ``rel`` attribute is ``"next"`` until no such link is found.

    Implemented as a loop rather than one recursive call per page, so the
    number of pages is not limited by Python's recursion depth.

    Args:
        df_typeOfSpace: Passed through to ``getAllImagesOnPage``.
        file_typeOfSpace: Passed through to ``getAllImagesOnPage``.

    Returns:
        None, once the last page has been scraped.
    """
    while True:
        getAllImagesOnPage(df_typeOfSpace, file_typeOfSpace)

        # find_elements(by, value) exists in both Selenium 3 and 4;
        # "css selector" is the value of By.CSS_SELECTOR.
        links = driver.find_elements("css selector", "._3vSCNYHJe9mhb1dM9cJs87")

        # Target of the first link marked rel="next", or None on the last page.
        nextHref = next(
            (link.get_attribute("href")
             for link in links
             if link.get_attribute("rel") == "next"),
            None,
        )
        if not nextHref:  # if it is the last page (or the link has no target)
            return None

        driver.get(nextHref)  # go to the next page
        time.sleep(10)  # wait for the page to load

In [None]:
# Scrape every page of the bedroom listing: images are saved under
# images/dwell/rooms/ and rows are appended to data/dwell_rooms.csv.
getImagesOnAllPages(
    df_typeOfSpace="Room",
    file_typeOfSpace="rooms",
)

## Other Spaces

Run the image scraping algorithm for the following links. Remember to change the folder into which the images are downloaded, as well as the .csv file in which the image data is stored (both are selected by the `file_typeOfSpace` argument).

https://www.dwell.com/photos/living <br>
https://www.dwell.com/photos/bath <br>
https://www.dwell.com/photos/kitchen

In [None]:
# Close the browser left over from a previous scrape (if any) before opening
# a new one, so Chrome sessions do not pile up when `driver` is rebound.
if "driver" in globals():
    driver.quit()
driver = webdriver.Chrome()
driver.get('https://www.dwell.com/photos/living')

In [None]:
# Scrape every page of the living-room listing: images are saved under
# images/dwell/livingrooms/ and rows are appended to data/dwell_livingrooms.csv.
getImagesOnAllPages(
    df_typeOfSpace="Living Room",
    file_typeOfSpace="livingrooms",
)

In [None]:
# Close the browser left over from a previous scrape (if any) before opening
# a new one, so Chrome sessions do not pile up when `driver` is rebound.
if "driver" in globals():
    driver.quit()
driver = webdriver.Chrome()
driver.get('https://www.dwell.com/photos/bath')

In [None]:
# Scrape every page of the bathroom listing: images are saved under
# images/dwell/bathrooms/ and rows are appended to data/dwell_bathrooms.csv.
getImagesOnAllPages(
    df_typeOfSpace="Bathroom",
    file_typeOfSpace="bathrooms",
)

In [None]:
# Close the browser left over from a previous scrape (if any) before opening
# a new one, so Chrome sessions do not pile up when `driver` is rebound.
if "driver" in globals():
    driver.quit()
driver = webdriver.Chrome()
driver.get('https://www.dwell.com/photos/kitchen')

In [None]:
# Scrape every page of the kitchen listing: images are saved under
# images/dwell/kitchens/ and rows are appended to data/dwell_kitchens.csv.
getImagesOnAllPages(
    df_typeOfSpace="Kitchen",
    file_typeOfSpace="kitchens",
)