## Crawling for AllModern: (Bed)Rooms

Download the images from the following website.

See more at: https://www.allmodern.com/shop-the-look/sl1/bedroom-ideas

In [None]:
from selenium import webdriver
import pandas as pd
import urllib.request
from pathlib import Path
import sys
import time

In [None]:
# Start a Chrome session and load the bedroom "shop the look" page.
# getAllImagesOnPage reads this module-level `driver`, so this cell must run
# before the function is called.
driver = webdriver.Chrome()
driver.get('https://www.allmodern.com/shop-the-look/sl1/bedroom-ideas')

In [None]:
def _imageUrl(image):
    """Return the URL to download for an image element, or None if it has none."""
    srcset = image.get_attribute("srcset")
    if srcset:
        # srcset is a comma-separated list of "<url> <descriptor>" candidates.
        # split() without arguments tolerates the usual space after each comma,
        # and empty candidates (e.g. from a trailing comma) are dropped.
        candidates = [c.split()[0] for c in srcset.split(",") if c.strip()]
        if candidates:
            return candidates[-1]  # last candidate in srcset, as before
    # No usable srcset: fall back to the plain src attribute.
    return image.get_attribute("src") or None


def _uniqueImageName(imageDir, baseName):
    """Return baseName, or baseName plus the first counter 1, 2, 3, ... whose .jpg is not on disk."""
    name = baseName
    suffix = 0
    while Path(imageDir + name + ".jpg").is_file():
        suffix += 1
        # Always rebuild from the untouched base name so the counter can grow
        # to any number of digits without eating into the name itself.
        name = baseName + str(suffix)
    return name


def getAllImagesOnPage(df_typeOfSpace, file_typeOfSpace, scrollWaitSeconds=10):
    """Download every gallery image on the page currently open in ``driver``.

    Each ``.u-size4of12`` column is drained in turn. The column's image list is
    re-queried after every image, and when the last known image of a column has
    been handled the window is scrolled to the bottom so that further images
    can load.

    Args:
        df_typeOfSpace: Label written to the ``typeOfSpace`` column of the CSV
            (e.g. ``"Room"``).
        file_typeOfSpace: Name of the sub-folder of ``images/`` the files are
            saved in, and base name of the CSV file in ``data/``.
        scrollWaitSeconds: Seconds to wait after scrolling so that new images
            have time to load. Defaults to 10.

    Side effects:
        Saves ``images/<file_typeOfSpace>/<name>.jpg`` for each image and
        appends one row per image (index, name, typeOfSpace, imageLink) without
        a header to ``data/<file_typeOfSpace>.csv``. Both directories are
        created if missing. Reads the module-level ``driver``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises for a failed download.
    """
    # String value of selenium's By.CSS_SELECTOR. find_elements(by, value) is
    # available in Selenium 3 and 4, unlike the removed find_elements_by_* helpers.
    cssSelector = "css selector"

    imageDir = "images/" + file_typeOfSpace + "/"
    Path(imageDir).mkdir(parents=True, exist_ok=True)
    Path("data").mkdir(parents=True, exist_ok=True)

    rows = []  # one dict per downloaded image, turned into a DataFrame at the end

    for col in driver.find_elements(cssSelector, ".u-size4of12"):
        noImagesHandled = 0  # images of this column already processed
        images = col.find_elements(cssSelector, ".ImageComponent-image")

        # Shrink `images` one element at a time until the column is exhausted
        while images:
            image = images[0]
            URL = _imageUrl(image)

            if URL:
                # A missing alt attribute yields None; use an empty base name then
                name = image.get_attribute("alt") or ""
                # '/' would otherwise be read as a directory separator
                name = name.replace("/", "-")
                # Different images may share the same alt text
                name = _uniqueImageName(imageDir, name)
                fullName = imageDir + name + ".jpg"

                urllib.request.urlretrieve(URL, fullName)

                rows.append({
                    "name": name,
                    "typeOfSpace": df_typeOfSpace, # Need to query df later for photos that don't match this type
                    "imageLink": URL,
                })
            else:
                print("Skipping image without srcset/src:", image.get_attribute("alt"))

            noImagesHandled += 1

            # If more images need to be loaded, scroll down
            if image == images[-1]:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(scrollWaitSeconds) # Wait for images to load

            # Re-query the column and drop the images that were already handled
            images = col.find_elements(cssSelector, ".ImageComponent-image")[noImagesHandled:]

    # Store the collected rows in pandas and append them to the CSV
    df = pd.DataFrame(rows, columns=["name","typeOfSpace","imageLink"])
    df.to_csv("data/" + file_typeOfSpace + ".csv", mode='a', header=False)

Note: I have not written (recursive) code to collect the images from every page of AllModern, because a CAPTCHA has to be filled out on each page and the pagination button element cannot be interacted with through Selenium. Such code may still be useful for scraping other websites, however.

In [None]:
# Scrape the page currently loaded in `driver`: images are saved under
# images/rooms/ and their rows are appended to data/rooms.csv.
getAllImagesOnPage(df_typeOfSpace="Room", file_typeOfSpace="rooms") # run on each page

## Other Spaces
Run the same algorithm on the following links. Remember to change the path the images are downloaded to, as well as the CSV file the image data is stored in.

https://www.allmodern.com/shop-the-look/sl1/livingroom-ideas <br>
https://www.allmodern.com/shop-the-look/sl1/bathroom-ideas <br>
https://www.allmodern.com/shop-the-look/sl1/kitchen-ideas

In [None]:
# Start a new Chrome session on the living-room page and rebind the
# module-level `driver` that getAllImagesOnPage reads.
# NOTE(review): any earlier session is not quit here, so its window stays open.
driver = webdriver.Chrome()
driver.get('https://www.allmodern.com/shop-the-look/sl1/livingroom-ideas')

In [None]:
# Scrape the page currently loaded in `driver`: images are saved under
# images/livingrooms/ and their rows are appended to data/livingrooms.csv.
getAllImagesOnPage(df_typeOfSpace="Living Room", file_typeOfSpace="livingrooms") # run on each page

In [None]:
# Start a new Chrome session on the bathroom page and rebind the
# module-level `driver` that getAllImagesOnPage reads.
# NOTE(review): any earlier session is not quit here, so its window stays open.
driver = webdriver.Chrome()
driver.get('https://www.allmodern.com/shop-the-look/sl1/bathroom-ideas')

In [None]:
# Scrape the page currently loaded in `driver`: images are saved under
# images/bathrooms/ and their rows are appended to data/bathrooms.csv.
getAllImagesOnPage(df_typeOfSpace="Bathroom", file_typeOfSpace="bathrooms") # run on each page

In [None]:
# Start a new Chrome session on the kitchen page and rebind the
# module-level `driver` that getAllImagesOnPage reads.
# NOTE(review): any earlier session is not quit here, so its window stays open.
driver = webdriver.Chrome()
driver.get('https://www.allmodern.com/shop-the-look/sl1/kitchen-ideas')

In [None]:
# Scrape the page currently loaded in `driver`: images are saved under
# images/kitchens/ and their rows are appended to data/kitchens.csv.
getAllImagesOnPage(df_typeOfSpace="Kitchen", file_typeOfSpace="kitchens") # run on each page