## Scraping Images from Homify

This notebook scrapes images from Homify, an interior-design website. Note that the images are saved in separate folders, and recorded in separate .csv files, from those used for the other websites.

In [None]:
from selenium import webdriver
import pandas as pd
import urllib.request
from pathlib import Path
import sys
import time

In [None]:
# Start a Chrome session and open the Homify bedroom listing; the scraping
# functions below read the page through this module-level `driver`.
driver = webdriver.Chrome()
driver.get("https://www.homify.co.uk/rooms/bedroom")

In [None]:
def _doubleImageSize(url):
    """Return *url* with the image size embedded in it doubled.

    The size is read from the same place the URL is parsed: the second
    '_'-separated field of the text between the second ':' and the next '/'.
    Only that field is rewritten, so identical digits elsewhere in the URL
    are left alone.
    NOTE(review): this assumes Homify image URLs contain two ':' characters
    followed by a "..._<size>/" segment -- confirm against a live srcset.

    Raises:
        IndexError / ValueError: if the URL does not have that layout.
    """
    pieces = url.split(':')
    transform, slash, remainder = pieces[2].partition('/')
    fields = transform.split('_')
    fields[1] = str(int(fields[1]) * 2)
    pieces[2] = '_'.join(fields) + slash + remainder
    return ':'.join(pieces)


def getAllImagesOnPage(df_typeOfSpace, file_typeOfSpace):
    """Download every non-ad image on the current page and log it to a CSV.

    Works on whatever page the module-level ``driver`` currently has open.

    Args:
        df_typeOfSpace: label written to the "typeOfSpace" column of each row.
        file_typeOfSpace: name used for both the image folder
            (``images/homify/<file_typeOfSpace>/``) and the CSV file
            (``data/homify_<file_typeOfSpace>.csv``).

    Side effects:
        Saves one ``<id>.jpg`` per image and appends one row per image
        (index, name, typeOfSpace, imageLink -- no header) to the CSV.
    """
    imageDir = "images/homify/" + file_typeOfSpace
    csvPath = "data/homify_" + file_typeOfSpace + ".csv"

    # Create the output folders up front; urlretrieve / to_csv do not create
    # missing directories and would otherwise fail on a fresh checkout.
    Path(imageDir).mkdir(parents=True, exist_ok=True)
    Path(csvPath).parent.mkdir(parents=True, exist_ok=True)

    # find_elements("css selector", ...) is the locator form that exists in
    # both Selenium 3 and 4 ("css selector" is the value of By.CSS_SELECTOR);
    # the find_elements_by_* helpers were removed in Selenium 4.3.
    items = driver.find_elements("css selector", ".-horizontal- .-horizontal- li")

    rows = []
    for item in items:
        # Ids are unique, so they double as file names; items without an id are ads.
        name = item.get_attribute("id")
        if not name:
            continue

        sources = item.find_elements("css selector", ".js-photo-link source")
        if not sources:
            # Skip (and report) rather than abort the whole scrape with an IndexError.
            print("Skipping " + name + ": no image source found")
            continue

        # Take the first URL of the srcset, then request it at twice the size.
        URL = _doubleImageSize(sources[0].get_attribute("srcset").split(' ')[0])
        print(URL)
        urllib.request.urlretrieve(URL, imageDir + "/" + name + ".jpg")

        rows.append({
            "name": name,
            "typeOfSpace": df_typeOfSpace,  # Need to query df later for photos that don't match this type
            "imageLink": URL,
        })

    # Build the frame once instead of growing it row by row.
    df = pd.DataFrame(rows, columns=["name", "typeOfSpace", "imageLink"])
    df.to_csv(csvPath, mode='a', header=False)

We now write a function that collects the images from every page by following the "next page" link until there is none left.

In [None]:
def getImagesOnAllPages(df_typeOfSpace, file_typeOfSpace):
    """Scrape the current page and every following page of the listing.

    Starting from the page the module-level ``driver`` has open, calls
    ``getAllImagesOnPage`` and then follows the pagination link whose ``rel``
    attribute is "next", repeating until no such link exists.

    Implemented as a loop rather than recursion so that listings with many
    pages cannot exhaust Python's recursion limit.

    Args:
        df_typeOfSpace: passed through to ``getAllImagesOnPage``.
        file_typeOfSpace: passed through to ``getAllImagesOnPage``.

    Returns:
        None, once the last page has been scraped.
    """
    while True:
        getAllImagesOnPage(df_typeOfSpace, file_typeOfSpace)

        # "css selector" is the value of By.CSS_SELECTOR; this locator form
        # works in both Selenium 3 and 4 (find_elements_by_* was removed in 4.3).
        links = driver.find_elements("css selector", ".pagination--next-page-link")
        nextButton = next(
            (link for link in links if link.get_attribute("rel") == "next"),
            None,
        )
        if nextButton is None:  # last page reached
            return None

        driver.get(nextButton.get_attribute("href"))  # go to the next page
        time.sleep(10)  # wait for the page to load

In [None]:
# Scrape every page of the listing currently open in `driver`, labelling the
# rows "Room" and using "rooms" for the image folder and CSV file name.
getImagesOnAllPages(
    df_typeOfSpace="Room",
    file_typeOfSpace="rooms",
)

## Other Spaces

Run the image-scraping function for each of the following links. Remember to change the folder the images are downloaded to, as well as the .csv file the image data is stored in (both are derived from the `file_typeOfSpace` argument).

https://www.homify.co.uk/rooms/living-room <br>
https://www.homify.co.uk/rooms/bathroom <br>
https://www.homify.co.uk/rooms/kitchen

In [None]:
# Close the browser left over from the previous section (if there is one)
# before starting a new session; rebinding `driver` without quitting would
# leak a Chrome / chromedriver process.
try:
    driver.quit()
except NameError:  # this cell was run on its own; nothing to close
    pass
driver = webdriver.Chrome()
driver.get('https://www.homify.co.uk/rooms/living-room')

In [None]:
# Scrape every page of the listing currently open in `driver`, labelling the
# rows "Living Room" and using "livingrooms" for the image folder and CSV file name.
getImagesOnAllPages(
    df_typeOfSpace="Living Room",
    file_typeOfSpace="livingrooms",
)

In [None]:
# Close the browser left over from the previous section (if there is one)
# before starting a new session; rebinding `driver` without quitting would
# leak a Chrome / chromedriver process.
try:
    driver.quit()
except NameError:  # this cell was run on its own; nothing to close
    pass
driver = webdriver.Chrome()
driver.get('https://www.homify.co.uk/rooms/bathroom')

In [None]:
# Scrape every page of the listing currently open in `driver`, labelling the
# rows "Bathroom" and using "bathrooms" for the image folder and CSV file name.
getImagesOnAllPages(
    df_typeOfSpace="Bathroom",
    file_typeOfSpace="bathrooms",
)

In [None]:
# Close the browser left over from the previous section (if there is one)
# before starting a new session; rebinding `driver` without quitting would
# leak a Chrome / chromedriver process.
try:
    driver.quit()
except NameError:  # this cell was run on its own; nothing to close
    pass
driver = webdriver.Chrome()
driver.get('https://www.homify.co.uk/rooms/kitchen')

In [None]:
# Scrape every page of the listing currently open in `driver`, labelling the
# rows "Kitchen" and using "kitchens" for the image folder and CSV file name.
getImagesOnAllPages(
    df_typeOfSpace="Kitchen",
    file_typeOfSpace="kitchens",
)