# Webscraping Zara

## Scraping rules
- Check a site's terms and conditions before you scrape it. It's their data, and they likely have rules governing its use.
- Be nice - A computer can send web requests much faster than a user can. Space out your requests so that you don't hammer the site's server.
- Scrapers break - Sites change their layout all the time. If that happens, be prepared to rewrite your code.
- Web pages are inconsistent - Some manual cleanup is sometimes needed even after you've collected your data.

### Import necessary modules

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import os
import time

In [None]:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options  
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By 
# from selenium.webdriver.support.ui import WebDriverWait 
# from selenium.webdriver.support import expected_conditions as EC 
# from selenium.common.exceptions import TimeoutException

### Selenium headless driver options

In [None]:
# Headless Chrome configuration: no visible window, incognito session,
# driven through the Chrome Canary binary.
# `driver_dir` is the chromedriver location, relative to this notebook.
driver_dir = '../_driver_headless/chromedriver'

chrome_options = Options()
chrome_options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'
for _flag in ("--headless", "--incognito"):
    chrome_options.add_argument(_flag)

### Selenium browser (not headless) options

In [None]:
# Configuration for a visible (non-headless) Chrome window: incognito session,
# driven through the Chrome Canary binary.
browser_options = Options()
browser_options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'
browser_options.add_argument("--incognito")

### Scrape with browser

In [None]:
# menu_man = driver.find_element_by_partial_link_text('MAN').click()
from urllib.parse import quote

# Build the Zara UK search URL for the given keyword.
url = 'https://www.zara.com/uk/en/search?searchTerm='
keyword = 'man trousers'
# Percent-encode the keyword so spaces and other reserved characters form a
# valid query string instead of being pasted into the URL verbatim.
url += quote(keyword)

In [None]:
# Start a visible Chrome session using the chromedriver at `driver_dir`,
# load the search URL, and echo the browser's current URL as the cell output.
_chromedriver_path = os.path.abspath(driver_dir)
browser = webdriver.Chrome(
    executable_path=_chromedriver_path,
    chrome_options=browser_options,
)
browser.get(url)
browser.current_url

### Scroll until last image is loaded before scraping

In [None]:
# Scroll the visible browser to the bottom of the page 50 times, pausing
# before each scroll, so that images added while scrolling are present
# before scraping.
SCROLL_PAUSE_TIME = 1  # seconds to wait before each scroll
SCROLL_HEIGHT = 'document.body.scrollHeight'  # JS expression evaluated in the page

_scroll_script = "window.scrollTo(0, {});".format(SCROLL_HEIGHT)
for i in range(50):
    # The pause sits outside the try block and only Exception is caught, so
    # interrupting the kernel (KeyboardInterrupt) actually stops the loop.
    time.sleep(SCROLL_PAUSE_TIME)
    try:
        browser.execute_script(_scroll_script)
    except Exception as exc:
        # Best effort: report which scroll failed and why, then keep going.
        print(i, repr(exc))
# Printed once, after the last scroll, rather than on every iteration.
print('ready')

In [None]:
# Scrape every product image currently in the visible browser into `df`.
# Columns:
#   id          - third '-'-separated token of the <img> id attribute
#   ts          - segment of the src URL right after its first '='
#                 (presumably a timestamp query value -- confirm)
#   description - the <img> alt text
#   source      - the full src URL
# Rows are collected in a list and the frame is built once at the end; an
# element whose attributes cannot be parsed is reported and skipped entirely
# instead of leaving a half-filled row. The index is the element's position
# on the page, so skipped elements show up as gaps.
print('loading image: ')
_rows = []
_row_index = []
# find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer
# Selenium releases no longer provide.
_product_images = browser.find_elements(By.XPATH, '//*[@id="products"]/ul/li/a/div/img')
for i, tag in enumerate(_product_images):
    try:
        time.sleep(1)
        id_ = tag.get_attribute('id')
        alt = tag.get_attribute('alt')
        src = tag.get_attribute('src')
        _row = {
            'id': id_.split('-')[2],
            'ts': src.split('=')[1],
            'description': alt,
            'source': src,
        }
    except Exception as e:
        # Best effort: a missing attribute or unexpected format only costs this row.
        print('\n', repr(e))
    else:
        _rows.append(_row)
        _row_index.append(i)
    # Overwrite the previous counter in place to show progress on one line.
    print('{}{}'.format('\b' * len(str(i)), i), end='', flush=True)

df = pd.DataFrame(_rows, index=_row_index, columns=['id', 'ts', 'description', 'source'])

In [None]:
# Display the table scraped from the visible browser as the cell output.
df

### Headless scraping of images

In [None]:
# Start a headless Chrome session using the chromedriver at `driver_dir`,
# load the search URL, and echo the driver's current URL as the cell output.
_chromedriver_path = os.path.abspath(driver_dir)
driver = webdriver.Chrome(
    executable_path=_chromedriver_path,
    chrome_options=chrome_options,
)
driver.get(url)
driver.current_url

### Scroll until last image is loaded before scraping

In [None]:
# Scroll the headless driver to the bottom of the page 50 times, pausing
# before each scroll, so that images added while scrolling are present
# before scraping.
SCROLL_PAUSE_TIME = 1  # seconds to wait before each scroll
SCROLL_HEIGHT = 'document.body.scrollHeight'  # JS expression evaluated in the page

_scroll_script = "window.scrollTo(0, {});".format(SCROLL_HEIGHT)
for i in range(50):
    # The pause sits outside the try block and only Exception is caught, so
    # interrupting the kernel (KeyboardInterrupt) actually stops the loop.
    time.sleep(SCROLL_PAUSE_TIME)
    try:
        driver.execute_script(_scroll_script)
    except Exception as exc:
        # Best effort: report which scroll failed and why, then keep going.
        print(i, repr(exc))
# Printed once, after the last scroll, rather than on every iteration.
print('ready')

### Get soup

In [None]:
# Take the page markup as currently held by the headless driver and parse it
# with BeautifulSoup using the lxml parser.
html = driver.page_source
soup = BeautifulSoup(markup=html, features='lxml')

In [None]:
# Print the src URL of every <img> in the parsed page.
# Filtering on src=True skips <img> tags that carry no src attribute; indexing
# such a tag with image['src'] would raise KeyError and abort the loop.
for image in soup.find_all('img', src=True):
    print(image['src'])

### Scrape images by tag name

In [None]:
# Collect an (alt text, src URL) pair for every <img> element in the live page.
# find_elements(By.TAG_NAME, ...) replaces find_elements_by_tag_name, which
# newer Selenium releases no longer provide.
images = [
    (tag.get_attribute('alt'), tag.get_attribute('src'))
    for tag in driver.find_elements(By.TAG_NAME, 'img')
]

In [None]:
# Display the collected (alt, src) pairs as the cell output.
images

In [None]:
# Tabulate the src URL and alt text of every <img> element in the live page.
# The records are gathered first and the frame is built in one step, instead
# of enlarging it one cell at a time with .loc; the columns are fixed up
# front so they exist even when the page has no images.
# find_elements(By.TAG_NAME, ...) replaces find_elements_by_tag_name, which
# newer Selenium releases no longer provide.
df_1 = pd.DataFrame(
    [
        {'src': tag.get_attribute('src'), 'alt': tag.get_attribute('alt')}
        for tag in driver.find_elements(By.TAG_NAME, 'img')
    ],
    columns=['src', 'alt'],
)

In [None]:
# Print a summary of df_1 (row count, columns, dtypes, non-null counts).
df_1.info()

### Scrape images by Xpath

In [None]:
# Scrape every product image matched by `xpath` in the headless driver into `df_2`.
# Columns:
#   id          - third '-'-separated token of the <img> id attribute
#   ts          - segment of the src URL right after its first '='
#                 (presumably a timestamp query value -- confirm)
#   description - the <img> alt text
#   source      - the full src URL
# Rows are collected in a list and the frame is built once at the end; an
# element whose attributes cannot be parsed is reported and skipped entirely
# instead of leaving a half-filled row. The index is the element's position
# among the matches, so skipped elements show up as gaps.
xpath = '//*[@id="products"]/ul/li/a/div/img'

_rows = []
_row_index = []
# find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which newer
# Selenium releases no longer provide.
for i, tag in enumerate(driver.find_elements(By.XPATH, xpath)):
    try:
        id_ = tag.get_attribute('id')
        alt = tag.get_attribute('alt')
        src = tag.get_attribute('src')
        _row = {
            'id': id_.split('-')[2],
            'ts': src.split('=')[1],
            'description': alt,
            'source': src,
        }
    except Exception as e:
        # Best effort: a missing attribute or unexpected format only costs this row.
        print('\n', repr(e))
    else:
        _rows.append(_row)
        _row_index.append(i)

df_2 = pd.DataFrame(_rows, index=_row_index, columns=['id', 'ts', 'description', 'source'])

In [None]:
# Show up to 10 random rows of df_2. The sample size is capped at the number
# of rows because sample() raises ValueError when asked for more rows than
# the frame holds (without replacement).
df_2.sample(min(10, len(df_2)))

### Save to Excel

In [None]:
# Write df_1 to the 'trousers' sheet of ../_data/zara_mens_trousers.xlsx.
# index startrow/startcol N means value is inserted at N+1
# The context manager saves and closes the workbook on exit (also when an
# error occurs), replacing the explicit writer.save() call that recent pandas
# releases no longer provide. The sheet name is passed by keyword because
# passing it positionally is deprecated.
with pd.ExcelWriter('../_data/zara_mens_trousers.xlsx') as writer:
    df_1.to_excel(writer, sheet_name='trousers')

## Download and save images

In [None]:
# !mkdir images

In [None]:
# Download each product image and store it as ../_data/images/<key>.jpg.
# NOTE(review): the original cell iterated over `images = None` (TypeError)
# and read from an undefined `styles` mapping, so it could not run. This
# version assumes the intent was a product id -> image URL mapping taken
# from df_2's 'id' and 'source' columns -- confirm before relying on it.
_image_dir = '../_data/images'
os.makedirs(_image_dir, exist_ok=True)  # the "!mkdir images" cell is commented out

styles = {}
if {'id', 'source'}.issubset(df_2.columns):
    # Ignore rows where either the id or the URL is missing.
    _complete = df_2.dropna(subset=['id', 'source'])
    styles = dict(zip(_complete['id'], _complete['source']))

for key, image_url in styles.items():
    try:
        r = requests.get(image_url, allow_redirects=True, timeout=30)
    except requests.RequestException as exc:
        # Best effort: report the failed download and move on to the next image.
        print(key, repr(exc))
        continue
    if not r.ok:
        # Do not save an error page under a .jpg name.
        print(key, r.status_code)
        continue
    # The with-block closes the file even if the write fails.
    with open('{}/{}.jpg'.format(_image_dir, key), 'wb') as image_file:
        image_file.write(r.content)
    # Space out requests so the site's server is not hammered.
    time.sleep(1)

In [None]:
#### Take screenshot

In [None]:
# Save a screenshot of the headless driver's current page to
# sample_screenshot_2.png in the working directory. The trailing semicolon
# keeps the notebook from echoing the call's return value.
driver.get_screenshot_as_file('sample_screenshot_2.png');

In [None]:
# !ls images/

### TODO click menu link

In [None]:
# Click the first link in the live page whose visible text starts with 'MAN'.
# The lookup goes through the Selenium driver rather than `soup`: the soup is
# a parsed copy of the markup, its tags have no click() method, and the
# original bare `except` hid the resulting error so nothing was ever clicked.
for link in driver.find_elements(By.PARTIAL_LINK_TEXT, 'MAN'):
    try:
        label = link.text
        # A partial-text match also returns links such as 'WOMAN'; keep only
        # those that start with 'MAN'.
        if label[:3] != 'MAN':
            continue
        print(label)
        link.click()
        # Stop after the first successful click.
        # NOTE(review): clicking presumably navigates away and leaves the
        # remaining elements stale -- confirm.
        break
    except Exception as exc:
        # Best effort: report the failure for this link and try the next one.
        print(repr(exc))

In [None]:
# menu_man = driver.find_element_by_partial_link_text('MAN').click()

### TODO use search field interactively

In [None]:
# Locate the element with class "search" in the headless driver and display it.
# find_element(By.CLASS_NAME, ...) replaces find_element_by_class_name, which
# newer Selenium releases no longer provide.
search_field = driver.find_element(By.CLASS_NAME, "search")
search_field
# TODO: type the keyword into the field and submit it:
# search_field.clear()
# search_field.send_keys(keyword)
# search_field.send_keys(Keys.RETURN)