# Webscraping Color Palette

## Scraping rules
- You should check a site's terms and conditions before you scrape it. It's their data, and they likely have some rules governing its use.
- Be nice - A computer will send web requests much faster than a user can. Make sure you space out your requests a bit so that you don't hammer the site's server.
- Scrapers break - Sites change their layout all the time. If that happens, be prepared to rewrite your code.
- Web pages are inconsistent - There's sometimes some manual cleanup that has to happen even after you've gotten your data.

<h3>Import necessary modules</h3>

In [None]:
import numpy as np
import pandas as pd
import os
import requests
# from bs4 import BeautifulSoup

In [None]:
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options  
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By 
# from selenium.webdriver.support.ui import WebDriverWait 
# from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import TimeoutException

### Selenium headless driver options

In [None]:
# Location of the chromedriver executable, relative to this notebook.
driver_dir = '../_driver_headless/chromedriver'

# Options for a headless, incognito Chrome Canary session.
chrome_options = Options()
chrome_options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'
chrome_options.add_argument("--headless")
chrome_options.add_argument("--incognito")

### Selenium browser (not headless) options

In [None]:
# Options for a visible (non-headless) incognito Chrome Canary window.
browser_options = Options()
browser_options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary'
browser_options.add_argument("--incognito")

### Start chrome browser instance

In [None]:
# Launch Chrome through chromedriver using the non-headless options.
# NOTE(review): newer Selenium releases deprecate the `executable_path` and
# `chrome_options` keywords - confirm against the installed version.
browser = webdriver.Chrome(
    executable_path=os.path.abspath(driver_dir),
    chrome_options=browser_options,
)

### Scrape with browser

In [None]:
# Open the palette index page in the running browser, then show the URL the
# browser reports (the final expression is the cell's displayed output).
url = 'http://www.color-hex.com/color-palette'
browser.get(url)
browser.current_url

### Test XPath

In [None]:
# Load a single known palette page so the XPath expressions can be tried out.
keyword = 61326
url = 'http://www.color-hex.com/color-palette/' + str(keyword)
browser.get(url)

In [None]:
# Text of every link inside a table cell on the current page (used later as
# the palette's hex codes). Uses find_elements(By.XPATH, ...) because the
# find_elements_by_xpath helper is deprecated and removed in Selenium 4.3+.
[tag.text for tag in browser.find_elements(By.XPATH, '//td/a')]

In [None]:
# Text of every <em> element on the current page (used later as the palette
# name). Uses find_elements(By.XPATH, ...) because the find_elements_by_xpath
# helper is deprecated and removed in Selenium 4.3+.
[tag.text for tag in browser.find_elements(By.XPATH, '//em')]

In [None]:
# Navigate the browser to palette number 150.
url = 'http://www.color-hex.com/color-palette/{}'.format(150)
browser.get(url)

### Test GET status code

### Scrape all palettes: #0 to #100000 and write to store 

### Restart browser

In [None]:
# Start a fresh Chrome instance (non-headless options) for the bulk scrape.
# NOTE(review): newer Selenium releases deprecate the `executable_path` and
# `chrome_options` keywords - confirm against the installed version.
browser = webdriver.Chrome(
    executable_path=os.path.abspath(driver_dir),
    chrome_options=browser_options,
)

In [None]:
palettes = []
url = 'http://www.color-hex.com/color-palette/'
browser.set_page_load_timeout(10)

for i in range(60000, 70000):
    try:
        url = 'http://www.color-hex.com/color-palette/{}'.format(i)
        browser.get(url)
        print('.', end='', flush=True)
        pal_name = browser.find_elements_by_xpath('//em')
        if pal_name:
            name = [tag.text for tag in pal_name]
            hexs = [tag.text for tag in browser.find_elements_by_xpath('//td/a')]
            item = (i, ''.join(name), url, hexs)
            print(item)
            palettes.append(item)
    except TimeoutException as ex:
        print(ex)
        continue
        
    # store results in batches during scraping and append dataframe
    if  i % 20 == 0:
        print('.', end='', flush=True)
        %store palettes
        df_palettes = pd.DataFrame(palettes, columns=['number', 'name', 'url', 'hexs'])
        df_palettes['hexs'] = df_palettes['hexs'].astype(list)
        df_palettes.to_csv('../_data/col_hex_palettes.csv', mode='a', index=False)
        palettes = []
        

In [None]:
# Draw a colour strip for a few of the palettes held in `df_palettes`.
# seaborn is imported here because this cell uses `sns` before the notebook's
# later seaborn import cell.
import seaborn as sns

# NOTE(review): the original loop header had no iterable (`for i in :`), which
# is a syntax error. Previewing the first five rows is an assumption - adjust
# the slice to the rows you actually want to see.
for i in df_palettes.index[:5]:
    hexs = df_palettes.loc[i, 'hexs']
    if not hexs:
        # skip palettes with no colours
        continue
    sns.palplot(sns.color_palette(hexs))

### Refresh palettes

In [None]:
# Reload the variables previously saved with the IPython %store magic, then
# display the restored `palettes` value as the cell output.
%store -r
palettes

In [None]:
# Build a DataFrame from the restored `palettes` list and display it.
# pandas is imported again here - presumably so the cell also runs in a fresh
# kernel; confirm before removing.
import pandas as pd
# No column names are passed, so the columns are labelled by position.
df_palettes = pd.DataFrame(palettes)
df_palettes

In [None]:
# Append to csv file
# The header row is written only when the file is new or empty; appending it
# on every run would scatter duplicate header rows through the data.
import os  # imported locally so the cell also works when run on its own

csv_path = '../_data/color_hex_palettes.csv'
write_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0
df_palettes.to_csv(csv_path, mode='a', index=False, header=write_header)

In [None]:
# df = pd.read_csv('../_data/color_hex_palettes.csv')
# df.columns
# df = df[['0', '1', '2', '3']]
# df.to_csv('../_data/color_hex_palettes.csv', index=False)

In [None]:
# Load the accumulated palettes back from the CSV file, replacing the
# in-memory `df_palettes`.
df_palettes = pd.read_csv('../_data/color_hex_palettes.csv')

In [None]:
# Plot the first rows of the CSV-backed palettes (labels 0 through 5 of
# column '3', which holds the hex codes).
#
# Each cell in that column is the text form of a Python list, e.g.
# "['#aabbcc', '#112233']". It is parsed back into a real list with
# ast.literal_eval; stripping brackets and splitting on commas would leave
# the quotes and spaces on every colour.
import ast
import seaborn as sns
import re  # unused in this cell; kept in case other cells rely on it

for pal in df_palettes.loc[:5, '3']:
    pall = ast.literal_eval(pal)
    print(pall, type(pall))
    if not isinstance(pall, list) or not pall:
        # not a usable colour list (e.g. a stray header row or empty palette)
        continue
    # Plot the parsed list; the original passed the raw string `pal` here.
    sns.palplot(sns.color_palette(pall))