# Google Art & Culture - Case study using CRISPS-DM

#### Autors: Manuel Alejandro Aponte, Cristian Beltran, Maria Paula Peña

In this notebook it will webscraping of the page Google Art & Culture

## Objectives
The objective of this notebooks is:

* Download images using webscraping.
* Download images metadata.
* Store all information in a datasheet.

## Prerequisites

* Familiarity with python 
* Lastest version of Google WebDriver, Source: https://chromedriver.chromium.org/
* Install python packages.
* Use VPN (Recomended)

## Background 
This notebook belongs to Google Art & Culture Case Study using CRIPS-DM, where would be include process such as webscraping, exploratory data analysis, ML classificators and dashboards. 

In [None]:
#Package instalation
!pip install pandas
!pip install selenium

In [1]:
#Import packages
import pandas as pd
import numpy as np
import time
from selenium import webdriver
from pprint import pprint
from concurrent.futures import ThreadPoolExecutor
from time import sleep

# Utils

In [13]:
class Manager:
    def __init__(self, driver):
        self.driver = driver
        
    def open(self,color):
        target_page = f'https://artsandculture.google.com/color?col={color}'
        self.driver.get(target_page)
        
    def scroll_down(self,scrolls):
        for j in range(0, scrolls):
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(8)  
            
    def js_details(self):
        script = '''
        var details = [...document.querySelector(".ve9nKb").querySelectorAll('li')];
        var info = details.reduce((acc,item)=>{
            const text = item.textContent;
            const chunks = text.split(':');
            const key = chunks[0];
            const value = chunks.splice(1).join("")
            return {...acc,[key]:value}
        },{})
        return JSON.stringify(info);
        '''
        details = self.driver.execute_script(script)
        return details
     
    def js_pages(self):
        scripts=[
            "var containers = [...document.querySelectorAll('.DuHQbc')];",
            'var img_elements = containers.map(contain => contain.firstElementChild);'
            'return img_elements.map(a => a.href)'
        ]
            
        script = self.mergeJS(scripts) 
        pages = self.driver.execute_script(script)
        return pages
        
    
    def js_images(self):
        scripts=[
            "var containers = [...document.querySelectorAll('.DuHQbc')];",
            'var img_elements= containers.map(contain => contain.firstElementChild);'
            'return img_elements.map(a =>window.getComputedStyle(a, false).backgroundImage)'
        ]
        script = self.mergeJS(scripts) 
        urls_raw = self.driver.execute_script(script)
        urls = list(filter(lambda url:url!='none',urls_raw))
        links = list(map(lambda url:url.split('"')[1],urls))
        print('missing:', len(urls_raw)-len(urls))
        return links
    
    def mergeJS(self,scripts):
        return ''.join(scripts)

class Storage:
    def __init__(self):
        self.storage = []
        
    #Add new register      
    def add(self, url="NULL",data="NULL",category="NULL"):
        register = {
            'url': str(url),
            'data': str(data),
            'category': str(category)
        }
        self.storage.append(register)
    
    def export(self,name):
        df = pd.DataFrame(self.storage)
        df.to_csv(name, index = False)


# Step 1. Configuration

In [14]:
#General Utils
def getPackages():
    print('\n'.join(f'{m.__name__}=={m.__version__}' for m in globals().values() if getattr(m, '__version__', None)))

In [15]:
getPackages()

pandas==1.4.3
numpy==1.23.1
selenium.webdriver==4.3.0


In [19]:
###CONSTANTS VARIABLES
COLORS = ["WHITE","PINK","YELLOW","PURPLE","BLUE","TEAL","GREEN","ORANGE","RED","BROWN","BLACK"]
URL= "https://artsandculture.google.com/color"
executable_path = r'chromedriver.exe'
SCROLL_DOWN = 40

In [23]:
def create_driver():
    driver = webdriver.Chrome(executable_path = executable_path)
    return driver

def webscraping(color):
    #Create instances.
    driver = create_driver()
    manager = Manager(driver)
    storage = Storage()
    
    #Open Page.
    manager.open(color)
    print('Scraping Color:', color)
    
    #Scroll page until the end.
    time.sleep(3)
    manager.scroll_down(40)
    time.sleep(7)
    
    #Get picture pages and image links
    pages =manager.js_pages()
    image_links= manager.js_images()
    
    #Show state of current page
    length_items = len(pages)
    
    print('Number of elements:', length_items, color)
    
    #For each picture, get theirs details.
    for i,(page,img_link) in enumerate(zip(pages,image_links)):
        #Show progress
        print('Progress:', i,' de ',length_items)
        
        #Open picture page ,get and store its information.
        driver.get(page)
        time.sleep(0.2)
        details = manager.js_details()
        storage.add(img_link,details,color) 
    #Export data
    filename = color+'.csv'
    print(filename)
    storage.export(filename)
    return True

In [24]:
if __name__ == '__main__':
    result =[]
    with ThreadPoolExecutor(max_workers=5) as exe: 
        print('Running')
        result = exe.map(webscraping,COLORS[6:])
print('End')

Running


  driver = webdriver.Chrome(executable_path = executable_path)


Scraping Color: PURPLE
Scraping Color: PINK
Scraping Color: TEAL
Scraping Color: BLUE
Scraping Color: YELLOW
missing: 0
Number of elements: 121 PURPLE
Progress: 0  de  121
missing: 0
Number of elements: 86 PINK
Progress: 0  de  86
missing: 30
Number of elements: 1245 TEAL
Progress: 0  de  1245
missing: 30
Number of elements: 1245 BLUE
Progress: 0  de  1245
missing: 0
Number of elements: 1013 YELLOW
Progress: 0  de  1013
Progress: 1  de  121
Progress: 1  de  1245
Progress: 1  de  86
Progress: 1  de  1013
Progress: 1  de  1245
Progress: 2  de  1245
Progress: 2  de  121
Progress: 2  de  1013
Progress: 2  de  86
Progress: 2  de  1245
Progress: 3  de  121
Progress: 3  de  1245
Progress: 3  de  1013
Progress: 3  de  86
Progress: 4  de  1245
Progress: 4  de  1013
Progress: 5  de  1245
Progress: 4  de  121
Progress: 4  de  86
Progress: 3  de  1245
Progress: 6  de  1245
Progress: 5  de  1013
Progress: 4  de  1245
Progress: 5  de  121
Progress: 6  de  1013
Progress: 7  de  1245
Progress: 5  de  

Progress: 86  de  1245
Progress: 59  de  121
Progress: 74  de  1245
Progress: 74  de  1013
Progress: 87  de  1245
Progress: 60  de  86
Progress: 60  de  121
Progress: 75  de  1245
Progress: 75  de  1013
Progress: 88  de  1245
Progress: 76  de  1245
Progress: 61  de  121
Progress: 61  de  86
Progress: 89  de  1245
Progress: 76  de  1013
Progress: 77  de  1245
Progress: 62  de  121
Progress: 90  de  1245
Progress: 77  de  1013
Progress: 78  de  1245
Progress: 62  de  86
Progress: 91  de  1245
Progress: 78  de  1013
Progress: 63  de  121
Progress: 79  de  1245
Progress: 92  de  1245
Progress: 79  de  1013
Progress: 63  de  86
Progress: 93  de  1245
Progress: 80  de  1245
Progress: 64  de  121
Progress: 80  de  1013
Progress: 94  de  1245
Progress: 81  de  1245
Progress: 65  de  121
Progress: 64  de  86
Progress: 95  de  1245
Progress: 82  de  1245
Progress: 66  de  121
Progress: 65  de  86
Progress: 81  de  1013
Progress: 96  de  1245
Progress: 83  de  1245
Progress: 66  de  86
Progress: 

Progress: 187  de  1245
Progress: 156  de  1245
Progress: 154  de  1013
Progress: 188  de  1245
Progress: 189  de  1245
Progress: 157  de  1245
Progress: 155  de  1013
Progress: 190  de  1245
Progress: 158  de  1245
Progress: 156  de  1013
Progress: 191  de  1245
Progress: 159  de  1245
Progress: 157  de  1013
Progress: 192  de  1245
Progress: 160  de  1245
Progress: 193  de  1245
Progress: 158  de  1013
Progress: 194  de  1245
Progress: 161  de  1245
Progress: 159  de  1013
Progress: 195  de  1245
Progress: 160  de  1013
Progress: 196  de  1245
Progress: 162  de  1245
Progress: 197  de  1245
Progress: 161  de  1013
Progress: 163  de  1245
Progress: 198  de  1245
Progress: 162  de  1013
Progress: 199  de  1245
Progress: 200  de  1245
Progress: 163  de  1013
Progress: 164  de  1245
Progress: 201  de  1245
Progress: 164  de  1013
Progress: 165  de  1245
Progress: 202  de  1245
Progress: 165  de  1013
Progress: 203  de  1245
Progress: 166  de  1245
Progress: 166  de  1013
Progress: 204  d

Progress: 260  de  1013
Progress: 259  de  1245
Progress: 321  de  1245
Progress: 261  de  1013
Progress: 260  de  1245
Progress: 262  de  1013
Progress: 322  de  1245
Progress: 261  de  1245
Progress: 263  de  1013
Progress: 323  de  1245
Progress: 262  de  1245
Progress: 264  de  1013
Progress: 263  de  1245
Progress: 324  de  1245
Progress: 265  de  1013
Progress: 264  de  1245
Progress: 325  de  1245
Progress: 266  de  1013
Progress: 265  de  1245
Progress: 267  de  1013
Progress: 266  de  1245
Progress: 326  de  1245
Progress: 268  de  1013
Progress: 267  de  1245
Progress: 269  de  1013
Progress: 327  de  1245
Progress: 268  de  1245
Progress: 270  de  1013
Progress: 328  de  1245
Progress: 269  de  1245
Progress: 271  de  1013
Progress: 329  de  1245
Progress:Progress: 272  de  1013
 270  de  1245
Progress: 330  de  1245
Progress: 271  de  1245
Progress: 273  de  1013
Progress: 331  de  1245
Progress: 272  de  1245
Progress: 274  de  1013
Progress: 273  de  1245
Progress: 332  d

Progress: 385  de  1013
Progress: 417  de  1245
Progress: 381  de  1245
Progress: 386  de  1013
Progress: 418  de  1245
Progress: 387  de  1013
Progress: 419  de  1245
Progress: 388  de  1013
Progress: 382  de  1245
Progress: 420  de  1245
Progress: 389  de  1013
Progress: 383  de  1245
Progress: 390  de  1013
Progress: 384  de  1245
Progress: 421  de  1245
Progress: 385  de  1245
Progress: 422  de  1245
Progress: 391  de  1013
Progress: 386  de  1245
Progress: 392  de  1013
Progress: 423  de  1245
Progress: 387  de  1245
Progress: 424  de  1245
Progress: 393  de  1013
Progress: 388  de  1245
Progress: 425  de  1245
Progress: 394  de  1013
Progress: 389  de  1245
Progress: 426  de  1245
Progress: 395  de  1013
Progress: 390  de  1245
Progress: 396  de  1013
Progress: 391  de  1245
Progress: 427  de  1245
Progress: 392  de  1245
Progress: 397  de  1013
Progress: 428  de  1245
Progress: 398  de  1013
Progress: 393  de  1245
Progress: 429  de  1245
Progress: 399  de  1013
Progress: 394  d

Progress: 517  de  1245
Progress: 507  de  1013
Progress: 501  de  1245
Progress: 518  de  1245
Progress: 508  de  1013
Progress: 502  de  1245
Progress: 519  de  1245
Progress: 509  de  1013
Progress: 503  de  1245
Progress: 520  de  1245
Progress: 510  de  1013
Progress: 504  de  1245
Progress: 521  de  1245
Progress: 505  de  1245
Progress: 522  de  1245
Progress: 506  de  1245
Progress: 523  de  1245
Progress: 511  de  1013
Progress: 507  de  1245
Progress: 512  de  1013
Progress: 524  de  1245
Progress: 508  de  1245
Progress: 513  de  1013
Progress: 525  de  1245
Progress: 509  de  1245
Progress: 514  de  1013
Progress: 526  de  1245
Progress: 510  de  1245
Progress: 515  de  1013
Progress: 511  de  1245
Progress: 527  de  1245
Progress: 516  de  1013
Progress: 512  de  1245
Progress: 528  de  1245
Progress: 517  de  1013
Progress: 518  de  1013
Progress: 513  de  1245
Progress: 529  de  1245
Progress: 514  de  1245
Progress: 519  de  1013
Progress: 530  de  1245
Progress: 515  d

Progress: 624  de  1013
Progress: 621  de  1245
Progress: 622  de  1245
Progress: 622  de  1245
Progress: 623  de  1245
Progress: 625  de  1013
Progress: 623  de  1245
Progress: 624  de  1245
Progress: 624  de  1245
Progress: 626  de  1013
Progress: 625  de  1245
Progress: 626  de  1245
Progress: 627  de  1013
Progress: 625  de  1245
Progress: 627  de  1245
Progress: 628  de  1013
Progress: 626  de  1245
Progress: 628  de  1245
Progress: 627  de  1245
Progress: 629  de  1013
Progress: 629  de  1245
Progress: 630  de  1245
Progress: 628  de  1245
Progress: 630  de  1013
Progress: 631  de  1245
Progress: 629  de  1245
Progress: 632  de  1245
Progress: 631  de  1013
Progress: 633  de  1245
Progress: 630  de  1245
Progress: 634  de  1245
Progress: 631  de  1245
Progress: 635  de  1245
Progress: 632  de  1013
Progress: 632  de  1245
Progress: 636  de  1245
Progress: 633  de  1245
Progress:Progress: 633  de  1013
 637  de  1245
Progress: 634  de  1245
Progress: 638  de  1245
Progress: 634  d

Progress: 735  de  1013
Progress: 736  de  1245
Progress: 738  de  1245
Progress: 736  de  1013
Progress: 737  de  1245
Progress: 738  de  1245
Progress: 739  de  1245
Progress: 737  de  1013
Progress: 739  de  1245
Progress:Progress: 740  de  1245
 738  de  1013
Progress: 740  de  1245
Progress: 741  de  1245
Progress: 739  de  1013
Progress:Progress: 741  de  1245
 742  de  1245
Progress: 740  de  1013
Progress: 743  de  1245
Progress: 742  de  1245
Progress: 744  de  1245
Progress: 741  de  1013
Progress: 743  de  1245
Progress: 745  de  1245
Progress: 742  de  1013
Progress: 746  de  1245
Progress: 744  de  1245
Progress: 747  de  1245
Progress: 743  de  1013
Progress: 745  de  1245
Progress: 748  de  1245
Progress: 744  de  1013
Progress: 749  de  1245
Progress: 746  de  1245
Progress: 747  de  1245
Progress: 750  de  1245
Progress: 745  de  1013
Progress: 751  de  1245
Progress: 748  de  1245
Progress: 746  de  1013
Progress: 752  de  1245
Progress: 749  de  1245
Progress: 753  d

Progress: 850  de  1245
Progress: 843  de  1245
Progress: 858  de  1013
Progress: 844  de  1245
Progress: 851  de  1245
Progress: 859  de  1013
Progress: 845  de  1245
Progress: 852  de  1245
Progress: 860  de  1013
Progress: 846  de  1245
Progress: 853  de  1245
Progress: 861  de  1013
Progress: 847  de  1245
Progress: 854  de  1245
Progress: 848  de  1245
Progress: 862  de  1013
Progress: 855  de  1245
Progress: 849  de  1245
Progress: 863  de  1013
Progress: 856  de  1245
Progress: 850  de  1245
Progress: 864  de  1013
Progress: 851  de  1245
Progress: 857  de  1245
Progress: 865  de  1013
Progress: 852  de  1245
Progress: 866  de  1013
Progress: 858  de  1245
Progress: 853  de  1245
Progress: 867  de  1013
Progress: 859  de  1245
Progress: 854  de  1245
Progress: 868  de  1013
Progress: 860  de  1245
Progress: 855  de  1245
Progress: 869  de  1013
Progress: 861  de  1245
Progress: 856  de  1245
Progress: 870  de  1013
Progress: 862  de  1245
Progress: 857  de  1245
Progress: 871  d

Progress: 969  de  1245
Progress: 963  de  1245
Progress: 970  de  1245
Progress: 961  de  1013
Progress: 964  de  1245
Progress: 971  de  1245
Progress: 965  de  1245
Progress: 962  de  1013
Progress: 972  de  1245
Progress: 966  de  1245
Progress: 963  de  1013
Progress: 973  de  1245
Progress: 967  de  1245
Progress: 964  de  1013
Progress: 974  de  1245
Progress: 968  de  1245
Progress: 965  de  1013
Progress: 975  de  1245
Progress: 969  de  1245
Progress: 976  de  1245
Progress: 966  de  1013
Progress: 970  de  1245
Progress: 977  de  1245
Progress: 967  de  1013
Progress: 971  de  1245
Progress: 978  de  1245
Progress: 968  de  1013
Progress: 972  de  1245
Progress: 979  de  1245
Progress: 969  de  1013
Progress: 973  de  1245
Progress: 980  de  1245
Progress: 974  de  1245
Progress: 970  de  1013
Progress: 981  de  1245
Progress: 975  de  1245
Progress: 971  de  1013
Progress: 982  de  1245
Progress: 972  de  1013
Progress: 976  de  1245
Progress: 973  de  1013
Progress: 983  d

Progress: 1112  de  1245
Progress: 1100  de  1245
Progress: 1113  de  1245
Progress: 1101  de  1245
Progress: 1102  de  1245
Progress: 1114  de  1245
Progress: 1115  de  1245
Progress: 1103  de  1245
Progress: 1116  de  1245
Progress: 1117  de  1245
Progress: 1104  de  1245
Progress: 1105  de  1245
Progress: 1118  de  1245
Progress: 1106  de  1245
Progress: 1107  de  1245
Progress: 1119  de  1245
Progress: 1108  de  1245
Progress: 1120  de  1245
Progress: 1121  de  1245
Progress: 1109  de  1245
Progress: 1122  de  1245
Progress: 1110  de  1245
Progress: 1123  de  1245
Progress: 1124  de  1245
Progress: 1111  de  1245
Progress: 1112  de  1245
Progress: 1125  de  1245
Progress: 1113  de  1245
Progress: 1126  de  1245
Progress: 1114  de  1245
Progress: 1127  de  1245
Progress: 1115  de  1245
Progress: 1128  de  1245
Progress: 1116  de  1245
Progress: 1129  de  1245
Progress: 1117  de  1245
Progress: 1130  de  1245
Progress: 1118  de  1245
Progress: 1131  de  1245
Progress: 1119  de  1245


In [9]:
[0,1,2,3,4,5][1:6]

[1, 2, 3, 4, 5]