# Web scraping

# First example
## Importation of module parts 

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys #used to simulate some keyboard keys (Alt, Tab, etc.)
from selenium.webdriver.common.by import By #used to locate elements on website

## Accessing a URL
We access a URL with the `get()` method. 

In [2]:
driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance 
driver.get("http://www.python.org") #navigate to this URL, wait until it is loaded
assert "Python" in driver.title #check if python is in the title

  driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance


## Getting things done

In [None]:
elem = driver.find_element(By.NAME, "q") #finds element from its name attribute
elem.clear() #we clear all potential text in the input element
elem.send_keys("pycon") #then we type our things
elem.send_keys(Keys.RETURN) #and we execute
assert "No results found." not in driver.page_source #to be sure that something is found
driver.close() #the close method closes the tab. Here there is only one so it's equivalent to the quit() method

In [3]:
driver.close() #When our operations are over, we need to close the driver. 

## Finding elements

-  In order to locate an element like the following : `<input type="text" name="passwd" id="passwd-id" />`,  we can use all these different commands : 

``` 
element = driver.find_element(By.ID, "passwd-id")
element = driver.find_element(By.NAME, "passwd")
element = driver.find_element(By.XPATH, "//input[@id='passwd-id']")
element = driver.find_element(By.CSS_SELECTOR, "input#passwd-id")
```

## Interacting with elements
-  One can input some text into an element : 
`element.send_keys("some text")`

-  Text parts append themselves and do not automatically clear themselves. Then, we need to use the `clear()` method. 

- We can navigate through the driver using special keys : 
```
element.send_keys(" and some", Keys.ARROW_DOWN)
```

## About forms
-  Further than with just text inputs, one can interact with options for example : 
```
element = driver.find_element(By.XPATH, "//select[@name='name']")
all_options = element.find_elements(By.TAG_NAME, "option")
for option in all_options:
    print("Value is: %s" % option.get_attribute("value"))
    option.click()
```
-  One can unselect elements : 
```
select = Select(driver.find_element(By.ID, 'id'))
select.deselect_all()
```

-  The `selenium.webdriver.support.ui` package includes specific useful methods for Select objects. 
```
#A few examples
select = Select(driver.find_element(By.NAME, 'name'))
select.select_by_index(index)
select.select_by_visible_text("text")
select.select_by_value(value)
```
-  We can access all selected options for specific drivers as follow : 
```
select = Select(driver.find_element(By.XPATH, "//select[@name='name']"))
all_selected_options = select.all_selected_options
```

We can, in the contrary, unselect all elements from a `SELECT` object: 
```
select = Select(driver.find_element(By.ID, 'selector'))
select.deselect_all()
```
One can access all selected options via the followiing command : `options = select.options`

Eventually, there are different methods to submit a form : 

-  One can do it manually assuming the submit button is identified : `#driver.find_element_by_id("submit").click()` 

-  Or let python try to find the enclosing form and submit it, with : `element.submit()`

# Form example : 

Let's try to automate the answer to Lydia collect links. We got two examples : 

-  The form for a paintball competition : https://collecte.io/paintball-inter-assos-2091521/fr

-  The form for the famous escape week : https://collecte.io/shotgun-escape-week-1987144/fr

# Paintball

## Accessing URL

In [9]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys #used to simulate some keyboard keys (Alt, Tab, etc.)
from selenium.webdriver.common.by import By #used to locate elements on website

In [4]:
driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance 
driver.get("https://collecte.io/paintball-inter-assos-2091521/fr") #navigate to this URL, wait until it is loaded

The general expression to locate elements by their Xpath is as follows : `XPath = //tagname[@Attribute=’Value’]`

In [15]:
#Example with non generic method
paint_name = "//*[@id='val1']" #not general...
element = driver.find_element(By.XPATH, paint_name)


## We'll try to get all inputs completed step by step: 

-  We'll first create the driver

In [92]:
driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance 
driver.get("https://collecte.io/paintball-inter-assos-2091521/fr") #navigate to this URL, wait until it is loaded

  driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance


-  Then, we'll get all labels elements in a list

In [93]:
path = '//div/label'
labels = driver.find_elements(By.XPATH, path)

-  Then, we need a dictionary with all elements and their attribute

In [3]:
dic = {
    'Nom' : 'Caetano',
    'Prénom' : 'Hugo',
    'Numéro de téléphone' : '0619372524',
    'Adresse email' : 'hugocaetano78800@gmail.com',
    'Asso' : 'BDX'
}

In [5]:
def completer(dic, labels):
    """The completer function completes all fields from the dictionary in the elements wich are linked to the 
    labels list's elements. If there is no field that corresponds to an element in dic, it won't return an error 
    message. Then, 'mieux vaut trop que pas assez'"""
    for i in list(dic.keys()) :
        for j in labels:
            path_j = "//*[@id='" + j.get_attribute('for') + "']"
            element_j = driver.find_element(By.XPATH, path_j) #getting the element associated with the j-th label
            if i in j.text:
             element_j.clear() #we clear all potential text in the input element
             attrib_ij = dic.get(i)
             element_j.send_keys(attrib_ij)    

In [94]:
completer(dic, labels) 

We now have all our wanted fields completed. Let's **click on the submit button.**

In [95]:
submit = driver.find_element(By.ID, 'submit-state-lydia') #finding the submit button
submit.click()

# Complete set of options

-  There are various types of answers on a Lydia collect. Here we were able to complete some basic text inputs. Now, let's try to complete multiple option questions. 

-  If, at the end of our completion, there are still some inputs left, we will want to input some random things inside them in order to submit the file. 

We'll use this complete collect to try our code : https://collecte.io/test-pour-bot-lydia-2225285/fr

In [3]:
driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance 
driver.get("https://collecte.io/test-pour-bot-lydia-2225285/fr") #navigate to this URL, wait until it is loaded") #navigate to this URL, wait until it is loaded

  driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance


We can start by :

-  getting all elements with a label
-  completing all the basic fields 

In [6]:
path = '//div/label'
labels = driver.find_elements(By.XPATH, path) #getting elements

dic = {
    'Nom' : 'Caetano',
    'Prénom' : 'Hugo',
    'Numéro de téléphone' : '0619372524',
    'Adresse email' : 'hugocaetano78800@gmail.com',
    'Asso' : 'BDX'
} #listing the fields to complete

completer(dic=dic, labels=labels)#completing the basic fields !

This works well, there are three things left to do: 

-  Add some slider keys and values that we know we will have to complete
-  Add some code to auto complete with the first option an empty slider
-  Add some code to auto complete with random text an empty text field

## Slider key and attribute completion

Let's start investigating. We'll then build a completer-like function.

In [39]:
driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance 
driver.get("https://collecte.io/test-pour-bot-lydia-2225285/fr") #navigate to this URL, wait until it is loaded") #navigate to this URL, wait until it is loaded

  driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance


In [59]:
path = '//div/label'
elements = driver.find_elements(By.XPATH, path)

Actually a select input is very similar to a text input in a Lydia form.

It's just that the input tag is now a select tag with various option tags

In [10]:
elements[4].get_attribute('for') #Here we have our relevant id
option_path = "//*[@id='val5']"
selector = driver.find_element(By.XPATH, option_path)
navette_path = "//option[@value='Navette 20h']"
navette = selector.find_element(By.XPATH, navette_path)
navette.click()

Okkkk, well this works. Now this is a strict usage because if the option doesn't have the exact value we're inputting, nothing will happen.

I think that one good solution would be to provide one quick bot but with strict input names needed, and a slower bot which would for example compute an accuracy score for all inputs to see which one is the nearest to our input, and then complete it with our given attribute. 

Let's finish the simple version. 

Actually, we just need to input the selector name and its attribute in our entry dictionary ! Let's try it. 

In [83]:
new_dic = {
    'Nom' : 'Caetano',
    'Prénom' : 'Hugo',
    'Numéro de téléphone' : '0619372524',
    'Adresse email' : 'hugocaetano78800@gmail.com',
    'Asso' : 'BDX',
    'Choix du tarif' : 'Navette 20h',
    'Un petit mot chiant ?' : 'Non ça ira'
} #listing the fields to complete

path = '//div/label'
elements = driver.find_elements(By.XPATH, path)

Actually, we just need one or two more lines in our function ! One way to differentiate a scrolling input from a text input is to check is .text value

In [86]:
def all_completer(dic, elements):
    """The completer function completes all fields from the dictionary in the elements wich are linked to the 
    labels list's elements. If there is no field that corresponds to an element in dic, it won't return an error 
    message. Then, 'mieux vaut trop que pas assez'"""
    for i in list(dic.keys()) :
        for j in elements:
            path_j = "//*[@id='" + j.get_attribute('for') + "']"
            element_j = driver.find_element(By.XPATH, path_j) #getting the element associated with the j-th label
            if i in j.text:
                if not "\n" in element_j.text : #it is a text input
                    element_j.clear() #we clear all potential text in the input element
                    attrib_ij = dic.get(i)
                    element_j.send_keys(attrib_ij)  
                elif "\n" in element_j.text : #it is a scrolling menu 
                    attrib_ij = dic.get(i) #it is the value we want to select
                    menu_el_path_i = "//option[@value='" + attrib_ij + "'" #we find the clickable element
                    option_i = element_j.find_element(By.XPATH, navette_path) #we need to find IN the selector we already have
                    option_i.click()

In [92]:
all_completer(dic=dic, elements=elements)#completing the basic fields !

[<selenium.webdriver.remote.webelement.WebElement (session="149c88e036b6e79c35aa710df3aa548b", element="6ddeddd4-d2e2-4a7f-882f-c74eb80e07c0")>, <selenium.webdriver.remote.webelement.WebElement (session="149c88e036b6e79c35aa710df3aa548b", element="27a66b8a-d72e-40bf-a301-b49f4c1bfc8e")>, <selenium.webdriver.remote.webelement.WebElement (session="149c88e036b6e79c35aa710df3aa548b", element="1618dbe2-a918-4195-bb93-ebc7cded9ea7")>, <selenium.webdriver.remote.webelement.WebElement (session="149c88e036b6e79c35aa710df3aa548b", element="143cfef9-d178-4f13-ae90-736aa4fc75ff")>]


When we are done, we have to close the driver. 

In [47]:
driver.close()

## Note :
This code works ! The advantage of our function is that an input field with not the exact name, but only containing the key we give in our dictionary would work : 

In [116]:
incomplete_dic = {
    'Nom' : 'Caetano',
    'Prénom' : 'Hugo',
    'Numéro' : '0619372524', #not Numéro de téléphone
    'mail' : 'hugocaetano78800@gmail.com', #Not adresse e-mail
    'Asso' : 'BDX',
    'Choix du tarif' : '20h', #Not navette 20h
    'Un petit mot chiant ?' : 'Non ça ira'
} #listing the fields to complete
path = '//div/label'
driver = webdriver.Chrome(executable_path="/Users/hugocaetano/Desktop/lydia_collect_automator/chromedriver") #creates a chrome web driver instance 
driver.get("https://collecte.io/test-pour-bot-lydia-2225285/fr") #navigate to this URL, wait until it is loaded
elements = driver.find_elements(By.XPATH, path)

  driver = webdriver.Chrome(executable_path="/Users/hugocaetano/Desktop/lydia_collect_automator/chromedriver") #creates a chrome web driver instance


It works ! Then we just need to complete vaquant fields. 

In [None]:
all_completer(dic=incomplete_dic, elements=elements)

In [87]:
driver.close()

## Auto-complete unrecognized text fields

We need to keep track of which label we have modified, and then at the end of our completer we will complete randomly the not-modified fields. 

In [15]:
def true_completer(dic, elements):
    """The completer function completes all fields from the dictionary in the elements wich are linked to the 
    labels list's elements. If there is no field that corresponds to an element in dic, it won't return an error 
    message. Then, 'mieux vaut trop que pas assez'"""
    was_completed = []
    for i in list(dic.keys()) :
        for j in elements:
            path_j = "//*[@id='" + j.get_attribute('for') + "']"
            element_j = driver.find_element(By.XPATH, path_j) #getting the element associated with the j-th label
            if i in j.text:
                if not "\n" in element_j.text : #it is a text input
                    element_j.clear() #we clear all potential text in the input element
                    attrib_ij = dic.get(i)
                    element_j.send_keys(attrib_ij)  
                elif "\n" in element_j.text : #it is a scrolling menu 
                    attrib_ij = dic.get(i) #it is the value we want to select
                    menu_el_path_i = "//option[@value='" + attrib_ij + "'" #we find the clickable element
                    option_i = element_j.find_element(By.XPATH, navette_path) #we need to find IN the selector we already have
                    option_i.click()
                was_completed.append(j)
    return was_completed

In [117]:
was_comp = true_completer(dic=dic, elements=elements)#completing the basic fields !

In [119]:
uncomp = list(set(elements) - set(was_comp))

In [126]:
uncomp[2].text #champ de texte non complété
path_uncomp = "//*[@id='" + uncomp[2].get_attribute('for') + "']"
element_uncomp = driver.find_element(By.XPATH, path_uncomp) #on doit choper son input 

In [128]:
element_uncomp.send_keys("je ne sais pas")

ok donc pour les champs de texte ça va être simple. réglons le pb des menus déroulants. 

In [131]:
uncomp[0].text correspond à un menu déroulant. 

'Choix du tarif'

On doit encore une fois choper le sélecteur ! 

In [140]:
first_option = "//option[2]" #First option is always Sélectionner
option_i = uncomp[0].find_element(By.XPATH, first_option) #we need to find IN the selector we already have

In [141]:
option_i.text

'Navette 20h'

In [143]:

option_i.click()

In [144]:
driver.close()

Alright then I think we have all the theory. Now, we just need to build that last function :)

In [193]:
def true_completer(dic, elements):
    """The completer function completes all fields from the dictionary in the elements wich are linked to the 
    labels list's elements. If there is no field that corresponds to an element in dic, it won't return an error 
    message. Then, 'mieux vaut trop que pas assez'"""
    was_completed = []
    for i in list(dic.keys()) :
        for j in elements:
            path_j = "//*[@id='" + j.get_attribute('for') + "']"
            element_j = driver.find_element(By.XPATH, path_j) #getting the element associated with the j-th label
            if i in j.text:
                if not "\n" in element_j.text : #it is a text input
                    element_j.clear() #we clear all potential text in the input element
                    attrib_ij = dic.get(i)
                    element_j.send_keys(attrib_ij)  
                elif "\n" in element_j.text : #it is a scrolling menu 
                    attrib_ij = dic.get(i) #it is the value we want to select
                    menu_el_path_i = "//option[@value='" + attrib_ij + "'" #we find the clickable element
                    option_i = element_j.find_element(By.XPATH, navette_path) #we need to find IN the selector we already have
                    option_i.click()
                was_completed.append(j)
    uncomp = list(set(elements) - set(was_completed)) #all untouched elements
    for k in uncomp : #We'll complete all empty fields ! Else we won't be able to sumbit the form
        path_k = "//*[@id='" + k.get_attribute('for') + "']"
        uncomp_k = driver.find_element(By.XPATH, path_k)
        if not "\n" in uncomp_k.text : #it is a text input
                    uncomp_k.clear() #we clear all potential text in the input element
                    uncomp_k.send_keys("Je ne sais pas !")  
        elif "\n" in uncomp_k.text : #it is a scrolling menu 
            first_option_path_k = path_k + "/option[2]" #it's safe bc first option is title and even if no title, two options at minimum
            first_option_k = uncomp_k.find_element(By.XPATH, first_option_path_k)
            first_option_k.click()

Well well well, let's try this. 

In [196]:
incomplete_dic = {
    'Nom' : 'Caetano',
    'Prénom' : 'Hugo',
    'Numéro' : '0619372524', #not Numéro de téléphone
    'mail' : 'hugocaetano78800@gmail.com', #Not adresse e-mail
    'Asso' : 'BDX',
    'Choix du tarif' : '20h', #Not navette 20h
    'Un petit mot chiant ?' : 'Non ça ira'
} #listing the fields to complete
path = '//div/label'
driver = webdriver.Chrome(executable_path="/Users/hugocaetano/Desktop/lydia_collect_automator/chromedriver") #creates a chrome web driver instance 
driver.get("https://collecte.io/test-pour-bot-lydia-2225285/fr") #navigate to this URL, wait until it is loaded
elements = driver.find_elements(By.XPATH, path)

  driver = webdriver.Chrome(executable_path="/Users/hugocaetano/Desktop/lydia_collect_automator/chromedriver") #creates a chrome web driver instance


In [197]:
true_completer(dic = incomplete_dic, elements = elements)

k : question random
uncomp : Sélectionner
réponse 1 super drôle
réponse 2 moins drôle
first_option : réponse 1 super drôle


In [172]:
k = elements[5]
path_k = k.get_attribute('for')

In [174]:
path_k = "//*[@id='" + path_k + "']"

In [175]:
path_k

"//*[@id='val6']"

In [179]:
uncompk = k.find_element(By.XPATH, path_k)

In [190]:
first_option_path_k = "//*[@id='val6']/option[2]" 

In [191]:
#it's safe bc first option is title and even if no title, two options at minimum
first_option_k = uncompk.find_element(By.XPATH, first_option_path_k)

In [192]:
first_option_k.text

'réponse 1 super drôle'

In [None]:
uncompk = k.find_element(By.XPATH, path_k)

In [None]:
path_k = "//*[@id='" + k.get_attribute('for') + "']"
uncomp_k = driver.find_element(By.XPATH, path_k)

In [160]:
elements[-2].text

'question random'

In [195]:
driver.close()

Okkkkkkkkkkk we have it !!! 

Experiments on load options : 

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys #used to simulate some keyboard keys (Alt, Tab, etc.)
from selenium.webdriver.common.by import By #used to locate elements on website
import time
from selenium.webdriver.chrome.options import Options #We need this to prevent notifications and pop-ups
from selenium.common.exceptions import NoSuchElementException #to try and catch the error we need to import it !
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 

In [3]:
driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance 

  driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance


In [4]:
driver.get("https://collecte.io/simulation-de-sat-2231239/fr")

In [5]:
caps = DesiredCapabilities().CHROME
caps["pageLoadStrategy"] = "none"   # Just wait until page is interactive

In [6]:
driver2 = webdriver.Chrome(desired_capabilities=caps, executable_path='/Users/hugocaetano/Desktop/lydia_collect_automator/chromedriver')

  driver2 = webdriver.Chrome(desired_capabilities=caps, executable_path='/Users/hugocaetano/Desktop/lydia_collect_automator/chromedriver')


In [7]:
driver2.get("https://collecte.io/simulation-de-sat-2231239/fr")

In [8]:
path = '//div/label'
elements = driver.find_elements(By.XPATH, path)

In [9]:
path = '//div/label'
elements = driver2.find_elements(By.XPATH, path)

In [14]:
driver.close(); driver2.close()

In [12]:
driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance 
driver.get("https://collecte.io/simulation-de-sat-2231239/fr")
path = '//div/label'
elements = driver.find_elements(By.XPATH, path)

  driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance


In [13]:
driver2 = webdriver.Chrome(desired_capabilities=caps, executable_path='/Users/hugocaetano/Desktop/lydia_collect_automator/chromedriver')
driver2.get("https://collecte.io/simulation-de-sat-2231239/fr")
path = '//div/label'
elements = driver2.find_elements(By.XPATH, path)

  driver2 = webdriver.Chrome(desired_capabilities=caps, executable_path='/Users/hugocaetano/Desktop/lydia_collect_automator/chromedriver')


from just this step, the none option actually makes sense. 

In [17]:
import functions as fun 
dic = { #Here you have to give your personal pieces of information that could be required in the form
    'Nom' : 'Caetano',
    'Prénom' : 'Hugo',
    'Numéro de téléphone' : '0619372524',
    'Adresse email' : 'hugocaetano78800@gmail.com',
    'Asso' : 'BDX',
    'Numéro étudiant' : '20213566',
    'Promo' : '3A'
}

In [18]:
driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance 
driver.get("https://collecte.io/simulation-de-sat-2231239/fr")
fun.true_completer(driver = driver, dic = dic)
driver.close()

  driver = webdriver.Chrome(executable_path="~/Desktop/proj/chromedriver") #creates a chrome web driver instance


In [19]:
driver2 = webdriver.Chrome(desired_capabilities=caps, executable_path='/Users/hugocaetano/Desktop/lydia_collect_automator/chromedriver')
driver2.get("https://collecte.io/simulation-de-sat-2231239/fr")
fun.true_completer(driver = driver2, dic = dic)
driver2.close()

  driver2 = webdriver.Chrome(desired_capabilities=caps, executable_path='/Users/hugocaetano/Desktop/lydia_collect_automator/chromedriver')


Ok, so my first solution was indeed working. 