# Scraping with Selenium

Selenium is a tool initially created to automate tests on websites. It is therefore very useful when information is accessible by clicking on links. A button for example is an element from which it is very difficult to obtain the link. BeautifulSoup then becomes limited.
In this case, use Selenium.

### Load libraries

If you are missing any libraries in the next cell, you'll need to install them before continuing.

In [2]:
import bs4
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import json
import re
import lxml.html
import time
import random
from random import randint
import logging
import collections
from time import gmtime, strftime

import re
from tabulate import tabulate
import os

date = strftime("%Y-%m-%d")

### Install Selenium according to this manual

https://selenium-python.readthedocs.io/installation.html#downloading-python-bindings-for-selenium/bin

*NB: On Linux, put your `geckodriver` (the downloaded extension) in the equivalent path on your machine into `/home/<YOUR_NAME>/.local/bin/`*

We will simulate a search on the official Python website.

In [3]:
import selenium

# The selenium.webdriver module provides all the implementations of WebDriver
# Currently supported are Firefox, Chrome, IE and Remote. The `Keys` class provides keys on
# the keyboard such as RETURN, F1, ALT etc.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service # # I imported Service for resolving a error. DeprecationWarning: executable_path has been deprecated
from selenium.webdriver.common.by import By  # I imported By for resolving a error.
from selenium.webdriver.chrome.options import Options

options = Options()

url = 'http://www.python.org'

# Here, we create instance of Firefox WebDriver.
# driver = webdriver.Firefox(executable_path=r"C:/webdrivers/geckodriver.exe")

# Here, we create instance of Chrome WebDriver.
# driver = webdriver.Chrome(executable_path=r"C:/webdrivers/chromedriver.exe")

## Instead of the above, I inserted thE following lines to eliminate the error: 
## "DeprecationWarning: executable_path has been deprecated, please pass in a Service object":

#path = ("C:/webdrivers/chromedriver.exe")
#s = Service(path)
#driver = webdriver.Chrome(service=s)

options.add_argument('--start-maximized')
options.add_argument('--disable-infobars')
path = ("C:/webdrivers/chromedriver.exe")
s = Service(path)
driver = webdriver.Chrome(options=options,service=s)
driver.implicitly_wait(30)
driver.get(url)

# The driver.get method will lead to a page given by the URL. WebDriver will wait until the page is fully
# loaded (i.e. the "onload" event has been triggered) before returning the control to your script.
# It should be noted that if your page uses a lot of AJAX calls when loading, WebDriver may not know
# when it was fully loaded.
driver.get("http://www.python.org")

# The following line is a statement confirming that the title contains the word "Python".
assert "Python" in driver.title

# WebDriver offers several methods to search for items using one of the methods
# `find_element_by_...` .
# For example, the input text element can be located by its name attribute by
# using the `find_element_by_name` method.

#elem = driver.find_element_by_name("q")
elem = driver.find_element(By.NAME, "q")

# Then we send keys. This is similar to entering keys using your keyboard.
# Special keys can be sent using the `Keys` class imported in line 7 (from selenium.webdriver.common.keys import Keys).
# For security reasons, we will delete any pre-filled text in the input field
# (for example, "Search") so that it does not affect our search results:
elem.clear()
elem.send_keys("pycon")
elem.send_keys(Keys.RETURN)

# After submitting the page, you should get the result if there is one. To ensure that certain results
# are found, make an assertion:
assert "No results found." not in driver.page_source
driver.close()

#### Open the source code of the webpage and check that the search area (field) is called "q".

Note_yuri: search directly on the website: search area (field) is called "q" (name="q")

In [3]:
<input id="id-search-field" name="q" type="search" role="textbox" class="search-field" placeholder="Search" value="" tabindex="1">

SyntaxError: invalid syntax (192146405.py, line 1)

### Getting a phone number from *leboncoin*

In [None]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

options = Options()

url = "https://www.leboncoin.fr/sports_hobbies/1536839557.htm/"

# Here, we create instance of Firefox WebDriver.
# driver = webdriver.Firefox(executable_path=r"C:/webdrivers/geckodriver.exe")


# Here, we create instance of Chrome WebDriver.

options.add_argument('--start-maximized')
options.add_argument('--disable-infobars')
path = ("C:/webdrivers/chromedriver.exe")
s = Service(path)
driver = webdriver.Chrome(options=options,service=s)
driver.implicitly_wait(30)
driver.get(url)

print(driver.current_url)

python_button = driver.find_elements_by_xpath('//div[@data-reactid="269"]')[0]
python_button.click()

## I update the previous code (because an error):
python_button = driver.find_elements(by=By.XPATH, value='//div[@data-reactid="269"]')[0]    #269
python_button.click()

# And then we use Beautiful soup
soup = BeautifulSoup(driver.page_source)

driver.close()

for elem in soup.find_all("a", attrs={"data-qa-id": "adview_number_phone_contact"}):
    print(elem.text)

# an unestable website, I will try with another:

https://www.leboncoin.fr/sports_hobbies/1536839557.htm/


IndexError: list index out of range

In [6]:
# Other website Inmoweb: Precio

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

options = Options()

# De original website was "https://www.leboncoin.fr/sports_hobbies/1536839557.htm/" 
# I changed it because it is an unstable website. Couch recommendation
# Extract the sale price of the house.

url = "https://www.immoweb.be/en/classified/farmhouse/for-sale/clavier/4560/9601088?searchId=627a3d9037a5e"

options.add_argument('--start-maximized')
options.add_argument('--disable-infobars')
path = ("C:/webdrivers/chromedriver.exe")
s = Service(path)
driver = webdriver.Chrome(options=options,service=s)
driver.implicitly_wait(30)
driver.get(url)

print(driver.current_url)

# Using selenium  # previous code # only as reference # I need check it
#python_button = driver.find_elements_by_xpath('//div[@data-reactid="269"]')[0]
#python_button.click()

# And then we use Beautiful soup
soup = BeautifulSoup(driver.page_source)
    
driver.close()

for elem in soup.find_all("span", attrs={"sr-only"})[0]:
    print(elem.text)

https://www.immoweb.be/en/classified/farmhouse/for-sale/clavier/4560/9601088?searchId=627a3d9037a5e
149000€


### Starting from *leboncoin*, collect all the information available to define the product being sold. Use `selenium` for the telephone number.

### API (Application Program Interface)

A set of tools and methods that allow different applications to interact with each other. In the case of a web service, we can retrieve data dynamically. By using an API correctly, we can thus obtain in real time, the modifications made on a "parent" site.

For example, we will retrieve online news, for example from the "L'équipe" website.

Follow the instructions at https://newsapi.org/s/lequipe-api to retrieve an "API key" connection key

Your API key is: `73bbb95f8ecb49b499113a46481b4af1`


It is frequent that a key does not work after a while (e.g. `5 min`n `30 min`, a day, ...)
So don't jump up if you get an error message back.


Yuri Key: `cfe80559939a4e39a4adf795e64e32fd`

In [6]:
import requests

key = "cfe80559939a4e39a4adf795e64e32fd"
url = "https://newsapi.org/v2/top-headlines?sources=lequipe&apiKey=" + key
response = requests.get(url)

# Here the response format is a json file, it is used as a dictionary
print(response.json())

{'status': 'ok', 'totalResults': 10, 'articles': [{'source': {'id': 'lequipe', 'name': "L'equipe"}, 'author': "L'EQUIPE", 'title': 'La FFF défend le dispositif mis en place pour la finale de la Ligue des champions', 'description': "Dans un communiqué publié ce mardi soir, la Fédération française de football a défendu mardi le dispositif renforcé d'accueil mis en oeuvre samedi pour la finale de la Ligue des Champions au Stade de France, reprenant l'évaluation contestée du gouvernement de…", 'url': 'https://www.lequipe.fr/Football/Actualites/La-fff-defend-le-dispositif-mis-en-place-pour-la-finale-de-la-ligue-des-champions/1336242', 'urlToImage': 'https://medias.lequipe.fr/img-photo-jpg/certains-supporters-de-liverpool-n-avaient-pu-acceder-a-la-rencontre-face-au-real-madrid-c-wilson-pr/1500000001649484/0:0,1998:1332-640-427-75/ffb1d.jpg', 'publishedAt': '2022-05-31T18:42:00+00:00', 'content': 'Dans un communiqué publié ce mardi soir, la FFF affirme que 110\xa0000 personnes ont été dénombr

In [7]:
dictionnary = response.json()
print(dictionnary.keys())

dict_keys(['status', 'totalResults', 'articles'])


In [20]:
for element in list(dictionnary.keys()):
    print("==============================================")
    print("Key: ", element, "// Values: ", dictionnary[element])
    print("++++++++++++++++++++++++++++++++++++++++++++++")

Key:  status // Values:  ok
++++++++++++++++++++++++++++++++++++++++++++++
Key:  totalResults // Values:  10
++++++++++++++++++++++++++++++++++++++++++++++
Key:  articles // Values:  [{'source': {'id': 'lequipe', 'name': "L'equipe"}, 'author': "L'EQUIPE", 'title': 'La FFF défend le dispositif mis en place pour la finale de la Ligue des champions', 'description': "Dans un communiqué publié ce mardi soir, la Fédération française de football a défendu mardi le dispositif renforcé d'accueil mis en oeuvre samedi pour la finale de la Ligue des Champions au Stade de France, reprenant l'évaluation contestée du gouvernement de…", 'url': 'https://www.lequipe.fr/Football/Actualites/La-fff-defend-le-dispositif-mis-en-place-pour-la-finale-de-la-ligue-des-champions/1336242', 'urlToImage': 'https://medias.lequipe.fr/img-photo-jpg/certains-supporters-de-liverpool-n-avaient-pu-acceder-a-la-rencontre-face-au-real-madrid-c-wilson-pr/1500000001649484/0:0,1998:1332-640-427-75/ffb1d.jpg', 'publishedAt': '20

In [10]:
# And now we have lists in dictionaries(it's a JSON file actually but it's very similar)
# We will discover the information of the article key.

for element in enumerate(dictionnary["articles"]):
    print("=================================")
    print(element)

(0, {'source': {'id': 'lequipe', 'name': "L'equipe"}, 'author': "L'EQUIPE", 'title': 'La FFF défend le dispositif mis en place pour la finale de la Ligue des champions', 'description': "Dans un communiqué publié ce mardi soir, la Fédération française de football a défendu mardi le dispositif renforcé d'accueil mis en oeuvre samedi pour la finale de la Ligue des Champions au Stade de France, reprenant l'évaluation contestée du gouvernement de…", 'url': 'https://www.lequipe.fr/Football/Actualites/La-fff-defend-le-dispositif-mis-en-place-pour-la-finale-de-la-ligue-des-champions/1336242', 'urlToImage': 'https://medias.lequipe.fr/img-photo-jpg/certains-supporters-de-liverpool-n-avaient-pu-acceder-a-la-rencontre-face-au-real-madrid-c-wilson-pr/1500000001649484/0:0,1998:1332-640-427-75/ffb1d.jpg', 'publishedAt': '2022-05-31T18:42:00+00:00', 'content': 'Dans un communiqué publié ce mardi soir, la FFF affirme que 110\xa0000 personnes ont été dénombrées aux abords du Stade de France alors que 75

In [17]:
# So if we keep going, it gives us another dictionary!
for element in dictionnary["articles"][0].keys():
    print("=================================")
    print("Key:", element , "Values:", dictionnary["articles"][0][element])

Key: source Values: {'id': 'lequipe', 'name': "L'equipe"}
Key: author Values: L'EQUIPE
Key: title Values: La FFF défend le dispositif mis en place pour la finale de la Ligue des champions
Key: description Values: Dans un communiqué publié ce mardi soir, la Fédération française de football a défendu mardi le dispositif renforcé d'accueil mis en oeuvre samedi pour la finale de la Ligue des Champions au Stade de France, reprenant l'évaluation contestée du gouvernement de…
Key: url Values: https://www.lequipe.fr/Football/Actualites/La-fff-defend-le-dispositif-mis-en-place-pour-la-finale-de-la-ligue-des-champions/1336242
Key: urlToImage Values: https://medias.lequipe.fr/img-photo-jpg/certains-supporters-de-liverpool-n-avaient-pu-acceder-a-la-rencontre-face-au-real-madrid-c-wilson-pr/1500000001649484/0:0,1998:1332-640-427-75/ffb1d.jpg
Key: publishedAt Values: 2022-05-31T18:42:00+00:00
Key: content Values: Dans un communiqué publié ce mardi soir, la FFF affirme que 110 000 personnes ont été d

### Make a script that allows you to take details of the last ten news from the team or another site. Store them in a nice CSV or excel file.

In [None]:
# I need to modify the code 
# it is an idea

import time
import random
from random import randint

# Here are the things you will have to do for all links:
# - Slow down the frequency of requests to avoid being identified and therefore banned from the website.
# Use `time.sleep(random.uniform(1.0, 2.0))`
# - Get request object from URL
# - Extract the content into a variable using BeautifulSoup
# - Get title
# - Get synopsis

title = []
synopsis = []


for movie_url in movie_links:
    r = requests.get(movie_url)
    print(movie_url, r.status_code)
    soup = BeautifulSoup(r.content, "lxml")

    for elem in soup.findAll("div", attrs={"class": "titlebar-title titlebar-title-lg"}):
        title.append(elem.text)
        print(elem.text)
        
    for elem in soup.findAll("div", attrs={"class": "content-txt"}):
        synopsis.append(elem.text)
        print(elem.text)
        
        break

time.sleep(random.uniform(1.0, 2.0))


# Check the length of the lists before creating the dataframe
len(title), len(synopsis), len(movie_links)