# TP: Scraping BeautifulSoup & Selenium

## Imports

In [14]:
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Constants

In [15]:
# Root of the Babelio site; prepended to site-relative links scraped from pages.
URL_BOOK_BASE = 'https://www.babelio.com'
# Path template of the popular-books listing, with a {page_number} placeholder.
# NOTE(review): not referenced in the visible code - get_page_link() spells out
# the same URL itself.
URL_BOOK_LISTE = '/livrespopulaires_debut.php?p={page_number}'
# Popular-authors page; passed to extract_all() in the last cell.
URL_AUTHORS = 'https://www.babelio.com/auteurspopulaires.php'
# Wikipedia API endpoint queried by get_author_info_from_wikipedia().
URL_API_BASE = "https://en.wikipedia.org/w/api.php"
# NOTE(review): not referenced anywhere in the visible code.
test_mode = True

## Exercice n°1 : BeautifulSoup Introduction

In [18]:
def get_html_from_link(page_link, timeout=30):
    '''
        Download a web page and parse its HTML.

        :param page_link: URL of the web page to scrape
        :type page_link: string
        :param timeout: seconds to wait for the server before giving up
            (forwarded to requests.get)
        :type timeout: float or tuple(float, float)
        :return: BeautifulSoup object (HTML parsed)
        :rtype: bs4.BeautifulSoup
        :raises requests.exceptions.RequestException: on connection failure,
            timeout, or an HTTP error status (4xx/5xx)
    '''
    # requests has no default timeout: without one, an unreachable host can
    # block the notebook for a very long time.
    page = requests.get(page_link, timeout=timeout)

    # Fail here with a clear HTTP error instead of parsing an error page and
    # crashing later on a missing element.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')

    return soup

In [19]:
# Smoke test: download and parse the site's home page.
# NOTE(review): the recorded run of this cell ended in a ConnectTimeout.
book_info = get_html_from_link(URL_BOOK_BASE)

ConnectTimeout: HTTPSConnectionPool(host='www.babelio.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x11f469d20>, 'Connection to www.babelio.com timed out. (connect timeout=None)'))

In [None]:
def extract_book_info(book_html):
    '''
        Download a book page and pull out its basic details.

        :param book_html: URL of the book page (despite the name, this is a
            link that gets downloaded here, not already-parsed HTML)
        :type book_html: string
        :return:
            - book_links: the URL that was passed in, unchanged
            - book_title: text of the link inside the first <h1>, with tabs
              and newlines removed
            - book_image_link: ``src`` of the first <img> inside the
              ``livre_con`` div, exactly as written in the page
        :rtype: tuple(string, string, string)
    '''
    page = get_html_from_link(book_html)

    # Title first, cover second: same lookup order as before.
    raw_title = page.find('h1').find('a').text
    cover = page.find('div', class_='livre_con').find('img')

    cleaned_title = raw_title.replace('\t', '').replace('\n', '')

    return book_html, cleaned_title, cover['src']

In [10]:
# Sample book page used to try out the extractor functions.
book_html = 'https://www.babelio.com/livres/Skenandore-Pour-lhonneur-de-tous-les-miens/1455974'
book_links, book_title, book_image_link = extract_book_info(book_html)

In [11]:
print('book links -', book_links)
print('book title -', book_title)
# The scraped src can be site-relative (/couv/...) or already absolute (covers
# hosted on another domain). urljoin handles both; plain concatenation would
# corrupt an absolute URL.
print('book image link - ', urljoin(URL_BOOK_BASE, book_image_link))

book links - https://www.babelio.com/livres/Skenandore-Pour-lhonneur-de-tous-les-miens/1455974
book title - Pour l'honneur de tous les miens
book image link -  https://www.babelio.com/couv/CVT_Pour-lhonneur-de-tous-les-miens-Un-roman-dechira_3436.jpg


In [12]:
def extract_author_info(author_html):
    '''
        Download a book page and pull out the author's link and name.

        :param author_html: URL of the book page (despite the name, this is a
            link that gets downloaded here, not already-parsed HTML)
        :type author_html: string
        :return:
            - author_links: ``href`` of the first link inside the
              ``itemprop="author"`` span, as written in the page
            - author_name: text of the first ``itemprop="name"`` span
        :rtype: tuple(string, string)
    '''
    page = get_html_from_link(author_html)

    # Both values live inside the same details container; look it up once.
    details = page.find('div', class_='livre_con')

    author_anchor = details.find('span', itemprop='author').find('a')
    author_links = author_anchor['href']
    author_name = details.find('span', itemprop='name').text

    return author_links, author_name

In [13]:
# The author details are read from the book page, so the book URL is reused.
author_html = book_links
author_links, author_name = extract_author_info(author_html)

In [14]:
# The base URL is prepended because the scraped author link in the recorded
# output is site-relative (/auteur/...).
print('author links -', URL_BOOK_BASE + author_links)
print('author name -', author_name)

author links - https://www.babelio.com/auteur/Amanda-Skenandore/629754
author name - Amanda Skenandore


In [15]:
def extract_rate_from_book_page(book_link):
    '''
        Extract rate from book details page

        :param book_link: link of the book we want to extract rate
        :type book_link: string
        :return: rate: rate of the book, or NaN when the page carries no
            usable rating (tag missing, empty, or not a number)
        :rtype: float

    '''
    book_info = get_html_from_link(book_link)

    container = book_info.find('div', class_='livre_con')
    rate_tag = None
    if container is not None:
        rate_tag = container.find('span', itemprop='ratingValue')

    # A missing tag used to raise AttributeError on `.text`; report "no rate".
    if rate_tag is None:
        return np.nan

    # Convert to a number so the 'rate' column is numeric rather than a mix of
    # strings and ints. A decimal comma is tolerated defensively.
    rate_text = rate_tag.text.strip().replace(',', '.')
    try:
        return float(rate_text)
    except ValueError:
        # Empty or non-numeric text counts as "no rate".
        return np.nan

In [16]:
# Try the rating extractor on the sample book page.
rate = extract_rate_from_book_page(book_links)

In [17]:
# Show the extracted rating.
print('rate =', rate)

rate = 4.57


In [18]:
def how_many_books_on_the_page(page_link):
    '''
        Count the <div> elements inside the ``list_livre_con`` container of a
        listing page; the caller uses this as the number of books.

        NOTE(review): find_all() searches all descendants, so nested <div>s are
        counted too - the result equals the number of books only if each book
        entry is a single <div>. Confirm against the page markup.

        :param page_link: URL of the listing page
        :type page_link: string
        :return: number of <div> elements found in the container
        :rtype: int
    '''
    listing = get_html_from_link(page_link)
    container = listing.find('div', class_='list_livre_con')
    return len(container.find_all('div'))

# page_link = 'https://www.babelio.com/livrespopulaires_debut.php'
# how_many_books_on_the_page(page_link)

In [19]:
def get_info_from_page(page_link):
    '''
        Get info from a Babelio page that contains a list of books.

        The listing page is downloaded once; every book page is then visited
        through extract_book_info, extract_author_info and
        extract_rate_from_book_page.

        :param page_link: link of the webpage we want to scrap
        :type page_link: string
        :return info_list: one entry per book, each a list
            [book_link, book_title, book_image_link, author_links,
            author_name, rate]
        :rtype: list
    '''
    all_data = get_html_from_link(page_link)

    # Look up the container and its anchors once instead of re-searching the
    # whole document on every loop iteration.
    book_list = all_data.find('div', class_='list_livre_con')
    anchors = book_list.find_all('a')

    # Same count as how_many_books_on_the_page(), but taken from the document
    # already downloaded: no second request, and the count always matches the
    # anchors indexed below.
    number_of_books = len(book_list.find_all('div'))

    info_list = list()

    # Every second anchor is used as a book link.
    # NOTE(review): this assumes each book contributes exactly two <a> tags to
    # the container - confirm against the page markup.
    for anchor_index in range(0, number_of_books * 2, 2):
        # Listing links are site-relative; build the full book URL.
        book_page_link = URL_BOOK_BASE + anchors[anchor_index]['href']

        book_link, book_title, book_image_link = extract_book_info(book_page_link)
        author_links, author_name = extract_author_info(book_page_link)
        rate = extract_rate_from_book_page(book_page_link)

        info_list.append(
            [book_link, book_title, book_image_link, author_links, author_name, rate]
        )

    return info_list

In [20]:
# all books
# Listing page of popular books, without a page number.
page_link = 'https://www.babelio.com/livrespopulaires_debut.php'

In [21]:
# info_list = get_info_from_page(page_link)
# info_list

In [22]:
def auth_wiki_info(author_name):
    '''
        Placeholder for the Wikipedia lookup of an author (not implemented).

        :param author_name: name of the author; currently ignored
        :type author_name: string
        :return: the fixed marker string 'todo: auth_wiki_info'
        :rtype: string
    '''
    return 'todo: auth_wiki_info'

In [23]:
# Check the placeholder: it returns its marker string whatever the input.
auth_wiki_info('test')

'todo: auth_wiki_info'

In [24]:
def get_page_link(page_number):
    '''
        Build the URL of one page of the popular-books listing.

        :param page_number: number of the listing page
        :type page_number: int or string
        :return: full URL of that listing page
        :rtype: string
    '''
    # The f-string already inserts page_number. The extra .format() call that
    # used to follow it had nothing left to substitute and raised when the
    # value itself contained braces.
    page_link = f'https://www.babelio.com/livrespopulaires_debut.php?p={page_number}'
    return page_link

In [25]:
# NOTE(review): same assignment as in an earlier cell; page_link is rebound to
# the identical value.
page_link = 'https://www.babelio.com/livrespopulaires_debut.php'

In [26]:
# Shell escape: list the contents of the current working directory.
!ls

sample_data


In [27]:
# Build the output file name from a base name and an extension.
file_name = 'books_info'
file_format = '.csv'
# file_name is rebound here to include the extension.
file_name = file_name + file_format
# Bare last expression so the notebook displays the resulting name.
file_name

'books_info.csv'

In [31]:
def get_all_data_from_all_pages(first_page=1, last_page=3):
    '''
        Scrape a range of listing pages and gather every book entry.

        Called without arguments it scrapes pages 1 to 3, as before.

        :param first_page: number of the first listing page to scrape
        :type first_page: int
        :param last_page: number of the last listing page to scrape (inclusive);
            a value below first_page yields an empty result
        :type last_page: int
        :return: book entries of all scraped pages, in page order
        :rtype: list
    '''
    all_page_data = list()

    for page in range(first_page, last_page + 1):
        page_link = get_page_link(page)
        page_data = get_info_from_page(page_link)

        all_page_data.extend(page_data)

        # Progress trace: one line per scraped page.
        number_of_books_on_the_page = len(page_data)
        print('Page ', page, ', books: ', number_of_books_on_the_page)

    return all_page_data

In [32]:
def collect_all_information_and_save(file_name):
    '''
        "Main function": scrape the Babelio listing pages and save the result.

        Steps:
            - collect the book entries through get_all_data_from_all_pages()
            - store them in a pandas dataframe with the columns
              links, title, image_link, author_link, author, rate
            - display the first rows and save the dataframe in a CSV file

        Author information from the Wikipedia API is not added here yet.

        :param file_name: name of the csv file
        :type file_name: string

    '''
    all_page_data = get_all_data_from_all_pages()

    book_info_column = ['links', 'title', 'image_link', 'author_link', 'author', 'rate']
    df = pd.DataFrame(all_page_data, columns=book_info_column)
    display(df.head())

    # index=False: the positional index carries no information, and writing it
    # makes pd.read_csv() return a spurious 'Unnamed: 0' column.
    df.to_csv(file_name, index=False)
    print('Data saved into ', file_name, ' file.')

In [33]:
# Run the scrape and write the CSV (one request per listing page plus several
# per book).
collect_all_information_and_save(file_name)

Page  1 , books:  40
Page  2 , books:  40
Page  3 , books:  104


Unnamed: 0,links,title,image_link,author_link,author,rate
0,https://www.babelio.com/livres/Skenandore-Pour...,Pour l'honneur de tous les miens,/couv/CVT_Pour-lhonneur-de-tous-les-miens-Un-r...,/auteur/Amanda-Skenandore/629754,Amanda Skenandore,4.57
1,https://www.babelio.com/livres/Kristoff-Nevern...,"Nevernight, tome 1 : N'oublie jamais",https://m.media-amazon.com/images/I/519RgNc1rf...,/auteur/Jay-Kristoff/325758,Jay Kristoff,4.42
2,https://www.babelio.com/livres/Cotroneo-La-Mai...,La Maison de verre,/couv/CVT_La-maison-de-verre_9967.jpg,/auteur/Roberto-Cotroneo/190637,Roberto Cotroneo,3.5
3,https://www.babelio.com/livres/Boyer-Au-coeur-...,Au coeur d'une nuit de Noël,https://m.media-amazon.com/images/I/51UJ1DD+BI...,/auteur/Erika-Boyer/412223,Erika Boyer,3.4
4,https://www.babelio.com/livres/Lajoinie-Le-ser...,Le serment des traqueurs,/couv/CVT_Le-serment-des-traqueurs_455.jpg,/auteur/Laetitia-Lajoinie/604238,Laetitia Lajoinie,0.0


Data saved into  books_info.csv  file.


In [146]:
# df = pd.DataFrame(all_page_data, columns = book_info_column)
# display(df.head())
# df.to_csv(file_name)
# print('Data saved into ', file_name, ' file.')

Unnamed: 0,links,title,image_link,author_link,author,rate
0,https://www.babelio.com/livres/Skenandore-Pour...,Pour l'honneur de tous les miens,/couv/CVT_Pour-lhonneur-de-tous-les-miens-Un-r...,/auteur/Amanda-Skenandore/629754,Amanda Skenandore,4.57
1,https://www.babelio.com/livres/Kristoff-Nevern...,"Nevernight, tome 1 : N'oublie jamais",https://m.media-amazon.com/images/I/519RgNc1rf...,/auteur/Jay-Kristoff/325758,Jay Kristoff,4.42
2,https://www.babelio.com/livres/Despentes-Cher-...,Cher connard,/couv/CVT_Cher-connard_325.jpg,/auteur/Virginie-Despentes/3412,Virginie Despentes,3.37
3,https://www.babelio.com/livres/Musso-Angelique...,Angélique,https://m.media-amazon.com/images/I/51ZbEn-loH...,/auteur/Guillaume-Musso/3529,Guillaume Musso,3.75


Data saved into  books_info.csv  file.


In [None]:
# pages = range(1,2)
# pages = range(1,3)
# pages = range(1,4)

In [125]:
# book_info_column = ['links', 'title', 'image_link', 'author_link', 'author', 'rate']
# df_all_page_data = pd.DataFrame(columns = book_info_column)  
# df_all_page_data

Unnamed: 0,links,title,image_link,author_link,author,rate


In [None]:
# Option 1 - collect every row in a list, then build the DataFrame once at the end
# Option 2 - add the rows to the DataFrame inside the loop, page by page

In [132]:
# all_page_data = list()
# for page in pages:
#     page_link = get_page_link(page)
#     page_data = get_info_from_page(page_link)

#     all_page_data.extend(page_data)
    
#     number_of_books_on_the_page = len(page_data)
#     print('Page ', page, ', books: ', number_of_books_on_the_page)
# #         todo: auth_info = auth_wiki_info(author_name)

# df_all_page_data = pd.DataFrame(all_page_data, columns = book_info_column)  
# display('df_all_page_data',df_all_page_data)


<class 'list'>
Page  1 , books:  2
<class 'list'>
Page  2 , books:  2


'df_all_page_data'

Unnamed: 0,links,title,image_link,author_link,author,rate
0,https://www.babelio.com/livres/Skenandore-Pour...,Pour l'honneur de tous les miens,/couv/CVT_Pour-lhonneur-de-tous-les-miens-Un-r...,/auteur/Amanda-Skenandore/629754,Amanda Skenandore,4.57
1,https://www.babelio.com/livres/Kristoff-Nevern...,"Nevernight, tome 1 : N'oublie jamais",https://m.media-amazon.com/images/I/519RgNc1rf...,/auteur/Jay-Kristoff/325758,Jay Kristoff,4.42
2,https://www.babelio.com/livres/Despentes-Cher-...,Cher connard,/couv/CVT_Cher-connard_325.jpg,/auteur/Virginie-Despentes/3412,Virginie Despentes,3.37
3,https://www.babelio.com/livres/Musso-Angelique...,Angélique,https://m.media-amazon.com/images/I/51ZbEn-loH...,/auteur/Guillaume-Musso/3529,Guillaume Musso,3.75


'all_save'

[['https://www.babelio.com/livres/Skenandore-Pour-lhonneur-de-tous-les-miens/1455974',
  "Pour l'honneur de tous les miens",
  '/couv/CVT_Pour-lhonneur-de-tous-les-miens-Un-roman-dechira_3436.jpg',
  '/auteur/Amanda-Skenandore/629754',
  'Amanda Skenandore',
  '4.57'],
 ['https://www.babelio.com/livres/Kristoff-Nevernight-tome-1--Noublie-jamais/1260554',
  "Nevernight, tome 1 : N'oublie jamais",
  'https://m.media-amazon.com/images/I/519RgNc1rfL._SX195_.jpg',
  '/auteur/Jay-Kristoff/325758',
  'Jay Kristoff',
  '4.42'],
 ['https://www.babelio.com/livres/Despentes-Cher-connard/1423121',
  'Cher connard',
  '/couv/CVT_Cher-connard_325.jpg',
  '/auteur/Virginie-Despentes/3412',
  'Virginie Despentes',
  '3.37'],
 ['https://www.babelio.com/livres/Musso-Angelique/1442449',
  'Angélique',
  'https://m.media-amazon.com/images/I/51ZbEn-loHL._SX195_.jpg',
  '/auteur/Guillaume-Musso/3529',
  'Guillaume Musso',
  '3.75']]

In [None]:
# df_all_page_data

## Exercice n°2: Scraping using BeautifulSoup and aggregate with Wikipedia Api data

In [None]:
def get_author_info_from_wikipedia(author_name, timeout=30):
    '''
        Get the introduction text of an author's page from the Wikipedia API.

        :param author_name: page title to look up
        :type author_name: string
        :param timeout: seconds to wait for the API before giving up
            (forwarded to requests.get)
        :type timeout: float or tuple(float, float)
        :return: the ``extract`` field of the first page in the API response,
            or None when the request fails or the response has no extract
        :rtype: string or None
    '''
    params = {
        'action': "query",
        'titles': author_name,
        'format': "json",
        'prop': 'extracts|categories',
        'explaintext': True,
        'exintro': True

    }

    try:
        req = requests.get(url=URL_API_BASE, params=params, timeout=timeout)
        json_response = req.json()
        pages = json_response['query']['pages']
        # 'pages' is keyed by page id; only the first entry is used.
        first_page_id = list(pages.keys())[0]
        clean_json_response = pages[first_page_id]['extract']
    except (requests.exceptions.RequestException, ValueError, LookupError,
            TypeError, AttributeError):
        # Network failure, invalid JSON, or a response without the expected
        # structure: best effort, report "no info". A bare `except:` would also
        # swallow KeyboardInterrupt, making the cell impossible to interrupt.
        return None

    return clean_json_response

In [None]:
# Two sample authors for the Wikipedia lookup.
# NOTE(review): the false/true flags presumably record whether a Wikipedia
# extract was found for each name - confirm.
test_1 = 'Amanda Skenandore' # false
test_2 = 'Jay Kristoff' # true
author_name = test_2

In [None]:
# Query Wikipedia for the selected author; prints None when the lookup fails.
author_info_from_wikipedia = get_author_info_from_wikipedia(author_name)
print(author_info_from_wikipedia)

In [None]:
# Reload the scraped books from the CSV written by collect_all_information_and_save().
df = pd.read_csv(file_name)  

In [None]:
# Preview the first rows of the reloaded data.
df.head()

In [None]:
def extract_most_read_authors(soup):
    '''
        Stub for the most-read-authors extraction (not implemented yet).

        :param soup: parsed authors page; currently unused
        :type soup: bs4.BeautifulSoup
        :return: a new empty list
        :rtype: list
    '''
    # TODO: Get most read authors by finding html tag & class, add each author in a list and return this list
    return []

In [None]:
# An empty URL makes requests.get fail; use the popular-authors page that the
# constants cell defines for this exercise.
url = URL_AUTHORS
soup = get_html_from_link(url)

In [None]:
# Run the author extraction on the parsed page and show the result.
most_read_authors = extract_most_read_authors(soup)
print(most_read_authors)

In [None]:
def extract_number_of_readers(soup):
    '''
        Stub for the number-of-readers extraction (not implemented yet).

        :param soup: parsed authors page; currently unused
        :type soup: bs4.BeautifulSoup
        :return: list of reader counts; empty until the TODO is done
        :rtype: list
    '''
    # TODO: Get number of readers by html tag & class, add each number of readers in a list and return this list
    # The result list was never created, so the return raised NameError.
    nb_readers = list()

    return nb_readers

In [None]:
# An empty URL makes requests.get fail; use the popular-authors page that the
# constants cell defines for this exercise.
url = URL_AUTHORS
soup = get_html_from_link(url)

In [None]:
# Run the reader-count extraction on the parsed page and show the result.
number_of_readers = extract_number_of_readers(soup)
print(number_of_readers)

In [None]:
def extract_all(url):
    '''
        Scrape the authors page and return authors and reader counts as a table.

        :param url: URL of the page listing the most read authors
        :type url: string
        :return: dataframe with the columns most_read_authors and
            number_of_readers, one row per author
        :rtype: pandas.DataFrame
        :raises ValueError: raised by pandas when the two extracted lists do
            not have the same length
    '''
    # Download the page given by the caller. The previous version ignored
    # `url` and relied on a `soup` variable left over from another cell.
    soup = get_html_from_link(url)

    most_read_authors = extract_most_read_authors(soup)
    number_of_readers = extract_number_of_readers(soup)

    # One column per list. The earlier `None += tuple` accumulation raised
    # TypeError before any dataframe could be built.
    df = pd.DataFrame({
        'most_read_authors': most_read_authors,
        'number_of_readers': number_of_readers,
    })

    return df

In [None]:
# Build the authors table from the popular-authors page.
extract_all(URL_AUTHORS)