# <h1 style="text-align:center;">Audible Scraping</h1>

This mini project uses Selenium to scrape the search results on audible.com and extract each audiobook's title, author, length, and language.

## Importing libraries

In [8]:
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [10]:
# Scrape every page of Audible's search results and collect, for each product,
# its title, author, length and language. The results end up in `df_books`
# and are also written to 'audible.csv'.

web = "https://www.audible.com/search"
# NOTE: machine-specific location of the chromedriver binary; adjust for your setup.
path = r'C:\Users\Asus\Desktop\webscraping\chromedriver-win64\chromedriver.exe'

options = Options()
# Chrome opens a visible window by default, which is what we want here so the
# process can be watched. The old `options.headless = ...` switch is deprecated
# in Selenium 4; for a headless run use `options.add_argument('--headless=new')`.
# A bigger window can expose more data:
# options.add_argument('window-size=1920x1080')


def _first_text(parent, xpath):
    """Return the text of the first element under `parent` matching `xpath`, or '' if there is none."""
    # find_elements returns an empty list instead of raising, so one product
    # without a given label cannot abort the whole scrape.
    matches = parent.find_elements(By.XPATH, xpath)
    return matches[0].text if matches else ''


book_title = []
book_author = []
book_length = []
book_language = []

# Selenium 4 receives the driver location through a Service object rather than
# as a positional argument.
driver = webdriver.Chrome(service=Service(path), options=options)
try:
    driver.get(web)
    driver.maximize_window()

    # Pagination bar: the second-to-last <li> is read as the number of the
    # last page (the final <li> is presumably the "next" arrow).
    pagination = driver.find_element(By.XPATH, '//ul[contains(@class, "pagingElements")]')
    pages = pagination.find_elements(By.XPATH, 'li')
    last_page = int(pages[-2].text)

    current_page = 1
    while current_page <= last_page:
        # Explicit waits: give the result list up to 5 seconds to appear.
        container = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'adbl-impression-container'))
        )
        products = WebDriverWait(container, 5).until(
            EC.presence_of_all_elements_located((By.XPATH, './/li[contains(@class, "productListItem")]'))
        )

        for product in products:
            book_title.append(_first_text(product, './/h3[contains(@class,"bc-heading")]'))
            book_author.append(_first_text(product, './/li[contains(@class,"authorLabel")]'))
            book_length.append(_first_text(product, './/li[contains(@class,"runtimeLabel")]'))
            book_language.append(_first_text(product, './/li[contains(@class,"languageLabel")]'))

        current_page = current_page + 1
        if current_page > last_page:
            # The last page has been scraped; no need to click "next" again.
            break

        try:
            next_page = driver.find_element(By.XPATH, '//span[contains(@class, "nextButton")]')
            next_page.click()
        except WebDriverException as exc:
            # Without a successful click the same page would be scraped again
            # and produce duplicate rows, so stop and keep what we have.
            print(f'Stopping before page {current_page}: could not open the next page ({exc.__class__.__name__}).')
            break

        try:
            # Wait for the old result list to be detached so the next iteration
            # does not read the page we just left.
            WebDriverWait(driver, 5).until(EC.staleness_of(container))
        except TimeoutException:
            # The page may have been updated in place; the waits at the top of
            # the loop still apply.
            pass
finally:
    # Always close the browser, even when a step above raised.
    driver.quit()

df_books = pd.DataFrame({'title': book_title, 'author': book_author, 'length': book_length, 'language': book_language})
df_books.to_csv('audible.csv', index=False)
print(df_books)

                                      title  \
0                                  Code Red   
1     Magic Strikes (Dramatized Adaptation)   
2                          Creature Feature   
3                                 Elon Musk   
4                                 Salt Kiss   
..                                      ...   
485                       Cutting for Stone   
486                            Pretty Girls   
487                       The Rascor Plains   
488                            The 10X Rule   
489  How to Talk So Little Kids Will Listen   

                                                author  \
0                          By: Vince Flynn, Kyle Mills   
1                                    By: Ilona Andrews   
2    By: Joe Hill, Grady Hendrix, Josh Malerman, Pa...   
3                                  By: Walter Isaacson   
4                                    By: Sierra Simone   
..                                                 ...   
485                          

In [11]:
# Show the scraped results as a rendered table (last expression of the cell).
df_books

Unnamed: 0,title,author,length,language
0,Code Red,"By: Vince Flynn, Kyle Mills",Length: 9 hrs and 26 mins,Language: English
1,Magic Strikes (Dramatized Adaptation),By: Ilona Andrews,Length: 10 hrs and 25 mins,Language: English
2,Creature Feature,"By: Joe Hill, Grady Hendrix, Josh Malerman, Pa...",Length: 7 hrs and 41 mins,Language: English
3,Elon Musk,By: Walter Isaacson,Length: 20 hrs and 27 mins,Language: English
4,Salt Kiss,By: Sierra Simone,Length: 9 hrs and 35 mins,Language: English
...,...,...,...,...
485,Cutting for Stone,By: Abraham Verghese,Length: 23 hrs and 54 mins,Language: English
486,Pretty Girls,By: Karin Slaughter,Length: 20 hrs,Language: English
487,The Rascor Plains,By: Phil Tucker,Length: 24 hrs and 10 mins,Language: English
488,The 10X Rule,By: Grant Cardone,Length: 7 hrs and 24 mins,Language: English


# Basic preprocessing

In [13]:
# The author, language and length values each carry a literal label
# ('By: ', 'Language: ', 'Length: '); map every column to the label to drop.
label_by_column = {
    'author': 'By: ',
    'language': 'Language: ',
    'length': 'Length: ',
}

for column, label in label_by_column.items():
    # regex=False: the label is matched as plain text, not as a pattern.
    df_books[column] = df_books[column].str.replace(label, '', regex=False)

print(df_books)

                                      title  \
0                                  Code Red   
1     Magic Strikes (Dramatized Adaptation)   
2                          Creature Feature   
3                                 Elon Musk   
4                                 Salt Kiss   
..                                      ...   
485                       Cutting for Stone   
486                            Pretty Girls   
487                       The Rascor Plains   
488                            The 10X Rule   
489  How to Talk So Little Kids Will Listen   

                                                author              length  \
0                              Vince Flynn, Kyle Mills   9 hrs and 26 mins   
1                                        Ilona Andrews  10 hrs and 25 mins   
2    Joe Hill, Grady Hendrix, Josh Malerman, Paul T...   7 hrs and 41 mins   
3                                      Walter Isaacson  20 hrs and 27 mins   
4                                        Sier

In [14]:
# Show the frame as a rendered table (last expression of the cell).
df_books

Unnamed: 0,title,author,length,language
0,Code Red,"Vince Flynn, Kyle Mills",9 hrs and 26 mins,English
1,Magic Strikes (Dramatized Adaptation),Ilona Andrews,10 hrs and 25 mins,English
2,Creature Feature,"Joe Hill, Grady Hendrix, Josh Malerman, Paul T...",7 hrs and 41 mins,English
3,Elon Musk,Walter Isaacson,20 hrs and 27 mins,English
4,Salt Kiss,Sierra Simone,9 hrs and 35 mins,English
...,...,...,...,...
485,Cutting for Stone,Abraham Verghese,23 hrs and 54 mins,English
486,Pretty Girls,Karin Slaughter,20 hrs,English
487,The Rascor Plains,Phil Tucker,24 hrs and 10 mins,English
488,The 10X Rule,Grant Cardone,7 hrs and 24 mins,English


Further processing can be applied to this data, for example converting the `length` strings into a numeric duration.