The goal of this project is to predict genre of books using summaries. 
<br>We would then like to be able to get snippets of general summaries from a user and recommend them some books. 

In this notebook we will just scrape and gather our data

In [1]:
from bs4 import BeautifulSoup
import requests
import urllib.parse
import json

First we gather data by scraping barnes and nobles using BeautifulSoup. 
We do the following:
- scrape to get a list of existing categories/genre of books 
- for each genre we will accumulate 40 different books: titles, authors and summaries(if it exists)
<br>if a summary does not exist, if we are missing just a few then we should be able to mannually replace by making some scrape functions. Otherwise :(


Note: be aware that we only take 40 because barnes and nobles website is terrible and will max out at 40 books per page... we can always go on to the next page but we think 40 per genre is enough. 

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Config Selenium
chrome_options = Options()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(options=chrome_options)

# Fetch page
browser.get('https://www.barnesandnoble.com/h/books/browse')
html = browser.page_source

# Parse page
soup = BeautifulSoup(html, 'html.parser')

# Result
categories_list = []
for ultag in soup.html.find_all('ul', {'class': 'entryCategories'}):
    for litag in ultag.find_all('li'):
        categories_list.append((str(litag).split('"')[1], litag.text.strip()))


In [3]:
categories_list[:5]

[('/b/books/activity-game-books/_/N-29Z8q8Z1gj3', 'Activity & Game Books'),
 ('/b/books/antiques-collectibles/_/N-29Z8q8Zs45', 'Antiques & Collectibles'),
 ('/b/books/art-architecture-photography/_/N-29Z8q8Zs9i',
  'Art, Architecture & Photography'),
 ('/b/books/awards/_/N-29Z8q8Z1d6q', 'Awards'),
 ('/b/books/bibles-christianity/_/N-29Z8q8Zsj2', 'Bibles & Christianity')]

The following function will make a list of tuple (best seller url, genre)

In [4]:
def make_best_seller(list):
    
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)

    best_seller_url = []
    for link in list:
        url = 'https://www.barnesandnoble.com' + link[0]
        browser.get(url)
        html = browser.page_source

        soup = BeautifulSoup(html, 'html.parser')

        
        heading_list = soup.html.find_all('h2')
        for heading in heading_list:
            if heading.text == 'Bestsellers':
                best_seller_url.append((str(heading.find_next_siblings('a')).split('="')[2].split(';')[0].split('?')[0], link[1]))
    return best_seller_url
 


In [5]:
best_seller_url = make_best_seller(categories_list)

In [6]:
best_seller_url

[('/b/books/activity-game-books/_/N-1fZ29Z8q8Z1gj3', 'Activity & Game Books'),
 ('/b/books/antiques-collectibles/_/N-1fZ29Z8q8Zs45',
  'Antiques & Collectibles'),
 ('/b/books/art-architecture-photography/_/N-1fZ29Z8q8Zs9i',
  'Art, Architecture & Photography'),
 ('/b/books/bibles-christianity/_/N-1fZ29Z8q8Zsj2', 'Bibles & Christianity'),
 ('/b/books/biography/_/N-1fZ29Z8q8Zsoc', 'Biography'),
 ('/b/books/business/_/N-1fZ29Z8q8Zt82', 'Business'),
 ('/b/books/computers/_/N-1fZ29Z8q8Zug4', 'Computers'),
 ('/b/books/cookbooks-food-wine/_/N-1fZ29Z8q8Zy3b', 'Cookbooks, Food & Wine'),
 ('/b/books/crafts-hobbies/_/N-1fZ29Z8q8Z1f43', 'Crafts & Hobbies'),
 ('/b/books/current-affairs-politics/_/N-1fZ29Z8q8Z16st',
  'Current Affairs & Politics'),
 ('/b/books/diet-health-fitness/_/N-1fZ29Z8q8Z11ip', 'Diet, Health & Fitness'),
 ('/b/books/education/_/N-1fZ29Z8q8Zzmg', 'Education'),
 ('/b/books/fiction/_/N-1fZ29Z8q8Z10h8', 'Fiction'),
 ('/b/books/graphic-novels-comics/_/N-1fZ29Z8q8Zucb',
  'Graphic N

We're going to make a list to fill with urls for each genre.


In [7]:
bestseller_url_list = []
for category in best_seller_url:
    bestseller_url_list.append([category[1]])
    

The following function will add to our bestseller url dictionary the url of all the books in a specific genre

In [8]:
def get_data(list):
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)
    
    temp_list = []
    for link in list:
        url = 'https://www.barnesandnoble.com' + link[0] + '?Ns=P_Sales_Rank&Nrpp=40'
        browser.get(url)
        html = browser.page_source

        soup = BeautifulSoup(html, 'html.parser')
           
        for oltag in soup.html.find_all('ol', attrs = {'class' : 'product-shelf-list'}):
            for divs in oltag.find_all('div', attrs = {'class' : 'product-shelf-title'}): 
                temp_list.append((divs.find('a'),link[1]))
    
    better_list = []   
    for item in temp_list:
        better_list.append((str(item[0]).split('"')[3], item[1]))        
    return better_list

In [9]:
temp = get_data(best_seller_url)

In [10]:
for i in range(len(bestseller_url_list)):
    for item in temp:
        if item[1] == bestseller_url_list[i][0]:
            bestseller_url_list[i].append(item[0])

In [11]:
def get_real_data(url_list):
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)
    
    data_list = []
    for i,lists in enumerate(url_list):
        for link in lists:
            url = 'https://www.barnesandnoble.com' + link 
            browser.get(url)
            html = browser.page_source

            soup = BeautifulSoup(html, 'html.parser')
            
            #get title
            title = soup.find('h1')
            title = str(title).split('>')[1].split('<')[0]
            
            #get author
            author = soup.find('span', attrs = {'id' : 'key-contributors'})
            author = str(author).split('>')[2].split('<')[0]
            
            #get summary if there is one
            overview = soup.find('div', attrs = {'class': 'overview-cntnt'})
            if overview:
                overview = overview.text.strip()
            else: 
                overview = 'None'
            
            data_list.append((title, author, overview, lists[0]))
            
    return data_list
            
    

In [12]:
data_list = get_real_data(bestseller_url_list)

WebDriverException: Message: unknown error: net::ERR_NAME_NOT_RESOLVED
  (Session info: headless chrome=91.0.4472.164)


In [None]:
import pandas as pd

In [None]:
genre_col = []
title_col = []
author_col = []
summary_col = []
for item in bestseller_list:
    genre_col.append(item[3])
    title_col.append(item[0])
    author_col.append(item[1])
    summary_col.append(item[2])
    
df_list = [genre_col, title_col, author_col, summary_col]
    

In [None]:
df = pd.DataFrame(df_list, columns = ['genre', 'title', 'author', 'summary'])

In [None]:
df = df.drop_duplicates()

In [None]:
df.head()

In [None]:
df.to_csv('book_dataframe.csv')