The goal of this project is to predict genre of books using summaries. 
We would then like to be able to get snippets of general summaries from a user and recommend them some books. 

In [1]:
from bs4 import BeautifulSoup
import requests
import urllib.parse
import json

First we gather data by scraping barnes and nobles using BeautifulSoup. 
We do the following:
- scrape to get a list of existing categories/genre of books 
- for each genre we will accumulate 40 different books: titles, authors and summaries 

<br> Note: we scrape 40 because barnes and nobles allows 40 per page and we think thats enough, if more is needed then we can always scrape the next page over... but we won't do that

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Config Selenium
chrome_options = Options()
chrome_options.add_argument('--headless')
browser = webdriver.Chrome(options=chrome_options)

# Fetch page
browser.get('https://www.barnesandnoble.com/h/books/browse')
html = browser.page_source

# Parse page
soup = BeautifulSoup(html, 'html.parser')

# Result
categories_list = []
for ultag in soup.html.find_all('ul', {'class': 'entryCategories'}):
    for litag in ultag.find_all('li'):
        categories_list.append((str(litag).split('"')[1], litag.text.strip()))


In [3]:
categories_list[:5]

[('/b/books/activity-game-books/_/N-29Z8q8Z1gj3', 'Activity & Game Books'),
 ('/b/books/antiques-collectibles/_/N-29Z8q8Zs45', 'Antiques & Collectibles'),
 ('/b/books/art-architecture-photography/_/N-29Z8q8Zs9i',
  'Art, Architecture & Photography'),
 ('/b/books/awards/_/N-29Z8q8Z1d6q', 'Awards'),
 ('/b/books/bibles-christianity/_/N-29Z8q8Zsj2', 'Bibles & Christianity')]

The following function will make a list of tuple (best seller url, genre)

In [19]:
def make_best_seller(list):
    
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)

    best_seller_url = []
    for link in list:
        url = 'https://www.barnesandnoble.com' + link[0]
        browser.get(url)
        html = browser.page_source

        soup = BeautifulSoup(html, 'html.parser')

        
        heading_list = soup.html.find_all('h2')
        for heading in heading_list:
            if heading.text == 'Bestsellers':
                best_seller_url.append((str(heading.find_next_siblings('a')).split('="')[2].split(';')[0], link[1]))
    return best_seller_url
 


In [20]:
best_seller_url = make_best_seller(categories_list)

In [21]:
best_seller_url

[('/b/books/activity-game-books/_/N-1fZ29Z8q8Z1gj3', 'Activity & Game Books'),
 ('/b/books/antiques-collectibles/_/N-1fZ29Z8q8Zs45?Ns=P_Sales_Rank" onclick',
  'Antiques & Collectibles'),
 ('/b/books/art-architecture-photography/_/N-1fZ29Z8q8Zs9i?Ns=P_Sales_Rank" onclick',
  'Art, Architecture & Photography'),
 ('/b/books/bibles-christianity/_/N-1fZ29Z8q8Zsj2?Ns=P_Sales_Rank" onclick',
  'Bibles & Christianity'),
 ('/b/books/biography/_/N-1fZ29Z8q8Zsoc?Ns=P_Sales_Rank" onclick',
  'Biography'),
 ('/b/books/business/_/N-1fZ29Z8q8Zt82?Ns=P_Sales_Rank" onclick', 'Business'),
 ('/b/books/computers/_/N-1fZ29Z8q8Zug4?Ns=P_Sales_Rank" onclick',
  'Computers'),
 ('/b/books/cookbooks-food-wine/_/N-1fZ29Z8q8Zy3b?Ns=P_Sales_Rank" onclick',
  'Cookbooks, Food & Wine'),
 ('/b/books/crafts-hobbies/_/N-1fZ29Z8q8Z1f43?Ns=P_Sales_Rank" onclick',
  'Crafts & Hobbies'),
 ('/b/books/current-affairs-politics/_/N-1fZ29Z8q8Z16st?Ns=P_Sales_Rank" onclick',
  'Current Affairs & Politics'),
 ('/b/books/diet-hea

We're going to make two dictionaries:
- one where keys are genres and values are urls of 40 books for that genre. 
- the other where keys are genres and values will be the triple (titles, author, overview/summary of the book of that genre) 


In [22]:
temp = []
for category in best_seller_url:
    temp.append(category[1])

temp = set(temp)
dictionary_bestseller_url = dict.fromkeys(temp, [])
dictionary_bestseller = dict.fromkeys(temp, [])
    

In [23]:
dictionary_bestseller_url

{'Travel': [],
 'Philosophy': [],
 'Romance': [],
 'Study Aids & Test Prep': [],
 'Literature': [],
 'Mystery & Crime': [],
 'New Age & Alternative Beliefs': [],
 'Home & Garden': [],
 'Art, Architecture & Photography': [],
 'Crafts & Hobbies': [],
 'Psychology': [],
 'Diet, Health & Fitness': [],
 'Education': [],
 'Fiction': [],
 'Humor': [],
 'Sports': [],
 'Bibles & Christianity': [],
 'Business': [],
 'Pets': [],
 'History': [],
 'Activity & Game Books': [],
 'Religion': [],
 'Music, Film & Performing Arts': [],
 'Science Fiction & Fantasy': [],
 'Graphic Novels & Comics': [],
 'Antiques & Collectibles': [],
 'Self-Help & Relationships': [],
 'Current Affairs & Politics': [],
 'Cookbooks, Food & Wine': [],
 'Computers': [],
 'Nature': [],
 'Poetry': [],
 'Law': [],
 'Reference': [],
 'Medicine & Nursing': [],
 'Social Sciences': [],
 'Parenting & Family': [],
 'Science & Technology': [],
 'Biography': []}

The following function will add to our bestseller url dictionary the url of all the books in a specific genre

In [24]:
def get_data(list,dict):
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)
    
    for link in list:
        url = 'https://www.barnesandnoble.com' + link[0] + '?Ns=P_Sales_Rank&Nrpp=40'
        browser.get(url)
        html = browser.page_source

        soup = BeautifulSoup(html, 'html.parser')
        
        temp_list = []
        for oltag in soup.html.find_all('ol', attrs = {'class' : 'product-shelf-list'}):
            for divs in oltag.find_all('div', attrs = {'class' : 'product-shelf-title'}): 
                    temp_list.append(divs.find('a'))
        
        for item in temp_list:
            dict[link[1]].append(str(item).split('"')[3])            
        

In [25]:
get_data(best_seller_url, dictionary_bestseller_url)

In [30]:
dictionary_bestseller_url

{'Travel': ['/w/dungeons-dragons-players-handbook-wizards-rpg-team/1110379209;jsessionid=F8224542336EAD3E6CCD7F66F64E2393.prodny_store01-atgap18?ean=9780786965601',
  '/w/tashas-cauldron-of-everything-wizards-rpg-team/1137572070;jsessionid=F8224542336EAD3E6CCD7F66F64E2393.prodny_store01-atgap18?ean=9780786967025',
  '/w/card-night-will-roya/1138007352;jsessionid=F8224542336EAD3E6CCD7F66F64E2393.prodny_store01-atgap18?ean=9780762473519',
  '/w/queens-gambit-walter-s-tevis/1102811989;jsessionid=F8224542336EAD3E6CCD7F66F64E2393.prodny_store01-atgap18?ean=9780593314654',
  '/w/the-answer-is-alex-trebek/1136854888;jsessionid=F8224542336EAD3E6CCD7F66F64E2393.prodny_store01-atgap18?ean=9781982157999',
  '/w/guinness-world-records-2021-guinness-world-records/1136012500;jsessionid=F8224542336EAD3E6CCD7F66F64E2393.prodny_store01-atgap18?ean=9781913484002',
  '/w/van-richtens-guide-to-ravenloft-wizards-rpg-team/1138879859;jsessionid=F8224542336EAD3E6CCD7F66F64E2393.prodny_store01-atgap18?ean=9780

In [12]:
def get_real_data(url_list, dict1, dict2):
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_options)
    
    for genre in url_list:
        for link in dict1[genre[1]]:
            url = 'https://www.barnesandnoble.com' + str(link) 
            browser.get(url)
            html = browser.page_source

            soup = BeautifulSoup(html, 'html.parser')
            
            title = soup.find('h1')
            title = str(title).split('>')[1].split('<')[0]
            author = soup.find('span', attrs = {'id' : 'key-contributors'})
            author = str(author).split('>')[2].split('<')[0]
            overview = soup.find('div', attrs = {'class': 'overview-cntnt'})
            overview = overview.text.strip()
            dict2[genre[1]].append((title, author, overview))
            
    

In [13]:
get_real_data(best_seller_url, dictionary_bestseller_url, dictionary_bestseller)

KeyboardInterrupt: 