## Site - https://redditfavorites.com/books

In [150]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import altair as alt

In [151]:
def extractor(page_number, timeout=30):
    """Scrape one listing page of redditfavorites.com/books.

    Downloads ``https://redditfavorites.com/books?page=<page_number>``,
    collects the text of the title, author and purple-highlighted tags,
    and appends ONE list per field to the module-level accumulators
    ``titles_x``, ``popularity_scores_x``, ``authors_x``, ``comments_x``
    and ``average_upvotes_x``. Those lists must exist before the call.

    Args:
        page_number: Page index to request; converted with ``str()``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block forever on a stalled
            connection.

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or an HTTP error status (4xx/5xx).
    """
    url = 'https://redditfavorites.com/books?page=' + str(page_number)
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into empty/garbage lists.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Text of every <h2 class="title">. Each book is taken to occupy four
    # consecutive entries: offset 0 is used as the title and offset 1 as
    # the popularity score (presumably mirrors the page layout; verify
    # against the live markup if the site changes).
    title_content = [tag.text for tag in soup.find_all('h2', {'class': 'title'})]
    titles = title_content[::4]
    popularity_scores = title_content[1::4]

    # One <div class="subtitle"> text per entry, used as the author.
    authors = [tag.text for tag in soup.find_all('div', {'class': 'subtitle'})]

    # Text of every <span class="color-purple">. The very first span is
    # discarded; the remainder is read in groups of three, with offset 1
    # used as the comment figure and offset 2 as the average upvotes
    # (offset 0 of each group is not used).
    purple_text = [tag.text for tag in soup.find_all('span', {'class': 'color-purple'})]
    per_book = purple_text[1:]
    comments = per_book[1::3]
    average_upvotes = per_book[2::3]

    # Append this page's lists to the master lists (one sub-list per page).
    titles_x.append(titles)
    popularity_scores_x.append(popularity_scores)
    authors_x.append(authors)
    comments_x.append(comments)
    average_upvotes_x.append(average_upvotes)

In [152]:
# Master accumulators: extractor() appends one list per scraped page to each.
titles_x, popularity_scores_x, authors_x = [], [], []
comments_x, average_upvotes_x = [], []

# Scrape listing pages 1 and 2.
for page in range(1, 3):
    extractor(page)

In [153]:
# Collapse every list-of-page-lists into a single flat list, keeping the
# page order and the within-page order.
flat_titles, flat_popularity_scores, flat_authors = [], [], []
flat_comments, flat_average_upvotes = [], []

for flat, per_page in (
    (flat_titles, titles_x),
    (flat_popularity_scores, popularity_scores_x),
    (flat_authors, authors_x),
    (flat_comments, comments_x),
    (flat_average_upvotes, average_upvotes_x),
):
    for page_items in per_page:
        flat.extend(page_items)

In [154]:
# Assemble the scraped fields into one DataFrame, one row per book.
# All five flat lists must have the same length, otherwise pandas raises.
columns = ['title', 'popularity_score', 'author', 'comments_recieved', 'average_upvotes']
books = pd.DataFrame(
    {
        'title': flat_titles,
        'popularity_score': flat_popularity_scores,
        'author': flat_authors,
        'comments_recieved': flat_comments,
        'average_upvotes': flat_average_upvotes,
    },
    columns=columns,
)

In [155]:
# Clean columns.
# title: drop the first and last character, then lower-case.
books['title'] = books['title'].str[1:-1].str.lower()
# popularity_score: keep only the text after the first 19 characters, minus
# the final character (presumably strips a fixed label and a trailing
# newline -- verify against the scraped text). The result is still a string.
books['popularity_score'] = books['popularity_score'].str[19:-1]
# author: lower-case only.
books['author'] = books['author'].str.lower()
# average_upvotes: parse the scraped text as a float.
books['average_upvotes'] = books['average_upvotes'].astype(float)

In [156]:
# Book category extraction.
# Category identifiers passed as the `category_id` query parameter.
categories = [
    'business',
    'design',
    'drawing',
    'economics',
    'investing',
    'meditation',
    'people',
    'personal_finance',
    'philosophy',
    'programming',
    'self_improvement',
    'writing',
]

# Filled by book_category_extractor() with [title, category] pairs.
book_category = list()

def book_category_extractor(category, timeout=30):
    """Scrape the book titles listed under one category.

    Downloads ``https://redditfavorites.com/books?category_id=<category>``
    and appends a ``[title, category]`` pair to the module-level list
    ``book_category`` for every title found. Titles are normalised by
    dropping their first and last character and lower-casing the rest.

    Args:
        category: Category identifier; converted with ``str()`` for the URL
            and stored unchanged in each appended pair.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block forever on a stalled
            connection.

    Raises:
        requests.exceptions.RequestException: On connection failure,
            timeout, or an HTTP error status (4xx/5xx).
    """
    url = 'https://redditfavorites.com/books?category_id=' + str(category)
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently recording nothing for an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Text of every <h2 class="title">; every fourth entry starting at 0
    # is used as a book title (presumably mirrors the page layout).
    title_content = [tag.text for tag in soup.find_all('h2', {'class': 'title'})]
    book_category.extend(
        [title[1:-1].lower(), category] for title in title_content[::4]
    )
    
# Scrape every category; each call appends [title, category] pairs
# to book_category.
for category_name in categories:
    book_category_extractor(category_name)

# Split the collected pairs into two parallel lists.
book = [pair[0] for pair in book_category]
category = [pair[1] for pair in book_category]

# Title -> category lookup table used for the merge with `books`.
columns = ['book', 'category']
df_category = pd.DataFrame({'book': book, 'category': category}, columns=columns)

In [157]:
# Attach a category to each book by title. The left join keeps every row of
# `books`; titles with no category match get NaN in `book` and `category`.
book_with_category = pd.merge(
    books,
    df_category,
    how='left',
    left_on='title',
    right_on='book',
)

In [159]:
# The `book` join key duplicates `title`, so it is removed after the merge.
book_with_category = book_with_category.drop(columns=['book'])

In [176]:
# Horizontal bars for the first 20 rows of book_with_category.
# NOTE(review): the rows are not sorted here; this assumes the scraped
# order is already by popularity -- confirm.
top_20 = book_with_category.head(20)
alt.Chart(top_20).mark_bar().encode(
    x=alt.X('popularity_score:Q'),
    y=alt.Y('title'),
    tooltip=alt.Tooltip('author'),
).properties(title='Top 20 popular books on Reddit', width=600)

In [180]:
# Count the rows per category and plot the counts as horizontal bars.
# Both columns are named explicitly so the chart does not depend on the
# labels generated by value_counts().reset_index(), which changed in
# pandas 2.0 ('index' + series name before, series name + 'count' after).
category_counts = (
    book_with_category['category']
    .value_counts()
    .rename_axis('category')
    .reset_index(name='count')
)
alt.Chart(category_counts, title='Popular reddit books by category').mark_bar().encode(
    x='count:Q',
    y='category:N',
).properties(width=600)

In [175]:
# Scatter of popularity score against average upvotes, both on log-10 axes,
# coloured by category; hovering a point shows the book title.
alt.Chart(book_with_category).mark_circle(size=60).encode(
    x=alt.X('popularity_score:Q', scale=alt.Scale(type='log', base=10)),
    y=alt.Y('average_upvotes:Q', scale=alt.Scale(type='log', base=10)),
    color=alt.Color('category'),
    tooltip=['title'],
).properties(title='Popular books by category', width=800).interactive()

In [174]:
# Same log-log scatter, restricted to rows whose category is 'programming'.
programming_books = book_with_category[book_with_category['category'] == 'programming']
alt.Chart(programming_books).mark_circle(size=60).encode(
    x=alt.X('popularity_score:Q', scale=alt.Scale(type='log', base=10)),
    y=alt.Y('average_upvotes:Q', scale=alt.Scale(type='log', base=10)),
    tooltip=['title'],
).properties(title='Popular programming books', width=800).interactive()