# Notebook 1: Get list of albums to draw recommendations from

### Introduction

This notebook scrapes a list of around 1,000 critically acclaimed albums from Metacritic (11 pages of results) to build the list of albums that is fed into the Spotify API in notebook 2. *The site has been updated since I scraped it at the end of May 2020, so this code needs to be adapted before it will work again.*

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time, os
import pickle
import sys

# Raise the interpreter's recursion limit so the pickle.dump calls in the
# (commented-out) caching cells further down can run; presumably the parsed
# BeautifulSoup trees nest deeply enough to exceed the default limit.
# NOTE(review): a limit this high effectively removes Python's guard against
# runaway recursion -- confirm whether a smaller value would be enough.
sys.setrecursionlimit(1000000) #to allow pickling

### Scrape Metacritic

The first 11 pages of results are scraped and then pickled into the `data` folder. The results are ordered by critic rating.

In [None]:
# Build one listing URL per results page; page numbers run from 0 to 10.
list_urls = [
    'https://www.metacritic.com/browse/albums/score/metascore/all/filtered' + '?page=' + str(page_number)
    for page_number in range(11)
]

In [None]:
def get_data(urls, timeout=30):
    """Download every URL in ``urls`` and return the responses in input order.

    Parameters
    ----------
    urls : iterable of str
        Addresses to fetch with an HTTP GET request.
    timeout : float or tuple, optional
        Seconds to wait on the server before a request is abandoned; handed
        straight to ``requests``. Defaults to 30 so that a stalled connection
        cannot hang the notebook forever.

    Returns
    -------
    list
        One response object per URL, in the same order as ``urls``.

    Raises
    ------
    requests.HTTPError
        If a page comes back with a 4xx/5xx status. Failing here keeps error
        pages from being parsed (and cached) as if they were real results.
    """
    # A browser-like User-agent is sent with every request.
    user_agent = {'User-agent': 'Mozilla/5.0'}
    responses = []
    # One Session lets consecutive requests to the same host share a connection.
    with requests.Session() as session:
        for url in urls:
            page = session.get(url, headers=user_agent, timeout=timeout)
            page.raise_for_status()
            responses.append(page)
    return responses

In [None]:
# Download every results page. Despite its name, `soups` holds the raw
# response objects returned by get_data, not parsed BeautifulSoup trees.
soups = get_data(list_urls)

In [None]:
def make_soup(response_list):
    """Parse the body text of each response into a BeautifulSoup tree.

    ``response_list`` is an iterable of objects exposing a ``.text`` attribute.
    The result is a list with one ``BeautifulSoup`` object per input, in the
    same order, each built with the 'html5lib' parser.
    """
    return [BeautifulSoup(page.text, 'html5lib') for page in response_list]

In [None]:
# Parse each downloaded page (the response objects held in `soups`) into a
# BeautifulSoup tree; `soup_source` is a list with one tree per page.
soup_source = make_soup(soups)

In [None]:
# Pickle source data, to avoid needing to scrape the site again.
# with open('data/meta_critic_source.pickle', 'wb') as to_write:
#     pickle.dump(soup_source, to_write)

### Extract Album Information and Save as DataFrame

In [None]:
# extracts album information from Beautiful Soup
def clean_html(soup):
    """Build a DataFrame of album details from one parsed results page.

    Album titles are read from every ``div`` whose class is
    'basic_stat product_title'. Each ``div`` whose class is
    'metascore_w small release positive' supplies a critic rating; the element
    that follows it is searched for ``span`` tags of class 'data', whose texts
    are dealt out in turn to artist name, user rating and release date.

    NOTE(review): titles and scores are gathered by two independent searches
    and paired purely by position, and the three detail fields are assumed to
    appear in a fixed order for every album. A page with an album lacking a
    'positive' score div, or lacking one of the detail spans, would shift the
    rows out of alignment -- confirm against the live markup.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single results page.

    Returns
    -------
    pandas.DataFrame
        Columns 'Album_Title', 'Artist_Name', 'Critic_Rating', 'User_Rating'
        and 'Release_Date', holding the scraped text. At most 100 rows; the
        row count is that of the shortest of the five collected lists.
    """
    album_title = [
        title_tag.text.strip()
        for title_tag in soup.find_all('div', {'class': 'basic_stat product_title'})
    ]

    critic_rating = []
    artist_name, user_rating, release_date = [], [], []
    # Order in which the 'data' spans are expected to appear for each album.
    detail_columns = [artist_name, user_rating, release_date]

    for score_tag in soup.find_all('div', {'class': 'metascore_w small release positive'}):
        critic_rating.append(score_tag.text)
        details_tag = score_tag.find_next()
        if details_tag is None:
            # The score div was the last element on the page: no details to read.
            continue
        for position, span in enumerate(details_tag.find_all('span', {'class': 'data'})):
            detail_columns[position % len(detail_columns)].append(span.text)

    # zip() stops at the shortest list, so incomplete trailing records are dropped.
    rows = list(zip(album_title[:100], artist_name, critic_rating, user_rating, release_date))
    return pd.DataFrame(
        rows,
        columns=['Album_Title', 'Artist_Name', 'Critic_Rating', 'User_Rating', 'Release_Date'],
    )

In [None]:
# combines information for each album into a dataframe
def clean_page(soup_list):
    """Run ``clean_html`` on every parsed page and stack the results.

    The per-page frames are concatenated in input order into one DataFrame.
    Row labels are kept exactly as each page's frame produced them; no
    re-indexing happens here.
    """
    page_frames = []
    for page_soup in soup_list:
        page_frames.append(clean_html(page_soup))
    return pd.concat(page_frames)

In [None]:
# Extract the album rows from every parsed page and stack them into a single
# DataFrame; at this point the row index still restarts for each page.
long_critics_df = clean_page(soup_source)

In [None]:
# Replace the per-page row labels left over from concatenation with one
# continuous index; drop=True discards the old labels rather than keeping
# them as a column.
critics_df = long_critics_df.reset_index(drop = True)

In [None]:
# Show the combined album table as this cell's output.
critics_df

In [None]:
# Save album dataframe as pickle file. 
# with open('data/critics_df_all.pickle', 'wb') as to_write:
#     pickle.dump(critics_df, to_write)