## Web Scraping
***Go to the "Rate Your Music" website and retrieve a chart of the top albums of all time***

In [1]:
# Import Libraries
import time
from urllib.parse import quote

import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser
from sqlalchemy import create_engine
from sqlalchemy import inspect


***Extract***
* Use Splinter to navigate the "Top Albums of all Time" chart (40 albums per web page)
* Use Beautiful Soup to extract individual album details - Title, Artist, Rating and Spotify link
* Capture initial results in Pandas

In [2]:
from webdriver_manager.chrome import ChromeDriverManager
# Obtain the chromedriver path from webdriver_manager and pass it to Splinter
# as the 'executable_path' keyword argument; Chrome is started headless.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=True, **executable_path)

[WDM] - Current google-chrome version is 88.0.4324
[WDM] - Get LATEST driver version for 88.0.4324


[WDM] - Driver [C:\Users\bentl\.wdm\drivers\chromedriver\win32\88.0.4324.96\chromedriver.exe] found in cache


In [3]:
# Open the Rate Your Music "Top Albums of all time" chart in the browser
url = 'https://rateyourmusic.com/charts/top/album/all-time/'
browser.visit(url)

# Accumulator holding one dictionary per scraped album
album_list = list()

In [4]:
# Upper bound on the number of chart pages to walk through.
MAX_PAGES = 125


def parse_album(album):
    """Extract the chart fields of a single album element.

    Parameters
    ----------
    album : bs4 element
        One 'topcharts_itembox chart_item_release' div from the chart page.

    Returns
    -------
    dict or None
        The album attributes keyed by column name, or None when any of the
        expected elements is missing (find() then returns None and the
        attribute access raises AttributeError). In particular, an album
        without a Spotify link is skipped entirely.
    """
    try:
        return {
            "position" : album.find('div', class_='topcharts_position').text,
            "title" : album.find('a', class_='release').text,
            "artist" : album.find('a', class_='artist').text,
            "release_date" : album.find('div', class_='topcharts_item_releasedate').text.strip('\t\r\n'),
            "genres" : album.find('div', class_='topcharts_item_genres_container').text.strip('\t\r\n'),
            "secondarygenres" : album.find('div', class_='topcharts_item_secondarygenres_container').text.strip('\t\r\n'),
            "avg_rating" : album.find('span', class_='topcharts_stat topcharts_avg_rating_stat').text,
            "rating_count" : album.find('span', class_='topcharts_stat topcharts_ratings_stat').text,
            "review_count" : album.find('span', class_='topcharts_stat topcharts_reviews_stat').text,
            "spotify_link" : album.find('a', class_='ui_media_link_btn ui_media_link_btn_spotify').attrs.get("href") }
    except AttributeError:
        return None


# Iterate through all pages of the "Top Albums of all time"
for page_number in range(MAX_PAGES):
    # Parse the HTML of the page currently loaded in the browser
    soup = BeautifulSoup(browser.html, 'html.parser')
    # Retrieve all elements that contain album information
    albums = soup.find_all('div', class_='topcharts_itembox chart_item_release')

    # Keep every album whose fields could all be extracted
    for album in albums:
        album_dict = parse_album(album)
        if album_dict is not None:
            album_list.append(album_dict)

    # Click the 'Next' button on each page
    try:
        browser.click_link_by_partial_text('Next')
    except Exception:
        # No further page could be opened. Stop here instead of scraping the
        # same page again on every remaining iteration. Catching Exception
        # (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print("Scraping Complete")
        break

    # Give the next page a moment to load before reading its HTML
    time.sleep(1)


Scraping Complete


In [5]:
# Build a Pandas dataframe with one row per scraped album dictionary
top_albums = pd.DataFrame(data=album_list)
top_albums.head(n=5)

Unnamed: 0,position,title,artist,release_date,genres,secondarygenres,avg_rating,rating_count,review_count,spotify_link
0,1.0,OK Computer,Radiohead,16 June 1997 \n \n ...,"Alternative Rock, \nArt Rock",,4.23,64317,1530,https://open.spotify.com/album/7dxKtc08dYeRVHt...
1,2.0,Wish You Were Here,Pink Floyd,12 September 1975 \n \n ...,"Progressive Rock, \nArt Rock",Psychedelic Rock,4.29,44399,948,https://open.spotify.com/album/0bCAjiUamIFqKJs...
2,3.0,In the Court of the Crimson King,King Crimson,10 October 1969 \n \n ...,"Progressive Rock, \nArt Rock","Symphonic Prog, \nFree Improvisation, \nJazz-R...",4.3,40667,833,https://open.spotify.com/album/5wec5BciMpDMzlE...
3,4.0,Kid A,Radiohead,3 October 2000 \n \n ...,"Art Rock, \nExperimental Rock, \nElectronic","Ambient, \nElectronic",4.21,53511,730,https://open.spotify.com/album/19RUXBFyM4PpmrL...
4,6.0,The Dark Side of the Moon,Pink Floyd,23 March 1973 \n \n ...,"Art Rock, \nProgressive Rock","Psychedelic Rock, \nSpace Rock",4.2,53385,1534,https://open.spotify.com/album/4LH4d3cOWNNsVw4...


***Transform***
* Remove end of line characters
* Set the index to the position
* Sort by position in the chart

In [6]:
# Remove end of line characters: collapse every run of whitespace into one
# space and trim the ends. The result is assigned back to the frame because
# calling replace(..., inplace=True) on a selected column is chained
# assignment, which does not update the frame under pandas Copy-on-Write.
for column in ['release_date', 'genres', 'secondarygenres']:
    top_albums[column] = (
        top_albums[column]
        .replace(r'\s+|\\n', ' ', regex=True)
        .str.strip()
    )
# Remove the full stop at the end of position. regex=True is explicit because
# str.replace treats the pattern as a literal string by default since pandas 2.0.
# Converting to integer makes the sort below numeric rather than lexical.
top_albums['position'] = (
    top_albums['position']
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.strip()
    .astype(int)
)
# Convert to integer (the scraped counts use ',' as thousands separator)
top_albums['rating_count'] = top_albums['rating_count'].str.replace(',', '', regex=False).astype(int)
top_albums['review_count'] = top_albums['review_count'].str.replace(',', '', regex=False).astype(int)
# Clean the data by dropping duplicates, setting the index and sorting by
# position in the chart
top_albums = (
    top_albums
    .drop_duplicates(subset="position")
    .set_index("position")
    .sort_index()
)


In [7]:
# Preview the first 15 rows of top_albums
top_albums.head(n=15)

Unnamed: 0_level_0,title,artist,release_date,genres,secondarygenres,avg_rating,rating_count,review_count,spotify_link
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,OK Computer,Radiohead,16 June 1997,"Alternative Rock, Art Rock",,4.23,64317,1530,https://open.spotify.com/album/7dxKtc08dYeRVHt...
2,Wish You Were Here,Pink Floyd,12 September 1975,"Progressive Rock, Art Rock",Psychedelic Rock,4.29,44399,948,https://open.spotify.com/album/0bCAjiUamIFqKJs...
3,In the Court of the Crimson King,King Crimson,10 October 1969,"Progressive Rock, Art Rock","Symphonic Prog, Free Improvisation, Jazz-Rock,...",4.3,40667,833,https://open.spotify.com/album/5wec5BciMpDMzlE...
4,Kid A,Radiohead,3 October 2000,"Art Rock, Experimental Rock, Electronic","Ambient, Electronic",4.21,53511,730,https://open.spotify.com/album/19RUXBFyM4PpmrL...
6,The Dark Side of the Moon,Pink Floyd,23 March 1973,"Art Rock, Progressive Rock","Psychedelic Rock, Space Rock",4.2,53385,1534,https://open.spotify.com/album/4LH4d3cOWNNsVw4...
7,Abbey Road,The Beatles,26 September 1969,Pop Rock,"Psychedelic Pop, Progressive Pop, Art Pop",4.26,40898,928,https://open.spotify.com/album/0ETFjACtuP2ADo6...
8,The Velvet Underground & Nico,The Velvet Underground,12 March 1967,"Art Rock, Experimental Rock","Proto-Punk, Noise Rock, Garage Rock, Psychedel...",4.24,42268,922,https://open.spotify.com/album/4xwx0x7k6c5VuTh...
9,To Pimp a Butterfly,Kendrick Lamar,15 March 2015,"Conscious Hip Hop, West Coast Hip Hop, Jazz Rap","Political Hip Hop, Neo-Soul, Funk, Poetry, Exp...",4.25,38136,333,https://open.spotify.com/album/7ycBtnsMtyVbbwT...
10,The Rise and Fall of Ziggy Stardust and the Sp...,David Bowie,16 June 1972,"Glam Rock, Pop Rock","Art Rock, Rock Opera",4.26,36209,710,https://open.spotify.com/album/48D1hRORqJq52qs...
11,Revolver,The Beatles,5 August 1966,"Pop Rock, Psychedelic Pop","Psychedelic Rock, Psychedelic Pop",4.23,39800,1229,https://open.spotify.com/album/3PRoXYsngSwjEQW...


***Load***
* Connect to the music_db PostgreSQL database
* Write the dataframe to the album table

In [8]:
# PostgreSQL connection requirements
from config import userid
from config import password

In [9]:
# Connect to database
# Percent-encode the credentials so that reserved URL characters in them
# (for example '@', ':' or '/') cannot be mistaken for URL delimiters.
# safe='' also encodes '/', and spaces become %20 rather than '+'.
rds_connection_string = (
    f"{quote(str(userid), safe='')}:{quote(str(password), safe='')}"
    "@localhost:5432/music_db"
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [10]:
# Confirm tables
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector provides the same list of table names.
inspect(engine).get_table_names()

['spotify_csv', 'spotify_api', 'album']

In [11]:
# Write the dataframe to the database: rows are appended to the 'album'
# table and the dataframe index is written as a column
top_albums.to_sql('album', engine, index=True, if_exists='append')

In [19]:
# Confirm records in database
from sqlalchemy.orm import Session
from sqlalchemy.ext.automap import automap_base

# Reflect the existing tables into mapped classes
Base = automap_base()
Base.prepare(engine, reflect=True)
# Base.classes.keys()

Album = Base.classes.album

# Query a single row, then always close the session so its database
# connection is released even if the query fails.
session = Session(engine)
try:
    first_row = session.query(Album).first()
finally:
    session.close()

# first() returns None when the table is empty; guard so the cell does not
# fail with AttributeError in that case.
first_row.__dict__ if first_row is not None else None

{'_sa_instance_state': <sqlalchemy.orm.state.InstanceState at 0x2be68c6cc18>,
 'release_date': '16 June 1997 ',
 'title': 'OK Computer',
 'review_count': 1530,
 'avg_rating': Decimal('4.23'),
 'genres': 'Alternative Rock, Art Rock',
 'artist': 'Radiohead',
 'position': 1,
 'spotify_link': 'https://open.spotify.com/album/7dxKtc08dYeRVHt3p9CZJn',
 'rating_count': 64317,
 'secondarygenres': ''}

In [21]:
# Additionally export the dataframe as a CSV file
csv_path = 'resources/albums.csv'
top_albums.to_csv(csv_path)