## Web Scraping Pitchfork for Album Reviews

In [None]:
#import libraries
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pickle
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

### To acquire the HTML for the review links on Pitchfork from 2015 onwards, I scrolled down their review page until reaching the year 2015 and saved the HTML as a file.

In [None]:
#turn the saved html file into a beautiful soup object
filename = 'pitchfork_15-present.htm'
# BeautifulSoup reads the whole file while constructing the object, so the
# handle can be closed as soon as parsing is done instead of being leaked.
with open(filename, 'r') as file:
    soup = BeautifulSoup(file, 'html.parser')


### Grab the individual links for each review and append them to a list

In [None]:
# Find every review anchor on the saved listing page and keep its href
# (None is stored for an anchor that has no href attribute).
link_tags = soup.findAll('a', {'class': ['review__link']})
real_links = [anchor.get('href') for anchor in link_tags]

In [None]:
#save links
files = 'pitchfork_rev_links.pickle'
# Use a context manager so the pickle is flushed and the handle closed
# immediately, rather than whenever the anonymous file object is collected.
with open(files, 'wb') as f:
    pickle.dump(real_links, f)

### Use the review links to scrape Pitchfork and gather each review's album, artist, score, genre and summary.

In [None]:
#create list for each attribute
album = []
artist = []
score = []
genres = []
summary = []


def _text_or_null(tag):
    """Return the text of a BeautifulSoup tag, or 'null' when the tag is None.

    ``soup.find`` returns None when nothing matches, so this keeps one entry
    per review in every attribute list even when a field is missing.
    """
    return tag.get_text() if tag is not None else 'null'


#loop through each review link and gather desired info
for url in tqdm(real_links):
    # Pause between requests so the site is not hit in rapid succession.
    time.sleep(2)
    try:
        # The timeout stops a single stalled connection from hanging the run.
        html_page = requests.get(url, timeout=30)
    except requests.RequestException as err:
        # A network failure or malformed link must not discard everything
        # scraped so far: record a placeholder row so the five lists stay
        # the same length, report the failure, and move on.
        tqdm.write('request failed for {}: {}'.format(url, err))
        for values in (album, artist, score, genres, summary):
            values.append('null')
        continue
    soup = BeautifulSoup(html_page.content, 'html.parser')
    album_tag = soup.find('h1', {'class': ['single-album-tombstone__review-title']})
    artist_tag = soup.find('ul', {'class': ['artist-links artist-list single-album-tombstone__artist-links']})
    score_tag = soup.find('span', {'class': ['score']})
    genres_tag = soup.find('ul', {'class': ['genre-list genre-list--before']})
    summary_tag = soup.find('div', {'class': ['review-detail__abstract']})
    album.append(_text_or_null(album_tag))
    artist.append(_text_or_null(artist_tag))
    score.append(_text_or_null(score_tag))
    genres.append(_text_or_null(genres_tag))
    summary.append(_text_or_null(summary_tag))

print(album, artist, score, genres, summary)
    

### Save lists and a dataframe of all pitchfork info

In [None]:
# Write each scraped attribute list to its own pickle file.
pickle_targets = [
    ('pitchfork_album.pickle', album),
    ('pitchfork_artist.pickle', artist),
    ('pitchfork_score.pickle', score),
    ('pitchfork_genres.pickle', genres),
    ('pitchfork_summary.pickle', summary),
]
for target_path, values in pickle_targets:
    with open(target_path, 'wb') as f:
        pickle.dump(values, f)

In [None]:
# Build the data frame: each attribute list starts as one labelled row, and
# the transpose turns those labels into columns with one row per review.
column_names = ['Album', 'Artist', 'Score', 'Genres', 'Summary']
attribute_rows = [album, artist, score, genres, summary]
pitchfork_df = pd.DataFrame(attribute_rows, index=column_names).T
# Show the number of rows, then preview the first 100.
print(len(pitchfork_df))
pitchfork_df.head(100)

In [None]:
# Write the assembled data frame to disk as a pickle.
df_pickle_path = "pitchfork_df.pickle"
pitchfork_df.to_pickle(df_pickle_path)