In [58]:
import re
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
pd.set_option("display.max_rows", 201)



# Scraping

This is a nice wrapper for getting the soup object from a url. 

In [4]:
def get_soup(url):
    """
    Small wrapper around urllib and BeautifulSoup. Takes a url
    and produces the soup object. Includes a header so that 
    http errors are less likely.
    """
    hdr = {"User-Agent": "Mozilla/5.0"}
    req = Request(url, headers=hdr)
    page = urlopen(req)
    soup = BeautifulSoup(page, "html")
    return soup


def combine_to_df(
    artist_list, album_list, rank_list, url, year_list=None, label_list=None
):
    df = pd.DataFrame({"artist": artist_list, "album": album_list,})

    df['rank'] = rank_list 
    if year_list:
        df["year"] = year_list

    if label_list:
        df["label"] = label_list

    df["reviewer_url"] = url

    return df

## Pitchfork

In [5]:
url = (
    "https://pitchfork.com/features/lists-and-guides/the-200-best-albums-of-the-2010s/"
)


soup = get_soup(url)

In [6]:
artist = []
album = []
year = []
for tag in soup.find_all("h2"):
    artist.append(tag.contents[0][:-2])
    album.append(tag.contents[1].contents[0])
    year.append(tag.contents[2][2:-1])

# cleaning
year[117] = "2013"
year[135] = "2015"

pitchfork_df = combine_to_df(
    artist_list=artist,
    album_list=album,
    rank_list=[200 - n for n in range(200)],
    year_list=year,
    url=url,
)

In [7]:
pitchfork_df.head()

Unnamed: 0,artist,album,rank,year,reviewer_url
0,Ratking,So It Goes,200,2014,https://pitchfork.com/features/lists-and-guide...
1,Wu Lyf,Go Tell Fire to the Mountain,199,2011,https://pitchfork.com/features/lists-and-guide...
2,Jean Grae / Quelle Chris,Everything’s Fine,198,2018,https://pitchfork.com/features/lists-and-guide...
3,Fatima Al Qadiri,Genre-Specific Xperience,197,2011,https://pitchfork.com/features/lists-and-guide...
4,Portal,Vexovoid,196,2013,https://pitchfork.com/features/lists-and-guide...


## billboard

In [8]:
url = "https://www.billboard.com/articles/news/list/8543722/best-albums-of-the-2010s-top-100"

soup = get_soup(url)

In [9]:
billboard_rank = []
artist = []
album = []
year = []

for tag in soup.find_all("strong"):
    billboard_rank.append(tag.text.split(".")[0])
    artist.append(tag.text.split(". ", 1)[1].split(",", 1)[0])
    album.append(tag.text.split(". ", 1)[1].split(",", 1)[1].split("(")[0])
    year.append(tag.text.split(". ", 1)[1].split(",", 1)[1].split("(")[1][:-1])

billboard_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=billboard_rank,
    year_list=year,
    url=url,
)

In [10]:
# Small tweaks
billboard_df.iloc[0, 3] = "2018"
billboard_df.iloc[44, 0] = "Tyler, the Creator"
billboard_df.iloc[44, 1] = "IGOR"

In [11]:
billboard_df.head()

Unnamed: 0,artist,album,rank,year,reviewer_url
0,Lady Gaga & Bradley Cooper,A Star Is Born Soundtrack,100,2018,https://www.billboard.com/articles/news/list/8...
1,Lady Antebellum,Need You Now,99,2010,https://www.billboard.com/articles/news/list/8...
2,Japandroids,Celebration Rock,98,2012,https://www.billboard.com/articles/news/list/8...
3,Porter Robinson,Worlds,97,2014,https://www.billboard.com/articles/news/list/8...
4,Ed Sheeran,x,96,2014,https://www.billboard.com/articles/news/list/8...


## Stereogum

In [12]:
url = "https://www.stereogum.com/featured/best-albums-of-the-2010s-list/"

soup = get_soup(url)

In [13]:
def h2_if_nonempty(tag):
    if tag.contents:
        if "h2" in tag.name:
            return True
    else:
        return False

In [14]:
stereogum_rank = []
artist = []
album = []
label = []
year = []
for tag in soup.find_all(h2_if_nonempty):
    stereogum_rank.append(tag.contents[0].contents[0])
    artist.append(tag.contents[1])
    album.append(tag.contents[2].contents[0])
    label.append(tag.contents[4].contents[0].split(",")[0])
    year.append(tag.contents[4].contents[0].split(",")[1])

# Cleaning
artist = [x[1:-3] for x in artist]
label = [x[1:] for x in label]
year = [x[1:-1] for x in year]

stereogum_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=stereogum_rank,
    year_list=year,
    url=url,
)

In [15]:
stereogum_df.head()

Unnamed: 0,artist,album,rank,year,reviewer_url
0,Future,Pluto,100,2012,https://www.stereogum.com/featured/best-albums...
1,Car Seat Headrest,Teens Of Denial,99,2016,https://www.stereogum.com/featured/best-albums...
2,PUP,The Dream Is Over,98,2016,https://www.stereogum.com/featured/best-albums...
3,Courtney Barnett,"Sometimes I Sit And Think, And Sometimes I Jus...",97,2015,https://www.stereogum.com/featured/best-albums...
4,Colleen Green,I Want To Grow Up,96,2015,https://www.stereogum.com/featured/best-albums...


## Consequence of Sound 

In [16]:
url =  "https://consequenceofsound.net/2019/11/top-albums-of-the-2010s/full-post/"

soup = get_soup(url)

In [17]:
cos_rank = []
artist = []
album = []
year = []

for tag in soup.find_all('h2')[2:-1]:
    cos_rank.append(tag.contents[0].split('.')[0])
    artist.append(tag.contents[0].split('.', 1)[1][1:-3])
    album.append(tag.contents[1].contents[0])
    year.append(tag.contents[2][2:-1])

In [18]:
cos_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=cos_rank,
    year_list=year,
    url=url,
)

# fixes
cos_df.iloc[67, 3] = '2015'
cos_df.iloc[83, 3] = '2016'
cos_df.iloc[89, 3] = '2012'

In [19]:
cos_df

Unnamed: 0,artist,album,rank,year,reviewer_url
0,PJ Harvey,Let England Shake,100,2011,https://consequenceofsound.net/2019/11/top-alb...
1,Savages,Silence Yourself,99,2013,https://consequenceofsound.net/2019/11/top-alb...
2,Destroyer,Kaputt,98,2011,https://consequenceofsound.net/2019/11/top-alb...
3,Ariana Grande,"thank you, next",97,2019,https://consequenceofsound.net/2019/11/top-alb...
4,Bon Iver,"22, A Million",96,2016,https://consequenceofsound.net/2019/11/top-alb...
5,Oneohtrix Point Never,Replica,95,2012,https://consequenceofsound.net/2019/11/top-alb...
6,Tom Waits,Bad as Me,94,2011,https://consequenceofsound.net/2019/11/top-alb...
7,BROCKHAMPTON,Iridescence,93,2018,https://consequenceofsound.net/2019/11/top-alb...
8,Caribou,Our Love,92,2014,https://consequenceofsound.net/2019/11/top-alb...
9,Courtney Barnett,"Sometimes I Sit and Think, and Sometimes I Jus...",91,2015,https://consequenceofsound.net/2019/11/top-alb...


## Time

In [20]:
url =  "https://time.com/5725768/best-albums-2010s-decade/"

soup = get_soup(url)

In [21]:
tag_list = [tag for tag in soup.find_all("strong")]
tag_list.remove(tag_list[-1])

artist = []
album = []
year = []

for tag in tag_list:
    artist.append(tag.text.split(',')[0].strip())
    album.append(tag.text.split(',')[1].split('(')[0].strip())
    year.append(tag.text.split(',')[1].split('(')[1].strip()[:-1])

In [22]:
time_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=10,
    year_list=year,
    url=url,
)

In [23]:
time_df

Unnamed: 0,artist,album,rank,year,reviewer_url
0,Fiona Apple,The Idler Wheel…,10,2012,https://time.com/5725768/best-albums-2010s-dec...
1,Miguel,Kaleidoscope Dream,10,2012,https://time.com/5725768/best-albums-2010s-dec...
2,Beauty Pill,Beauty Pill Describes Things As They Are,10,2015,https://time.com/5725768/best-albums-2010s-dec...
3,Carly Rae Jepsen,E•MO•TION,10,2015,https://time.com/5725768/best-albums-2010s-dec...
4,Beyoncé,Lemonade,10,2016,https://time.com/5725768/best-albums-2010s-dec...
5,Leonard Cohen,You Want It Darker,10,2016,https://time.com/5725768/best-albums-2010s-dec...
6,Miranda Lambert,The Weight of These Wings,10,2016,https://time.com/5725768/best-albums-2010s-dec...
7,Solange,A Seat At the Table,10,2016,https://time.com/5725768/best-albums-2010s-dec...
8,Kendrick Lamar,DAMN.,10,2017,https://time.com/5725768/best-albums-2010s-dec...
9,Ozuna,Aura,10,2018,https://time.com/5725768/best-albums-2010s-dec...


## Paste Magazine

In [24]:
urls = [
    "https://www.pastemagazine.com/articles/2019/10/best-albums-of-the-2010s-paste.html",
    "https://www.pastemagazine.com/articles/2019/10/best-albums-of-the-2010s-paste.html?p=2",
    "https://www.pastemagazine.com/articles/2019/10/best-albums-of-the-2010s-paste.html?p=3",
    "https://www.pastemagazine.com/articles/2019/10/best-albums-of-the-2010s-paste.html?p=4",
]

In [25]:
paste_rank = []
artist = []
album = []
year = []

tag_list = []
for url in urls:
    soup = get_soup(url)

    for tag in soup.find_all("b", class_="big"):
        tag_list.append(tag)

tag_list = tag_list[1:]

for tag in tag_list:
    paste_rank.append(tag.text.split(".", 1)[0])
    artist.append(tag.text.split(".", 1)[1].split(":", 1)[0][1:])
    album.append(tag.text.split(":", 1)[1][1:-7])
    year.append(tag.text[-5:-1])

paste_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=paste_rank,
    year_list=year,
    url=urls[0],
)

In [26]:
paste_df.head()

Unnamed: 0,artist,album,rank,year,reviewer_url
0,Drake,Take Care,100,2011,https://www.pastemagazine.com/articles/2019/10...
1,Japanese Breakfast,Soft Sounds from Another Planet,99,2017,https://www.pastemagazine.com/articles/2019/10...
2,Nick Cave & The Bad Seeds,Skeleton Tree,98,2016,https://www.pastemagazine.com/articles/2019/10...
3,Deafheaven,Sunbather,97,2013,https://www.pastemagazine.com/articles/2019/10...
4,"Tyler, The Creator",Flower Boy,96,2017,https://www.pastemagazine.com/articles/2019/10...


## Genius

In [27]:
url = "https://genius.com/a/the-genius-communitys-100-best-albums-of-the-2010s"

soup = get_soup(url)

In [28]:
genius_rank = []
artist = []
album = []

for tag in soup.find_all("div", class_="g_list-item-header"):
    genius_rank.append(tag.text.replace("\n", "")[10:].split(" ", 1)[0])
    artist.append(tag.text.replace("\n", "")[10:].split(" ", 1)[1][17:].split(" – ")[0])
    album.append(tag.text.replace("\n", "")[10:].split(" ", 1)[1][17:].split(" – ")[1])

year_list = [
    '2015'
]

In [29]:
genius_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=genius_rank,
    url=url,
)

In [30]:
genius_df.head()

Unnamed: 0,artist,album,rank,reviewer_url
0,Justin Bieber,Purpose,100,https://genius.com/a/the-genius-communitys-100...
1,Katy Perry,Teenage Dream,99,https://genius.com/a/the-genius-communitys-100...
2,Bon Iver,"Bon Iver, Bon Iver",98,https://genius.com/a/the-genius-communitys-100...
3,Billie Eilish,"WHEN WE ALL FALL ASLEEP, WHERE DO WE GO?",97,https://genius.com/a/the-genius-communitys-100...
4,Schoolboy Q,Oxymoron,96,https://genius.com/a/the-genius-communitys-100...


## Rolling Stone

In [31]:
urls = [
  "https://www.rollingstone.com/music/music-lists/best-albums-2010s-ranked-913997/",
    "https://www.rollingstone.com/music/music-lists/best-albums-2010s-ranked-913997/carly-rae-jepsen-emotion-album-917470/"
] 

soup_list = [get_soup(url) for url in urls]

In [32]:
rs_rank = []
album = []
artist = []

for soup in soup_list:
    for tag in soup.find_all("header", class_="c-list__header"):
        rs_rank.append(tag.span.text.strip())
        artist.append(tag.h3.text.strip().split(", ", 1)[0])
        if len(tag.h3.text.strip().split(", ", 1)) == 1:
            album.append('Hamilton')
        else:
            album.append(tag.h3.text.strip().split(", ", 1)[1][1:-1])

In [33]:
rs_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=genius_rank,
    url=urls[0],
)

In [34]:
rs_df

Unnamed: 0,artist,album,rank,reviewer_url
0,The War on Drugs,A Deeper Understanding,100,https://www.rollingstone.com/music/music-lists...
1,Lana Del Rey,Ultraviolence,99,https://www.rollingstone.com/music/music-lists...
2,John Prine,The Tree of Forgiveness,98,https://www.rollingstone.com/music/music-lists...
3,Rich Gang,Tha Tour Part 1,97,https://www.rollingstone.com/music/music-lists...
4,Low Cut Connie,Call Me Sylvia,96,https://www.rollingstone.com/music/music-lists...
5,Future,Monster,95,https://www.rollingstone.com/music/music-lists...
6,Sheer Mag,Need to Feel Your Love,94,https://www.rollingstone.com/music/music-lists...
7,U2,Songs of Innocence,93,https://www.rollingstone.com/music/music-lists...
8,Vampire Weekend,Father of the Bride,92,https://www.rollingstone.com/music/music-lists...
9,Tierra Whack,Whack World,91,https://www.rollingstone.com/music/music-lists...


# Combining Data 

In [35]:
def reformat(df_list):
    """
    Trims any external whitespaces on all strings and turns rank and year into integers.
    """
    for df in df_list:

        # trim any whitespace around the strings in each column
        for column in df.columns:
            if df[column].dtype == 'object':
                df[column] = df[column].apply(lambda x: x.strip())

        # Make years and ranks integers
        
#         if "year" in df.columns:
#             df["year"] = df["year"].astype('Int32')
            
        df["rank"] = df["rank"].astype(int)

    return

In [36]:
df_list = [
    pitchfork_df,
    billboard_df,
    stereogum_df,
    cos_df,
    time_df,
    paste_df,
    genius_df,
    rs_df
]

reformat(df_list)

In [37]:
albums = pd.concat(df_list, sort=False)

In [38]:
# albums['year'] = albums['year'].astype('Int64')
albums.reset_index()

Unnamed: 0,index,artist,album,rank,year,reviewer_url
0,0,Ratking,So It Goes,200,2014,https://pitchfork.com/features/lists-and-guide...
1,1,Wu Lyf,Go Tell Fire to the Mountain,199,2011,https://pitchfork.com/features/lists-and-guide...
2,2,Jean Grae / Quelle Chris,Everything’s Fine,198,2018,https://pitchfork.com/features/lists-and-guide...
3,3,Fatima Al Qadiri,Genre-Specific Xperience,197,2011,https://pitchfork.com/features/lists-and-guide...
4,4,Portal,Vexovoid,196,2013,https://pitchfork.com/features/lists-and-guide...
...,...,...,...,...,...,...
802,95,David Bowie,Blackstar,5,,https://www.rollingstone.com/music/music-lists...
803,96,Taylor Swift,Red,4,,https://www.rollingstone.com/music/music-lists...
804,97,Kendrick Lamar,To Pimp a Butterfly,3,,https://www.rollingstone.com/music/music-lists...
805,98,Beyoncé,Lemonade,2,,https://www.rollingstone.com/music/music-lists...


In [39]:
artist_list = albums.artist.unique()

## Fixing missing years

Some albums have an incorrectly listed year, while others might have a missing year altogether. The following blocks aims to search the rest of the dataframe for the most common year.

In [40]:
def get_year(album, df_list=df_list):

    # ideally a singleton

    years = []
    for df in df_list:
        found_year_in_df = False

        if album in df["album"].values:
            if "year" in df.columns:
                found_year_in_df = True

        if found_year_in_df:
            years.append(df.loc[df["album"] == album, "year",].values[0])
    return years


def most_common(lst):
    """ Taken from 
    https://stackoverflow.com/questions/1518522/find-the-most-common-element-in-a-list
    """
    if lst:
        return max(set(lst), key=lst.count)
    else:
        return None

In [41]:
# temporary to deal with nan's
albums['year'] = albums['year'].astype(float)

# 
albums['most_common_year'] = albums.apply(
    lambda row: most_common(get_year(row['album'])) if np.isnan(row['year']) else row['year'],
    axis=1
)


In [42]:
# albums.to_csv("semi_cleaned_data.csv", index=False)

In [43]:
# albums = pd.read_csv("semi_cleaned_data.csv")

Next, use a fuzzyfinder tool to make sure no albums appear more than once.

## Fixing Album Names

Some albums appear with a different name. For example, David Bowie's _Blackstar_.

In [53]:
def get_albums_from_artist(artist, df=albums):
    return list(set(df.loc[df.artist == artist, 'album'].to_list()))


In [None]:
# Write a for-loop over the artist.
# Get the list of albums.
# fuzzyfind on them to distill the number of distinct albums.

for artist in artist_list:
    

In [45]:
get_albums_from_artist('David Bowie')

Unnamed: 0,artist,album,rank,year,reviewer_url,most_common_year
163,David Bowie,Blackstar,37,2016.0,https://pitchfork.com/features/lists-and-guide...,2016.0
48,David Bowie,Blackstar,49,2016.0,https://www.billboard.com/articles/news/list/8...,2016.0
85,David Bowie,Blackstar,15,2016.0,https://www.stereogum.com/featured/best-albums...,2016.0
83,David Bowie,★,17,2016.0,https://consequenceofsound.net/2019/11/top-alb...,2016.0
86,David Bowie,Blackstar,14,2016.0,https://www.pastemagazine.com/articles/2019/10...,2016.0
78,David Bowie,★ (Blackstar),22,,https://genius.com/a/the-genius-communitys-100...,
95,David Bowie,Blackstar,5,,https://www.rollingstone.com/music/music-lists...,2016.0


# Data Exploration

In [46]:
# Do a word bubble of this
albums['artist'].unique()

array(['Ratking', 'Wu Lyf', 'Jean Grae / Quelle Chris',
       'Fatima Al Qadiri', 'Portal', 'Downtown Boys', 'Titus Andronicus',
       'Lil Peep', 'Kelela', 'Kate Bush', 'Huerco S.', 'Miranda Lambert',
       'Pusha-T', 'Iceage', 'Various Artists', 'Hailu Mergia',
       'Soccer Mommy', 'Elysia Crampton', 'G.L.O.S.S.', 'Jamila Woods',
       'Chvrches', 'Shabazz Palaces', 'Savages', 'Margo Price',
       'Gil Scott-Heron', 'Jessie Ware', 'Sharon Van Etten',
       'Hurray for the Riff Raff', 'Popcaan', 'JPEGMAFIA', 'Skee Mask',
       'Paramore', 'Julianna Barwick', '(Sandy) Alex G', 'Nicki Minaj',
       'Jay Som', 'Hop Along', 'Meek Mill', 'Sturgill Simpson',
       'The 1975', 'Noname', 'Pallbearer', 'Sleigh Bells', 'Jenny Lewis',
       '21 Savage / Metro Boomin', 'Mica Levi', 'Purple Mountains',
       'Arctic Monkeys', 'Lady Gaga', 'Playboi Carti', 'Mac DeMarco',
       'Nicolas Jaar', 'The National', 'Sheer Mag', 'Chief Keef',
       'PJ Harvey', 'Weyes Blood', 'Fennesz', 'Ric

Make a new column for a score conversion.