In [16]:
import os
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

pd.set_option("display.max_rows", 500)

from utilities import DataCleaner


def ranks(n):
    return list(range(1, n + 1))[::-1]


def remove_parentheses(string):
    return string.split("(")[0].strip()

# Scraping

This is a nice wrapper for getting the soup object from a url. 

## Pitchfork

In [17]:
pitchfork_url = (
    "https://pitchfork.com/features/lists-and-guides/the-200-best-albums-of-the-2010s/"
)

pitchfork = DataCleaner(pitchfork_url, "Pitchfork")

In [18]:
pitchfork.create_tags("h2")
pitchfork.split(": ")
pitchfork.transpose()
# pitchfork.albums = pitchfork.apply(pitchfork.albums, remove_parentheses)
pitchfork.extract_year_from_paren("albums")
pitchfork.apply("albums", remove_parentheses)
pitchfork.create_df(ranks(200))

# pitchfork.print_data()
pitchfork.df

Unnamed: 0,artist,album,rank,genre,year,reviewer,reviewer_url
0,Ratking,So It Goes,200,,2014,Pitchfork,https://pitchfork.com/features/lists-and-guide...
1,Wu Lyf,Go Tell Fire to the Mountain,199,,2011,Pitchfork,https://pitchfork.com/features/lists-and-guide...
2,Jean Grae / Quelle Chris,Everything’s Fine,198,,2018,Pitchfork,https://pitchfork.com/features/lists-and-guide...
3,Fatima Al Qadiri,Genre-Specific Xperience,197,,2011,Pitchfork,https://pitchfork.com/features/lists-and-guide...
4,Portal,Vexovoid,196,,2013,Pitchfork,https://pitchfork.com/features/lists-and-guide...
5,Downtown Boys,Full Communism,195,,2015,Pitchfork,https://pitchfork.com/features/lists-and-guide...
6,Titus Andronicus,The Monitor,194,,2010,Pitchfork,https://pitchfork.com/features/lists-and-guide...
7,Lil Peep,Hellboy,193,,2016,Pitchfork,https://pitchfork.com/features/lists-and-guide...
8,Kelela,Cut 4 Me,192,,2013,Pitchfork,https://pitchfork.com/features/lists-and-guide...
9,Kate Bush,50 Words for Snow,191,,2011,Pitchfork,https://pitchfork.com/features/lists-and-guide...


In [19]:
len(pitchfork.df)

200

## billboard

In [20]:
bill_url = "https://www.billboard.com/articles/news/list/8543722/best-albums-of-the-2010s-top-100"
bill = DataCleaner(bill_url, "Billboard")
bill.create_tags("strong")
bill.strings.insert(25, "75. Charli XCX, True Romance (2013)")
bill.strings.insert(41, "59. Pusha T, Daytona (2018)")
bill.strings.insert(50, "50. Carly Rae Jepsen, E•MO•TION (2015)")
bill.trim("strings", chars="1234567890. ")
bill.extract_year_from_paren("strings")


bill.print_data()

Lady Gaga & Bradley Cooper, A Star Is Born Soundtrack (2018) 
Lady Antebellum, Need You Now (2010)
Japandroids, Celebration Rock (2012)
Porter Robinson, Worlds (2014)
Ed Sheeran, x (2014)
Chris Stapleton, Traveller (2015)
Nipsey Hussle, Victory Lap (2018)
P!nk, The Truth About Love (2012)
Ozuna, Odisea (2017)
Miley Cyrus, Bangerz (2013)
Against Me!, Transgender Dysphoria Blues (2014)
Childish Gambino, "Awaken, My Love!" (2016)
Sky Ferreira, Night Time, My Time (2013)
Brandi Carlile, By the Way, I Forgive You (2018)
Jonas Brothers, Happiness Begins (2019)
J. Cole, 2014 Forest Hills Drive (2014)
Rihanna, Loud (2010)
Travis Scott, Astroworld (2018)
The 1975, I Like It When You Sleep, For You Are So Beautiful Yet So Unaware of It (2016)
Mumford & Songs, Sigh No More (2010)
Camila Cabello, Camila (2017)
Courtney Barnett, Sometimes I Sit and Think and Sometimes I Just Sit (2015)
Adele, 25 (2015)
Arctic Monkeys, AM (2013)
Lil Uzi Vert, Luv Is Rage 2 (2017)
Charli XCX, True Romance (2013)
Twen

In [119]:
billboard_rank = []
artist = []
album = []
year = []


tag_texts = [tag.text for tag in soup.find_all("strong")] + [
    "75. Charli XCX, True Romance (2013)",
    "59. Pusha T, Daytona (2018)",
    "50. Carly Rae Jepsen, E•MO•TION (2015)",
]

for text in tag_texts:
    billboard_rank.append(text.split(".")[0])
    artist.append(text.split(". ", 1)[1].split(",", 1)[0])
    album.append(text.split(". ", 1)[1].split(",", 1)[1].split("(")[0])
    year.append(text.split(". ", 1)[1].split(",", 1)[1].split("(")[1][:-1])


billboard_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=billboard_rank,
    year_list=year,
    url=url,
)

In [120]:
# Small tweaks
billboard_df.iloc[0, 3] = "2018"
billboard_df.iloc[44, 0] = "Tyler, the Creator"
billboard_df.iloc[44, 1] = "IGOR"

In [121]:
billboard_df

Unnamed: 0,artist,album,rank,year,reviewer_url
0,Lady Gaga & Bradley Cooper,A Star Is Born Soundtrack,100,2018,https://www.billboard.com/articles/news/list/8...
1,Lady Antebellum,Need You Now,99,2010,https://www.billboard.com/articles/news/list/8...
2,Japandroids,Celebration Rock,98,2012,https://www.billboard.com/articles/news/list/8...
3,Porter Robinson,Worlds,97,2014,https://www.billboard.com/articles/news/list/8...
4,Ed Sheeran,x,96,2014,https://www.billboard.com/articles/news/list/8...
5,Chris Stapleton,Traveller,95,2015,https://www.billboard.com/articles/news/list/8...
6,Nipsey Hussle,Victory Lap,94,2018,https://www.billboard.com/articles/news/list/8...
7,P!nk,The Truth About Love,93,2012,https://www.billboard.com/articles/news/list/8...
8,Ozuna,Odisea,92,2017,https://www.billboard.com/articles/news/list/8...
9,Miley Cyrus,Bangerz,91,2013,https://www.billboard.com/articles/news/list/8...


In [122]:
len(billboard_df)

100

## Stereogum

In [123]:
url = "https://www.stereogum.com/featured/best-albums-of-the-2010s-list/"

soup = get_soup(url)

In [124]:
def h2_if_nonempty(tag):
    if tag.contents:
        if "h2" in tag.name:
            return True
    else:
        return False

In [125]:
stereogum_rank = []
artist = []
album = []
label = []
year = []
for tag in soup.find_all(h2_if_nonempty):
    stereogum_rank.append(tag.contents[0].contents[0])
    artist.append(tag.contents[1])
    album.append(tag.contents[2].contents[0])
    label.append(tag.contents[4].contents[0].split(",")[0])
    year.append(tag.contents[4].contents[0].split(",")[1])

# Cleaning
artist = [x[1:-3] for x in artist]
label = [x[1:] for x in label]
year = [x[1:-1] for x in year]

stereogum_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=stereogum_rank,
    year_list=year,
    url=url,
)

In [126]:
stereogum_df.head()

Unnamed: 0,artist,album,rank,year,reviewer_url
0,Future,Pluto,100,2012,https://www.stereogum.com/featured/best-albums...
1,Car Seat Headrest,Teens Of Denial,99,2016,https://www.stereogum.com/featured/best-albums...
2,PUP,The Dream Is Over,98,2016,https://www.stereogum.com/featured/best-albums...
3,Courtney Barnett,"Sometimes I Sit And Think, And Sometimes I Jus...",97,2015,https://www.stereogum.com/featured/best-albums...
4,Colleen Green,I Want To Grow Up,96,2015,https://www.stereogum.com/featured/best-albums...


## Consequence of Sound 

In [127]:
url =  "https://consequenceofsound.net/2019/11/top-albums-of-the-2010s/full-post/"

soup = get_soup(url)

In [128]:
cos_rank = []
artist = []
album = []
year = []

for tag in soup.find_all('h2')[2:-1]:
    cos_rank.append(tag.contents[0].split('.')[0])
    artist.append(tag.contents[0].split('.', 1)[1][1:-3])
    album.append(tag.contents[1].contents[0])
    year.append(tag.contents[2][2:-1])

In [129]:
cos_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=cos_rank,
    year_list=year,
    url=url,
)

# fixes
cos_df.iloc[67, 3] = '2015'
cos_df.iloc[83, 3] = '2016'
cos_df.iloc[89, 3] = '2012'

In [130]:
cos_df.head()

Unnamed: 0,artist,album,rank,year,reviewer_url
0,PJ Harvey,Let England Shake,100,2011,https://consequenceofsound.net/2019/11/top-alb...
1,Savages,Silence Yourself,99,2013,https://consequenceofsound.net/2019/11/top-alb...
2,Destroyer,Kaputt,98,2011,https://consequenceofsound.net/2019/11/top-alb...
3,Ariana Grande,"thank you, next",97,2019,https://consequenceofsound.net/2019/11/top-alb...
4,Bon Iver,"22, A Million",96,2016,https://consequenceofsound.net/2019/11/top-alb...


## Time

In [131]:
url =  "https://time.com/5725768/best-albums-2010s-decade/"

soup = get_soup(url)

In [132]:
tag_list = [tag for tag in soup.find_all("strong")]
tag_list.remove(tag_list[-1])

artist = []
album = []
year = []

for tag in tag_list:
    artist.append(tag.text.split(',')[0].strip())
    album.append(tag.text.split(',')[1].split('(')[0].strip())
    year.append(tag.text.split(',')[1].split('(')[1].strip()[:-1])

In [133]:
time_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=10,
    year_list=year,
    url=url,
)

In [134]:
time_df

Unnamed: 0,artist,album,rank,year,reviewer_url
0,Fiona Apple,The Idler Wheel…,10,2012,https://time.com/5725768/best-albums-2010s-dec...
1,Miguel,Kaleidoscope Dream,10,2012,https://time.com/5725768/best-albums-2010s-dec...
2,Beauty Pill,Beauty Pill Describes Things As They Are,10,2015,https://time.com/5725768/best-albums-2010s-dec...
3,Carly Rae Jepsen,E•MO•TION,10,2015,https://time.com/5725768/best-albums-2010s-dec...
4,Beyoncé,Lemonade,10,2016,https://time.com/5725768/best-albums-2010s-dec...
5,Leonard Cohen,You Want It Darker,10,2016,https://time.com/5725768/best-albums-2010s-dec...
6,Miranda Lambert,The Weight of These Wings,10,2016,https://time.com/5725768/best-albums-2010s-dec...
7,Solange,A Seat At the Table,10,2016,https://time.com/5725768/best-albums-2010s-dec...
8,Kendrick Lamar,DAMN.,10,2017,https://time.com/5725768/best-albums-2010s-dec...
9,Ozuna,Aura,10,2018,https://time.com/5725768/best-albums-2010s-dec...


## Paste Magazine

In [135]:
urls = [
    "https://www.pastemagazine.com/articles/2019/10/best-albums-of-the-2010s-paste.html",
    "https://www.pastemagazine.com/articles/2019/10/best-albums-of-the-2010s-paste.html?p=2",
    "https://www.pastemagazine.com/articles/2019/10/best-albums-of-the-2010s-paste.html?p=3",
    "https://www.pastemagazine.com/articles/2019/10/best-albums-of-the-2010s-paste.html?p=4",
]

In [136]:
paste_rank = []
artist = []
album = []
year = []

tag_list = []
for url in urls:
    soup = get_soup(url)

    for tag in soup.find_all("b", class_="big"):
        tag_list.append(tag)

tag_list = tag_list[1:]

for tag in tag_list:
    paste_rank.append(tag.text.split(".", 1)[0])
    artist.append(tag.text.split(".", 1)[1].split(":", 1)[0][1:])
    album.append(tag.text.split(":", 1)[1][1:-7])
    year.append(tag.text[-5:-1])

paste_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=paste_rank,
    year_list=year,
    url=urls[0],
)

In [137]:
paste_df.head()

Unnamed: 0,artist,album,rank,year,reviewer_url
0,Drake,Take Care,100,2011,https://www.pastemagazine.com/articles/2019/10...
1,Japanese Breakfast,Soft Sounds from Another Planet,99,2017,https://www.pastemagazine.com/articles/2019/10...
2,Nick Cave & The Bad Seeds,Skeleton Tree,98,2016,https://www.pastemagazine.com/articles/2019/10...
3,Deafheaven,Sunbather,97,2013,https://www.pastemagazine.com/articles/2019/10...
4,"Tyler, The Creator",Flower Boy,96,2017,https://www.pastemagazine.com/articles/2019/10...


## Genius

In [138]:
url = "https://genius.com/a/the-genius-communitys-100-best-albums-of-the-2010s"

soup = get_soup(url)

In [139]:
genius_rank = []
artist = []
album = []

for tag in soup.find_all("div", class_="g_list-item-header"):
    genius_rank.append(tag.text.replace("\n", "")[10:].split(" ", 1)[0])
    artist.append(tag.text.replace("\n", "")[10:].split(" ", 1)[1][17:].split(" – ")[0])
    album.append(tag.text.replace("\n", "")[10:].split(" ", 1)[1][17:].split(" – ")[1])

year_list = [
    '2015'
]

In [140]:
genius_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=genius_rank,
    url=url,
)

In [141]:
genius_df.head()

Unnamed: 0,artist,album,rank,reviewer_url
0,Justin Bieber,Purpose,100,https://genius.com/a/the-genius-communitys-100...
1,Katy Perry,Teenage Dream,99,https://genius.com/a/the-genius-communitys-100...
2,Bon Iver,"Bon Iver, Bon Iver",98,https://genius.com/a/the-genius-communitys-100...
3,Billie Eilish,"WHEN WE ALL FALL ASLEEP, WHERE DO WE GO?",97,https://genius.com/a/the-genius-communitys-100...
4,Schoolboy Q,Oxymoron,96,https://genius.com/a/the-genius-communitys-100...


## Rolling Stone

In [142]:
urls = [
  "https://www.rollingstone.com/music/music-lists/best-albums-2010s-ranked-913997/",
    "https://www.rollingstone.com/music/music-lists/best-albums-2010s-ranked-913997/carly-rae-jepsen-emotion-album-917470/"
] 

soup_list = [get_soup(url) for url in urls]

In [143]:
rs_rank = []
album = []
artist = []

for soup in soup_list:
    for tag in soup.find_all("header", class_="c-list__header"):
        rs_rank.append(tag.span.text.strip())
        artist.append(tag.h3.text.strip().split(", ", 1)[0])
        if len(tag.h3.text.strip().split(", ", 1)) == 1:
            album.append('Hamilton')
        else:
            album.append(tag.h3.text.strip().split(", ", 1)[1][1:-1])

In [144]:
rs_df = combine_to_df(
    album_list=album,
    artist_list=artist,
    rank_list=genius_rank,
    url=urls[0],
)

In [145]:
rs_df.head()

Unnamed: 0,artist,album,rank,reviewer_url
0,The War on Drugs,A Deeper Understanding,100,https://www.rollingstone.com/music/music-lists...
1,Lana Del Rey,Ultraviolence,99,https://www.rollingstone.com/music/music-lists...
2,John Prine,The Tree of Forgiveness,98,https://www.rollingstone.com/music/music-lists...
3,Rich Gang,Tha Tour Part 1,97,https://www.rollingstone.com/music/music-lists...
4,Low Cut Connie,Call Me Sylvia,96,https://www.rollingstone.com/music/music-lists...


# Combining Data 

## Reformatting

In [146]:
def reformat(df_list):
    """
    Trims any external whitespaces on all strings and turns rank and year into integers.
    """
    for df in df_list:

        # trim any whitespace around the strings in each column
        for column in df.columns:
            if df[column].dtype == 'object':
                df[column] = df[column].apply(lambda x: x.strip())

        # Make years and ranks integers
        
#         if "year" in df.columns:
#             df["year"] = df["year"].astype('Int32')
            
        df["rank"] = df["rank"].astype(int)

    return

In [147]:
df_list = [
    pitchfork_df,
    billboard_df,
    stereogum_df,
    cos_df,
    time_df,
    paste_df,
    genius_df,
    rs_df
]

df_urls = [df['reviewer_url'].iloc[0] for df in df_list]

url_names = [
    "Pitchfork",
    "Billboard",
    "Stereogum",
    "Consequence of Sound",
    "Time",
    "Paste",
    "Genius",
    "Rolling Stone"
]

reviewer_dict = dict(zip(df_urls, url_names))

reformat(df_list)

In [148]:
albums = pd.concat(df_list, sort=False)

In [149]:
# albums['year'] = albums['year'].astype('Int64')
albums.reset_index()

Unnamed: 0,index,artist,album,rank,year,reviewer_url
0,0,Ratking,So It Goes,200,2014,https://pitchfork.com/features/lists-and-guide...
1,1,Wu Lyf,Go Tell Fire to the Mountain,199,2011,https://pitchfork.com/features/lists-and-guide...
2,2,Jean Grae / Quelle Chris,Everything’s Fine,198,2018,https://pitchfork.com/features/lists-and-guide...
3,3,Fatima Al Qadiri,Genre-Specific Xperience,197,2011,https://pitchfork.com/features/lists-and-guide...
4,4,Portal,Vexovoid,196,2013,https://pitchfork.com/features/lists-and-guide...
...,...,...,...,...,...,...
805,95,David Bowie,Blackstar,5,,https://www.rollingstone.com/music/music-lists...
806,96,Taylor Swift,Red,4,,https://www.rollingstone.com/music/music-lists...
807,97,Kendrick Lamar,To Pimp a Butterfly,3,,https://www.rollingstone.com/music/music-lists...
808,98,Beyoncé,Lemonade,2,,https://www.rollingstone.com/music/music-lists...


In [150]:
artist_list = albums.artist.unique()

In [151]:
len(artist_list)

332

## Checking lengths

In [152]:
for df in df_list:
    print(len(df))

200
100
100
100
10
100
100
100


## Fixing Album Names

Some albums appear with a different name. For example, David Bowie's _Blackstar_.

In [153]:
def get_albums_from_artist(artist, df=albums):
    return list(set(df.loc[df.artist == artist, "album"].to_list()))


def create_fuzz_matrix(album_list):
    """
    Creates a confusion matrix using fuzzy finding.
    """
    fuzz_mat = np.zeros((len(album_list), len(album_list)), dtype=np.int)

    for i in range(len(album_list)):
        for j in range(i + 1, len(album_list)):
            fuzz_mat[i, j] = fuzz.partial_ratio(album_list[i], album_list[j])
            fuzz_mat[j, i] = fuzz_mat[i, j]
        fuzz_mat[i, i] = 100

    return fuzz_mat


def get_oddly_formatted_albums(albums_df, artist_list=artist_list, similarity=80):
    oddly_formatted_artists = []

    for artist in artist_list:
        album_list = get_albums_from_artist(artist, df=albums_df)
        if len(album_list) == 1:
            # no need to make changes since the album appears once.
            continue
        else:
            # get upper diagonal
            fuzz_mat = np.triu(create_fuzz_matrix(album_list), 1)

            # check if there are very similar things on the off-diagonal.
            # this will indicate the same album appears with slightly
            # different spelling.
            if fuzz_mat[fuzz_mat > similarity].size > 0:
                oddly_formatted_artists.append(artist)

    return oddly_formatted_artists

In [154]:
for artist in get_oddly_formatted_albums(albums):
    print(get_albums_from_artist(artist))

['I Like It When You Sleep, For You Are So Beautiful Yet So Unaware of It', 'I like it when you sleep…', 'I Like It When You Sleep, For You Are So Beautiful Yet So Unaware Of It', 'A Brief Inquiry Into Online Relationships']
['Teens Of Denial', 'Teens of Denial', 'Twin Fantasy (Face to Face)']
['Thank U, Next', 'Sweetener', 'thank you, next', 'Thank U Next', 'Dangerous Woman']
['Modern Vampires of the City', 'Father Of The Bride', 'Modern Vampires Of The City', 'Contra', 'Father of the Bride']
['Bon Iver', '22, A Million', 'Bon Iver, Bon Iver']
['Sremmlife', 'SremmLife']
['When I Get Home', 'A Seat At the Table', 'A Seat at the Table', 'A Seat At The Table']
['Smoke Ring for My Halo', 'Smoke Ring For My Halo']
['Invasion Of Privacy', 'Invasion of Privacy']
['I Love You Honeybear', 'I Love You, Honeybear']
['If You’re Reading This, It’s Too Late', 'If You’re Reading This It’s Too Late', 'Take Care', 'Nothing Was The Same', 'Nothing Was the Same']
['Sometimes I Sit and Think and Sometime

In [155]:
# Ultimately, the simplest thing for these was to write them in by
# hand. I would love ideas for another solution.
oddly_formatted_artists = get_oddly_formatted_albums(albums)
oddly_formatted_artists.append("Billie Eilish")
oddly_formatted_artists.append("Daughters")
oddly_formatted_artists.append("Carly Rae Jepsen")

oddly_formatted_albums = [
    ["I Like it when you sleep...", "A Brief Inquiry Into Online Relationships"],
    ["Teens of Denial", "Twin Fantasy (Face to Face)"],
    ["thank u, next", "Sweetener", "Dangerous Woman"],
    ["Father of the Bride", "Contra", "Modern Vampires of the City"],
    ["22, A Million", "Bon Iver"],
    ["SremmLife"],
    ["When I Get Home", "A Seat at the Table"],
    ["Smoke Ring for My Halo"],
    ["Invasion of Privacy"],
    ["I Love You, Honeybear"],
    ["Nothing Was the Same", "If You're Reading This, It's Too Late", "Take Care"],
    ["Sometimes I Sit and Think, and Sometimes I Just Sit"],
    ["good kid, m.A.A.d city", "DAMN.", "To Pimp a Butterfly"],
    ["Shaking the Habitual"],
    ["A Crow Looked at Me"],
    ["We Got It From Here... Thank You 4 Your Service"],
    ["Blackstar"],
    ["El Mal Querer"],
    ["Summertime '06", "Big Fish Theory"],
    ["Burn Your Fire for No Witness", "My Woman", "All Mirrors"],
    ["Norman Fucking Rockwell!", "Ultraviolence", "Born to Die"],
    ["Have One On Me"],
    ["Channel ORANGE", "Blonde", "Nostalgia Ultra"],
    ["The Idler Wheel..."],
    ["By the Way, I Forgive You"],
    ["Soft Sounds from Another Planet"],
    ["Rich Gang: Tha Tour Pt. 1"],
    ["When We All Fall Asleep, Where Do We Go?"],
    ["You Won't Get What You Want"],
    ["E•MO•TION"]    
]

odd_format_dict = dict(zip(oddly_formatted_artists, oddly_formatted_albums))

In [156]:
# Fix the album titles 

# For the one David Bowie album
albums.loc[albums['album'] == '★', "album"] = "Blackstar"

# Use fuzzyfinding to extract one album name for the weirdly formatted album titles.
albums['new_album_name'] = albums.apply(
    lambda row: process.extract(row['album'], odd_format_dict[row['artist']], limit=1)[0][0] if row['artist'] in oddly_formatted_artists else row['album'],
    axis=1
)

## Fixing missing years

Some albums have an incorrectly listed year, while others might have a missing year altogether. The following blocks aims to search the rest of the dataframe for the most common year.

In [157]:
def most_common(lst):
    """ Taken from 
    https://stackoverflow.com/questions/1518522/find-the-most-common-element-in-a-list
    """
    if lst:
        return max(set(lst), key=lst.count)
    else:
        return np.nan
    
def get_year(album, album_df=albums):
    
    years = list(album_df.loc[album_df["new_album_name"] == album, "year"].dropna().unique())
    
    if len(years) == 1:
        return years[0]
    elif len(years) > 1:
        return most_common(years)
    else:
        return np.nan


In [158]:
albums['new_year'] = albums['year']

albums.loc[albums['year'].isna(), 'new_year'] = albums.loc[albums['year'].isna(), "new_album_name"].apply(get_year)

Fill the remaining by hand.

In [159]:
albums['new_year'].isna().sum()

65

In [160]:
albums.loc[albums['new_year'].isna(), :]

Unnamed: 0,artist,album,rank,year,reviewer_url,new_album_name,new_year
4,Schoolboy Q,Oxymoron,96,,https://genius.com/a/the-genius-communitys-100...,Oxymoron,
5,Ariana Grande,Dangerous Woman,95,,https://genius.com/a/the-genius-communitys-100...,Dangerous Woman,
6,Isaiah Rashad,Cilvia Demo,94,,https://genius.com/a/the-genius-communitys-100...,Cilvia Demo,
7,alt-j,An Awesome Wave,93,,https://genius.com/a/the-genius-communitys-100...,An Awesome Wave,
9,Logic,Under Pressure,91,,https://genius.com/a/the-genius-communitys-100...,Under Pressure,
11,Kali Uchis,Isolation,89,,https://genius.com/a/the-genius-communitys-100...,Isolation,
12,Big Sean,Dark Sky Paradise,88,,https://genius.com/a/the-genius-communitys-100...,Dark Sky Paradise,
16,St. Vincent,MASSEDUCTION,84,,https://genius.com/a/the-genius-communitys-100...,MASSEDUCTION,
17,Daughters,You Won’t Get What You Want,83,,https://genius.com/a/the-genius-communitys-100...,You Won't Get What You Want,
18,Migos,Culture,82,,https://genius.com/a/the-genius-communitys-100...,Culture,


In [161]:
custom_year_dict = {
    "Oxymoron" : '2014',
    'Dangerous Woman' : '2016',
    'Cilvia Demo' : '2014',
    'An Awesome Wave' : '2012',
    'Under Pressure' : '2014',
    'Isolation' : '2018',
    'Dark Sky Paradise' : '2015',
    'MASSEDUCTION' : '2017',
    "You Won't Get What You Want" : '2018',
    'Culture' : '2017',
    'You Never Walk Alone' : '2017',
    'Twin Fantasy (Face to Face)' : '2018',
    'Stoney' : '2016',
    'Tetsuo & Youth' : '2015',
    'MADE' : '2016',
    'KIDS SEE GHOSTS' : '2018',
    'Night Visions' : '2012',
    'Blank Face LP' : '2016',
    'Konnichiwa' : '2016',
    'Rich Gang: Tha Tour Pt. 1' : '2014',
    'SATURATION III' : '2017',
    'Watching Movies With The Sound Off' : '2013',
    'Die Lit' : '2018',
    'Man On the Moon II: The Legend of Mr. Rager' : '2010',
    'TA1300' : '2018',
    'Teflon Don' : '2010',
    '1999' : '2012',
    'Rodeo' : '2015',
    'Piñata' : '2014',
    'My Krazy Life' : '2014',
    'Doo-Wops & Hooligans' : '2010',
    'Atrocity Exhibition' : '2016',
    'The Black Messiah' : '2014',
    'Because The Internet' : '2013',
    'DAYTONA' : '2018',
    'Watch The Throne' : '2011',
    'ANTi' : '2016',
    'A Deeper Understanding' : '2017',
    'The Tree of Forgiveness' : '2018',
    'Call Me Sylvia' : '2012',
    'Monster' : '2014',
    'Need to Feel Your Love' : '2017',
    'Songs of Innocence' : '2014',
    'Psychedelic Pill' : '2012',
    'Hardwired … to Self-Destruct' : '2016',
    'In Color' : '2015',
    'Chief' : '2011',
    'Mirror Traffic' : '2011',
    'The Highwomen' : '2019',
    'Nostalgia Ultra' : '2011',
    'On the Line' : '2019',
    'Blunderbuss' : '2012',
    'X 100Pre' : '2018',
    'Egypt Station' : '2018',
    'Harry Styles' : '2017',
    'American Teen' : '2017',
    'Wrecking Ball' : '2012',
    'Hamilton' : '2015',
    'Tempest' : '2012', 
    'Platinum' : '2014',
    "A Sailor’s Guide to Earth" : '2016',
    'Dark Matter' : '2017',
    'So Beautiful or So What' : '2011',
    'Interstate Gospel' : '2018'
}

In [162]:
albums.loc[albums['new_year'].isna(), "new_year"] = albums.loc[albums['new_year'].isna(), "new_album_name"].apply(lambda x: custom_year_dict[x])

In [163]:
albums['new_year'].isna().sum()

0

## Checking Artists

To check that artist names don't appear in multiple places with slightly different strings, the fuzz matrix will give us the similarity scores for all pairs of artist strings. Scores closer to 100 mean very similar, while close to 0 means dissimilar. Large off-diagonal terms mean two distinct strings that are highly similar.

In [164]:
np.sum(np.triu(create_fuzz_matrix(artist_list), 1) > 80)

44

Looks like 44 artists with similar names.

In [165]:
np.sum(np.triu(create_fuzz_matrix(artist_list), 1) > 99)

19

These 19 are highly similar. 

In [166]:
i_list, j_list = np.where(np.triu(create_fuzz_matrix(artist_list), 1) == 100)
i_list = list(i_list)
j_list = list(j_list)

In [167]:
duplicate_artists = list(set([artist_list[i] for i in i_list + j_list]))

In [168]:
duplicate_artists

['Future',
 'Future Islands',
 'Young Thug',
 'Arcade Fire',
 'Lady Gaga',
 'JAY-Z / Kanye West',
 'Jay-Z and Kanye West',
 'Kanye West',
 'U.S. Girls',
 'Arca',
 'Young Thug, Birdman, & Rich Homie Quan',
 'D’Angelo',
 'D’Angelo & the Vanguard',
 "D'Angelo and the Vanguard",
 'Girls',
 'JAY-Z & Kanye West',
 'Lady Gaga & Bradley Cooper',
 'Jay-Z & Kanye West',
 'Jay-Z',
 'Kanye and Jay-Z',
 "D'Angelo",
 'JAY-Z',
 'D’Angelo and the Vanguard',
 'Low Cut Connie',
 'D’Angelo And The Vanguard',
 'Low']

In [169]:
duplicate_dict = {
    "JAY-Z": ["Jay-Z", "JAY-Z", "Jay-z", "JAY-z"],
    "D'Angelo and the Vanguard": [
        "D’Angelo And The Vanguard",
        "D'Angelo and the Vanguard",
        "D’Angelo and the Vanguard",
        "D’Angelo & the Vanguard",
        "D'Angelo",
        'D’Angelo',
    ],
    "JAY-Z & Kanye West": [
        "JAY-Z & Kanye West",
        "Jay-Z & Kanye West",
        "Jay-Z and Kanye West",
        "JAY-Z / Kanye West",
    ],
}

In [170]:
# Make all duplicates the same. 
for artist in duplicate_dict.keys():
    albums.loc[albums["artist"].isin(duplicate_dict[artist]), "artist"] = artist

## Fixing "Hamilton"

The soundtrack to "Hamilton" appeared a few times with different attributions to the artist and different stylings.

In [171]:
hamiltons = albums.album.apply(lambda x: "Hamilton" in x) 
albums.loc[hamiltons, :]

Unnamed: 0,artist,album,rank,year,reviewer_url,new_album_name,new_year
53,Various Artists,Hamilton Original Cast Album,44,2015.0,https://www.billboard.com/articles/news/list/8...,Hamilton Original Cast Album,2015
73,Lin-Manuel Miranda,Hamilton: Original Broadway Cast Recording,27,2015.0,https://www.pastemagazine.com/articles/2019/10...,Hamilton: Original Broadway Cast Recording,2015
55,‘Hamilton’ Original Broadway Cast Recording,Hamilton,45,,https://www.rollingstone.com/music/music-lists...,Hamilton,2015


In [172]:
albums.loc[hamiltons, "artist"] = "'Hamilton' Original Broadway Cast Recording"
albums.loc[hamiltons, "new_album_name"] = "Hamilton"
albums.loc[hamiltons, "new_year"] = '2015'

## Get reviewer

In [173]:
albums["reviewer"] = albums.reviewer_url.apply(lambda x: reviewer_dict[x])

## Final Cleanup

In [174]:
albums.head()

Unnamed: 0,artist,album,rank,year,reviewer_url,new_album_name,new_year,reviewer
0,Ratking,So It Goes,200,2014,https://pitchfork.com/features/lists-and-guide...,So It Goes,2014,Pitchfork
1,Wu Lyf,Go Tell Fire to the Mountain,199,2011,https://pitchfork.com/features/lists-and-guide...,Go Tell Fire to the Mountain,2011,Pitchfork
2,Jean Grae / Quelle Chris,Everything’s Fine,198,2018,https://pitchfork.com/features/lists-and-guide...,Everything’s Fine,2018,Pitchfork
3,Fatima Al Qadiri,Genre-Specific Xperience,197,2011,https://pitchfork.com/features/lists-and-guide...,Genre-Specific Xperience,2011,Pitchfork
4,Portal,Vexovoid,196,2013,https://pitchfork.com/features/lists-and-guide...,Vexovoid,2013,Pitchfork


In [175]:
albums['year'] = albums['new_year']
albums['album'] = albums['new_album_name']

albums.drop(['new_year', 'new_album_name'], axis=1, inplace=True)

In [176]:
albums.reset_index(drop=True)

Unnamed: 0,artist,album,rank,year,reviewer_url,reviewer
0,Ratking,So It Goes,200,2014,https://pitchfork.com/features/lists-and-guide...,Pitchfork
1,Wu Lyf,Go Tell Fire to the Mountain,199,2011,https://pitchfork.com/features/lists-and-guide...,Pitchfork
2,Jean Grae / Quelle Chris,Everything’s Fine,198,2018,https://pitchfork.com/features/lists-and-guide...,Pitchfork
3,Fatima Al Qadiri,Genre-Specific Xperience,197,2011,https://pitchfork.com/features/lists-and-guide...,Pitchfork
4,Portal,Vexovoid,196,2013,https://pitchfork.com/features/lists-and-guide...,Pitchfork
...,...,...,...,...,...,...
805,David Bowie,Blackstar,5,2016,https://www.rollingstone.com/music/music-lists...,Rolling Stone
806,Taylor Swift,Red,4,2012,https://www.rollingstone.com/music/music-lists...,Rolling Stone
807,Kendrick Lamar,To Pimp a Butterfly,3,2015,https://www.rollingstone.com/music/music-lists...,Rolling Stone
808,Beyoncé,Lemonade,2,2016,https://www.rollingstone.com/music/music-lists...,Rolling Stone


In [177]:
albums['year'] = albums['year'].astype(int)
albums['rank'] = albums['rank'].astype(int)

In [178]:
albums.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 810 entries, 0 to 99
Data columns (total 6 columns):
artist          810 non-null object
album           810 non-null object
rank            810 non-null int64
year            810 non-null int64
reviewer_url    810 non-null object
reviewer        810 non-null object
dtypes: int64(2), object(4)
memory usage: 44.3+ KB


In [181]:
albums.to_csv(os.path.join('data', 'AOTD.csv'), index=False)

Take a look at this :

https://www.kaggle.com/pieca111/music-artists-popularity

Hard to get it working right. Would be neat though.