## Billboard top charts

Get the best selling albums of a certain week from the [billboard 200](https://www.billboard.com/charts/billboard-200) page. Create a pandas dataframe to store the information.

* title (the album name)
* artist
* rank


In [4]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

from typing import Iterable

def get_billboard_top_albums_dataframe(date: str='2001-06-02', count: int=5) -> pd.DataFrame:
    
    url = 'https://www.billboard.com/charts/billboard-200/' + date
    html_file = requests.get(url).text
    soup = BeautifulSoup(html_file, 'lxml')
    billboard_DF = pd.DataFrame(columns = ["Rank", "Artist", "Title"])
    

    first_place = soup.find('div', class_='container container--no-background chart-number-one')
    first_place_details = first_place.find('div', class_='chart-number-one__details')
    first_place_artist = first_place_details.find('div', class_='chart-number-one__artist').text.replace('\n','')
    first_place_title = first_place_details.find('div', class_='chart-number-one__title').text.replace('\n','')
    billboard_DF = billboard_DF.append({"Rank":'1', "Artist": first_place_artist, "Title":first_place_title}, ignore_index = True)

    chart_item_div_tag = soup.find_all('div', class_= 'chart-list-item')
    for tag in chart_item_div_tag:
        rank = tag['data-rank']
        artist = tag['data-artist']
        title = tag['data-title']
        billboard_DF = billboard_DF.append({"Rank": rank, "Artist": artist, "Title": title}, ignore_index = True)

    billboard_DF = billboard_DF.iloc[0:count,:]
    return billboard_DF

top_5_albums = get_billboard_top_albums_dataframe(count=5, date='2018-06-02')
top_5_albums


Unnamed: 0,Rank,Artist,Title
0,1,BTS,Love Yourself: Tear
1,2,Post Malone,beerbongs & bentleys
2,3,Lil Baby,Harder Than Ever
3,4,Five Finger Death Punch,And Justice For None
4,5,Cardi B,Invasion Of Privacy


## Music brainz API
Code that connects to the music brainz music database API and searches for the information about the given album. 

In [5]:
import itertools
import unicodedata
import re

_remove_accents = lambda input_str: ''.join((c for c in unicodedata.normalize('NFKD', input_str) if not unicodedata.combining(c)))
_clean_string = lambda s: set(re.sub(r'[^\w\s]', '', _remove_accents(s)).lower().split())
_jaccard = lambda set1, set2: float(len(set1 & set2)) / float(len(set1 | set2))


def search(entity_type:str, query: str):
    return requests.get(
        'http://musicbrainz.org/ws/2/{entity}/'.format(entity=entity_type),
        params={
            'fmt': 'json',
            'query': query
        }
    ).json()


def get_release_url(artist: str, title: str):
    type_ = 'release'
    search_results = search(type_, '%s AND artist:%s' % (title, artist))
    
    artist = _clean_string(artist)
    title = _clean_string(title)

#     print("title = " + str(title) +' artist=' + str(artist))
    for item in search_results.get(type_+'s', []):
        names = list()
        for artists in item['artist-credit']:
            if 'artist' in artists:
                names.append(_clean_string(artists['artist']['name']))
                for alias in artists['artist'].get('aliases', {}):
                    names.append(_clean_string(alias.get('name', '')))
#         print('  title=' + str(_clean_string(item['title'])) + ' names=' + ', '.join(itertools.chain(*names)))

        if _jaccard(_clean_string(item['title']), title) > 0.5 and \
            (any(_jaccard(artist, name) > 0.3 for name in names) or len(names) == 0):
            return 'http://musicbrainz.org/ws/2/{type}/{id}/'.format(id=item['id'], type=type_)
    
    return None


In [6]:
# Your solution goes here

import urllib.request
import json

#top_5_albums = get_billboard_top_albums_dataframe(count=5, date='2018-06-02')
artist_title_pair = zip(top_5_albums['Artist'], top_5_albums['Title'])

inc_label = '?inc=artist-credits+labels+discids+recordings&fmt=json'
albumInfo_DF = pd.DataFrame(columns = ["Artist", "Title", "Disc Count", "Track Count"])

for pair in artist_title_pair:
    
    url = get_release_url(pair[0], pair[1])
    if (url is not None):
        useful_url = url[:-1] + inc_label
        print(useful_url)
        
        with urllib.request.urlopen(useful_url) as response:
            source = response.read()
    
        data = json.loads(source)
    
        disc_count = len(data['media'])
        track_count = 0
        disc_counter = 0
        while (disc_counter < disc_count):
            track_count += len(data['media'][disc_counter]['tracks'])
            disc_counter += 1
    
        albumInfo_DF = albumInfo_DF.append({"Artist":pair[0],"Title":pair[1],"Disc Count":disc_count,"Track Count":track_count}, ignore_index=True)
    
    else: 
        albumInfo_DF = albumInfo_DF.append({"Artist":pair[0],"Title":pair[1],"Disc Count":np.NaN,"Track Count":np.NaN}, ignore_index=True) 
        
albumInfo_DF

# Any NaN values in Disc Count and Track Count indicate the information was not available on Music Brainz API


http://musicbrainz.org/ws/2/release/7e3e6ec2-2194-4c74-a417-7b0b3ffb354a?inc=artist-credits+labels+discids+recordings&fmt=json
http://musicbrainz.org/ws/2/release/061e483c-7d6c-4864-ab94-546207952040?inc=artist-credits+labels+discids+recordings&fmt=json
http://musicbrainz.org/ws/2/release/0952fb25-b541-417a-948a-e821e2d51ad8?inc=artist-credits+labels+discids+recordings&fmt=json
http://musicbrainz.org/ws/2/release/76086324-51a9-4bf8-bf6f-93c5795839ec?inc=artist-credits+labels+discids+recordings&fmt=json


Unnamed: 0,Artist,Title,Disc Count,Track Count
0,BTS,Love Yourself: Tear,,
1,Post Malone,beerbongs & bentleys,1.0,18.0
2,Lil Baby,Harder Than Ever,1.0,17.0
3,Five Finger Death Punch,And Justice For None,1.0,16.0
4,Cardi B,Invasion Of Privacy,1.0,13.0
