In [69]:
import requests
import urllib
import html2text as ht
import re
import pandas as pd
import numpy as np
import json
import string
from bs4 import BeautifulSoup
from collections import Counter

## Scrape Billboard for song titles

In [460]:
class DateTracker:
    
    days = {1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30, 
            7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31}
    
    def __init__(self, year = None, month = None, day = None):
        self.year = year
        self.month = month
        self.day = day
        
    def previous_week(self):
        
        if (self.year == 1958) and (self.month == 8) and (0 < self.day <= 7):
            return
        
        # if move to previous month
        if self.day - 7 <= 0:
            
            if self.month != 1:
                
                self.month -= 1
                self.day += self.days[self.month] - 7
            
            # if move to previous year
            else:
                
                self.year -= 1
                self.month = 12
                self.day += self.days[self.month] - 7
                
        else:
            
            self.day -= 7
            
    def as_date(self):
        
        return "%04d-%02d-%02d" % (self.year, self.month, self.day)
        
        
class Song:
    
    def __init__(self, artist, name, rank, peakpos, weeksonchart, date):
        self.artist = artist
        self.name = name
        self.rank = rank
        self.peakpos = peakpos
        self.weeksonchart = weeksonchart
        self.date = date
        
    def get_artist(self):
        return self.artist
    
    def get_name(self):
        return self.name
    
    def get_rank(self):
        return self.rank
    
    def get_peak_pos(self):
        return self.peakpos
    
    def get_weeks_on_chart(self):
        return self.weeksonchart
    
    def get_date(self):
        return self.date
    
    def list_form(self):
        return [self.artist, self.name, self.rank, self.peakpos,
                self.weeksonchart, self.date]
    
    def summary(self):
        print("\n".join([str(x) for x in [self.artist, self.name,
                                          self.rank, self.peakpos,
                                          self.weeksonchart, self.date]]))

In [516]:
def clean_text(text):
    bad = {"\\\\n": "\n", 
           "\n+": "\n", 
           "\n": " - ",
           "\[\]\(.{0,}?\)": "",
           "\[Play\]\(.{0,}?\)": "",
           "\[\s{0,5}-\s{0,5}Song\s{0,5}-\s{0,5}Lyrics\s{0,5}-\s{0,5}\]\(.{0,}?\)": ""}

    for x in bad:
        text = re.sub(x, bad[x], text, flags = re.I | re.S)
        
    return text

def clean_line(text):
    repl = {"weeks at no. 1": "",
            "(-\s{1,4}!)+": "",
            "(?<![A-z])! -": "",
            "_": "",
            "\[\s-": "", 
            "-\s\]": "",
            "\[.{0,}?\]|\(.{0,}?\)": "",
            "\s+": " ",
            "(-\s{0,5})+": "- ",}    
    
    for x in repl:
        text = re.sub(x, repl[x], text, flags = re.I | re.S)
        #print(text)
    return text

In [517]:
def get_songs_by_week(weeks = 100):
    
    filler = [" and ", " featuring ", " & ", " x ", " / "]
    path = "https://www.billboard.com/charts/hot-100"
    curdate = DateTracker(year = 2018, month = 8, day = 11)
    songs = []

    for j in range(weeks + 1):
        
        page = requests.get(path + "/" + curdate.as_date())
        if j % 20 == 0:
            print("We are on week {}, it is {}".format(j, curdate.as_date()))

        data = ht.html2text(str(page.content))
        data = clean_text(data)
        data = re.split("date search |in performance - |on chart - ", data, flags = re.I)[1:]

        #first one
        weeksoc = int(re.search("[0-9]{0,1}\s-\sweeks", 
                                re.sub("weeks at no. 1", "", data[0], flags = re.I | re.S),
                                flags = re.I | re.S).group()[:2])
        tmp = clean_line(data[1]).strip().split("-")
        name = tmp[0].strip()
        artist = tmp[1].strip()
        for k in filler:
            artist = re.sub(k, ", ", artist, flags = re.I | re.S)

        songs.append(Song(artist = artist, name = name, rank = 1,
                          peakpos = 1, weeksonchart = weeksoc, 
                          date = curdate.as_date()))


        #everything else (2 - 100)
        for i in data[2:]:
            #we know we have a rank
            if re.match("^[0-9]{0,3}", i.strip()).group() != "":
                i = clean_line(i)
                tmp = i.split("-")
                rank = tmp[0].strip()
                name = tmp[1].strip()
                artist = tmp[2].strip()
                for k in filler:
                    artist = re.sub(k, ", ", artist, flags = re.I | re.S)
                    
                #if these are empty they're debuts!
                try:
                    peakpos = int(re.search("[0-9]{0,3}\s-\speak position", i, flags = re.I | re.S).group().split(" ")[0])
                except AttributeError:
                    peakpos = ""
                try:
                    weeksoc = int(re.search("[0-9]{0,3}\s-\sweeks", i, flags = re.I | re.S).group().split(" ")[0])
                except AttributeError:
                    weeksoc = ""

                songs.append(Song(artist = artist, name = name, rank = rank,
                                  peakpos = peakpos, weeksonchart = weeksoc,
                                  date = curdate.as_date()))

        curdate.previous_week()
        
    return songs, curdate

In [528]:
songs, curdate = get_songs_by_week(weeks = 200)

We are on week 0, it is 2018-08-11
We are on week 20, it is 2018-03-24
We are on week 40, it is 2017-11-04
We are on week 60, it is 2017-06-17
We are on week 80, it is 2017-01-28
We are on week 100, it is 2016-09-10
We are on week 120, it is 2016-04-23
We are on week 140, it is 2015-12-04
We are on week 160, it is 2015-07-17
We are on week 180, it is 2015-02-27
We are on week 200, it is 2014-10-10


In [529]:
prep = np.array([x.list_form() for x in songs])
df = pd.DataFrame({
    "Artists": prep[:, 0],
    "Name": prep[:, 1],
    "Weekly rank": prep[:, 2],
    "Peak position": prep[:, 3],
    "Weeks on chart": prep[:, 4],
    "Week": prep[:, 5]
})

In [533]:
df.to_csv("Weekly_ranks.csv", header = True, index = False)

In [535]:
df.head()

Unnamed: 0,Artists,Name,Weekly rank,Peak position,Weeks on chart,Week
0,Drake,In My Feelings,1,1.0,5.0,2018-08-11
1,"6ix9ine, Nicki Minaj, Murda Beatz",FEFE,3,3.0,2.0,2018-08-11
2,"Maroon 5, Cardi B",Girls Like You,4,3.0,10.0,2018-08-11
3,"DJ Khaled, Justin Bieber, Chance The Rapper, Q...",No Brainer,5,,,2018-08-11
4,Post Malone,Better Now,6,5.0,14.0,2018-08-11


## Scrape Genius for lyrics, genre, etc.


Song name

Artist/Group

Weekly rank 

Peak rank

Year published

Genre

Writing Credits

Lyrics



In [None]:
def get_lyric_link(artists, name, both = False):
    
    client_token = "BrGsH3KoiMzSyCUClF4-TyzjrNfQfrr2-Q9bfK6Bhum1fquRgVf0rn-Pq6mr9Uyc"
    headers = {"Authorization": "Bearer " + client_token}
    link = "https://api.genius.com/search?q="
    space = "%20"
    name_repl = {"Beyonce": "Beyoncé", 
                 "Amine": "Aminé",
                 "D.R.A.M.": "DRAM",
                 "$ign": "\$ign"}

    artists = re.sub(",", "", artists)
    name = re.sub(",", "", name)
    if both:
        page = requests.get(link + re.sub(" ", space, name) +
                            space + re.sub(" ", space, artists), headers = headers)
    else:
        page = requests.get(link + re.sub(" ", space, name), headers = headers)
    page = json.loads(page.content)["response"]["hits"] #data
    check = [re.sub(",", "", x) for x in artists.split(" ") + name.split(" ") if x not in string.punctuation]
    for i in name_repl:
        if i in check:
            check.remove(i)
            check.append(name_repl[i])
    top = []
    #print(page, check)
    
    #print(link + re.sub(" ", space, name) +
    #                    space + re.sub(" ", space, artists), check, page)
    
    
    if len(page) == 1:
        return page[0]["result"]["path"]
    else:
        for i in range(len(page)):
            c = 0
            for j in check:
                if re.search(j, page[i]["result"]["full_title"], flags = re.I | re.S) != None:
                    c += 1
                    #print(c)
                    #print(page[i]["result"]["full_title"])
            #print(c, len(check))
            if c == len(check):
                return page[i]["result"]["path"]
            #for j in 
            #try:
            #    top.append(page[i]["result"]["stats"]["pageviews"])
            #except KeyError:
            #    top.append(-1)
        
    if both == False:
        return get_lyric_link(artists, name, True)
    
    top = []
    #if here, find most popular song
    for j in range(len(page)):
        try:
            top.append(page[i]["result"]["stats"]["pageviews"])
        except KeyError:
            top.append(-1)
    print(top)
    return page[np.argmax(top)]["result"]["path"]

    #
        
    #return None#page[np.argmax(top)]["result"]["path"]