In [10]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import warnings
import re
import datetime as dt

### Fetching Lyrics from Genius 

In [2]:
token = open('token.txt','r').readline()

In [3]:
def fetch_song_info(artist,title,token):
    
    genius_api_url = 'https://api.genius.com'
    headers = {'Authorization': 'Bearer ' + token}
    
    def requires_formatting(strng):
        requires = False
        if any(x in strng for x in string.punctuation) | (strng.find('Feat') > -1):
            requires = True            
        return requires
    
    def format_string(strng):
        #remove featuring artists
        strng = strng.split('Feat')[0].strip()
        #remove special characters
        translator = str.maketrans('','',string.punctuation)
        formatted = strng.translate(translator)
        formatted = formatted.lower().strip()

        return formatted
    
    def search_path(artist,title,search_hits,format_=False):
        song_path = ''
        
        for hit in search_hits:
            a = hit['result']['primary_artist']['name']
            if format_:
                a = format_string(artist)
            t = hit['result']['title'] 
            
            if  (a in artist) and (title.lower() in t.lower()):
                song_path = hit['result']['path']
                break
        
        return song_path
    
    def get_path(artist,title,headers,format_=False):
        search_url = genius_api_url + '/search'  
        if format_:
            artist = format_string(artist)
            
        query = {'q':title + ' ' + artist}
        response = requests.get(search_url,data=query,headers=headers)
        search_hits = response.json()['response']['hits']
        
        song_path = search_path(artist,title,search_hits,format_)
            
        return response,song_path
    
    response,song_path = get_path(artist,title,headers)
    if song_path == '':
#         print(f'No search result found for {artist},{title}')
        requires_formatting = requires_formatting(artist)
#         print(f'{artist} requires formatting: {requires_formatting}')
        response,song_path = get_path(artist,title,headers,format_=True)
    
    return song_path

In [4]:
def scrape_lyrics(path):
    genius_web = 'https://genius.com/'
    page = requests.get(genius_web+path).content
    soup = bs(page,'html.parser')
    lyrics_content = soup.find('div',class_='lyrics').findAll('a')
    lyrics = []
    
    for i in range(len(lyrics_content)):
        lyrics.append(lyrics_content[i].get_text())
    
    return lyrics

In [6]:
song_path = fetch_song_info('Radiohead','true love waits',token)

In [7]:
lyrics = scrape_lyrics(song_path)

In [8]:
for verse in lyrics:
    print(verse)

I’ll drown my beliefs
To have your babies
I’ll dress like your niece
And wash your swollen feet
Just don’t leave
Don’t leave
I’m not living
I’m just killing time
Your tiny hands
Your crazy kitten smile
Just don’t leave
Don’t leave
And true love waits
In haunted attics
And true love lives
On lollipops and crisps
Just don’t leave
Don’t leave


### Billboard Hot 100 Songs 

In [9]:
def get_hot_100(date):
    """Function to scrape the Billboard Hot 100 tracks
       
       Parameters:
       -----------
       date(string): date to fetch data for. should be of YYYY-MM-DD format
       
       Returns:
       --------
       pandas dataframe containing the artist names and track titles
    
    """
    
    url = 'https://www.billboard.com/charts/hot-100/' + date
    response = requests.get(url)
    hot_soup = bs(response.content,'html.parser')
    
    no1_artist = hot_soup.find('div',class_="chart-number-one__artist").text.split('\n')[2]
    no1_title = hot_soup.find('div',class_="chart-number-one__title").text
    
    artists = [item["data-artist"] for item in hot_soup.find_all('div',class_="chart-list-item") if "data-artist" in item.attrs]
    titles = [item["data-title"] for item in hot_soup.find_all('div',class_="chart-list-item") if "data-title" in item.attrs]
    
    artists = [no1_artist] + artists
    titles = [no1_title] + titles
    
    hot100 = pd.DataFrame({'Artist':artists,'Title':titles})
    
    return hot100

In [132]:
hot100 = get_hot_100('2018-08-25')

In [137]:
for i,song in hot100.iterrows():
    
    path = fetch_song_info(song['Artist'],song['Title'],token)
    if path !='':
        l = scrape_lyrics(path)
        lyrics.append(l)
    else:
        not_found.append((song['Artist'],song['Title']))

In [138]:
not_found

[('The Carters', 'Apes**t'),
 ('Casper Magico, Nio Garcia, Darell, Nicky Jam, Ozuna & Bad Bunny',
  'Te Bote'),
 ('Quavo', 'W O R K I N  M E'),
 ('Travis Scott', 'R.I.P Screw'),
 ('Luke Combs', 'She Got The Best Of Me ')]

### Hot Songs within a date range

In [54]:
#user inputs starting date and ending date as strings
start_input = '2017-08-25'
end_input = '2018-08-25'

#convert to datetime objects
start_date = dt.datetime.strptime(start_input,'%Y-%m-%d').date()
end_date = dt.datetime.strptime(end_input,'%Y-%m-%d').date()
#create time delta for a week
delta = dt.timedelta(days=7)
weeks = round((end_date - start_date)/delta)

dates = [str(start_date+x*delta) for x in range(weeks+1)]

In [82]:
def get_hot_100_range(start,end):
    """Function to scrape the Billboard Hot 100 tracks over a date range
       
       Parameters:
       -----------
       date(string): date to fetch data for. should be of YYYY-MM-DD format
       
       Returns:
       --------
       pandas dataframe containing the artist names and track titles
    
    """

    #convert to datetime objects
    start_date = dt.datetime.strptime(start,'%Y-%m-%d').date()
    end_date = dt.datetime.strptime(end,'%Y-%m-%d').date()
    #create time delta for a week
    delta = dt.timedelta(days=7)
    weeks = round((end_date - start_date)/delta)

    dates = [str(start_date+x*delta) for x in range(weeks)]
    
    data = []
    
    for date in dates:
#         print(date)
        url = 'https://www.billboard.com/charts/hot-100/' + date
        response = requests.get(url)
        hot_soup = bs(response.content,'html.parser')

        no1_artist = hot_soup.find('div',class_="chart-number-one__artist").text.split('\n')[2]
        no1_title = hot_soup.find('div',class_="chart-number-one__title").text

        artists = [item["data-artist"] for item in hot_soup.find_all('div',class_="chart-list-item") if "data-artist" in item.attrs]
        titles = [item["data-title"] for item in hot_soup.find_all('div',class_="chart-list-item") if "data-title" in item.attrs]

        artists = [no1_artist] + artists
        titles = [no1_title] + titles
        
        for a,t in zip(artists,titles):
            if [a,t] not in data:
                data.append([a,t])
#         print(f'Data added for the week of date: {date}')
        
    hot100 = pd.DataFrame(data,columns=['Artist','Title'])
    
    return hot100

In [83]:
hot100_2018 = get_hot_100_range('2018-01-01','2018-09-01')

In [85]:
hot100_2018.head()

Unnamed: 0,Artist,Title
0,Ed Sheeran,Perfect
1,Post Malone Featuring 21 Savage,Rockstar
2,Camila Cabello Featuring Young Thug,Havana
3,Lil Pump,Gucci Gang
4,G-Eazy Featuring A$AP Rocky & Cardi B,No Limit


In [87]:
hot100_2018['Artist'].value_counts().head()

Drake           23
Travis Scott    17
Post Malone     14
J. Cole         11
XXXTENTACION     9
Name: Artist, dtype: int64