In [64]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import secrets
import contractions
import re
import seaborn as sns
import matplotlib.pyplot as plt
import lyricsgenius
from datetime import datetime
import spacy
import importlib

In [64]:
genius = lyricsgenius.Genius(secrets.GENIUS_ACCESS_TOKEN)

In [None]:
def get_billboard_from_year(start_year, end_year):
    years = np.arange(start_year, end_year + 1).astype(int)
    top_songs = pd.DataFrame()

    for i in range(0, len(years)):
        year = years[i]
        print("Collecting songs from %i...", year)
        url = "https://www.billboard.com/charts/year-end/"+ str(year) +"/hot-100-songs"
        
        billboard_page = requests.get(url)
        soup = BeautifulSoup(billboard_page.text, "html.parser")

        titles = [div.text for div in soup.find_all("div", "ye-chart-item__title")]
        ranks = [div.text for div in soup.find_all("div", "ye-chart-item__rank")]
        artists = [div.text for div in soup.find_all("div", "ye-chart-item__artist")]

        for i in range(0, len(ranks)):
            row = {
                "Rank": ranks[i].replace("\n", "").strip(),
                "Song Title": titles[i].replace("\n", "").strip(),
                "Artist": artists[i].replace("\n", "").strip(),
                "Year": int(year)
            }
            top_songs = top_songs.append(row, ignore_index=True)
    return top_songs

In [None]:
all_songs = get_billboard_from_year(2019, 2019)

In [None]:
display(all_songs.head(5))
display(all_songs.tail(5))
display(all_songs.shape)

In [75]:
def expand_contractions(text, contraction_mapping=contractions.CONTRACTION_MAP):
    print("keys: " + str('|'.join(contraction_mapping.keys())))
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    print("cont_patt: " + str(contractions_pattern))
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text


In [None]:
starttime = datetime.now()
print("Starting: " + str(starttime))
all_song_data = pd.DataFrame()
for i in range(0, len(all_songs)):
    song = all_songs.iloc[i]

    try:
        song_data = genius.search_song(song['Song Title'], song["Artist"])
#         print(dir(song_data))
        if(song_data):
            song_album = song_data.album
            featured_artists = song_data.featured_artists
            song_lyrics = song_data.lyrics.replace("\n", " ")
            song_media = song_data.media
            song_url = song_data.url
            song_writer_artists = song_data.writer_artists
            song_producer_artists = song_data.producer_artists if song_data.producer_artists else ""
            song_album_url = song_data.album_url
            song_release_year = song_data.year
    except:
        song_album = "null"
        song_album_url = "null"
        featured_artists = "null"
        song_lyrics = "null"
        song_media = "null"
        song_url = "null"
        song_writer_artists = "null"
        song_release_year = "null"
        song_producer_artists = song_data.producer_artists
        song_album_url = song_data.album_url
        song_release_year = song_data.year
        
    row = {
        "Year": song['Year'],
        "Charting Rank": song['Rank'],
        "Song Title": song['Song Title'],
        "Artist": song['Artist'],
        "Album": song_album ,
        "Producers": song_producer_artists,
        "Writers": song_writer_artists,
        "Album URL": song_album_url,
        "Featured Artists": featured_artists,
        "Lyrics":  song_lyrics,
        "URL": song_url,
        "Media": song_media,
        "Release Year": song_release_year
    }
    all_song_data = all_song_data.append(row, ignore_index=True)

endtime = datetime.now()
print("Finished Job: " + str(endtime))
print("Elapsed Time: " + str(endtime - starttime))


In [None]:
# all_song_data.to_pickle(r"stored_song_data.pickle")

In [None]:
loaded_song_data = pd.read_pickle("stored_song_data.pickle")

In [None]:
expand_contractions(all_song_data['Lyrics'])

In [None]:
display(loaded_song_data.head(5))
display(loaded_song_data.shape)

In [None]:
def plot_count_by_year(dataset):
    ['Year'], ['Rank']
    grouped_dataset = dataset.groupby('Year')
    sns.barplot(y=grouped_dataset['Charting Rank'].count(), x=grouped_dataset.count().index)
    plt.title("Number of songs with Genius data by year")
    plt.ylabel("Number of Songs")

In [None]:
plot_count_by_year(loaded_song_data)

In [9]:
nlp = spacy.load("en_core_web_sm")

def add_spacy_data(dataset):
    
    verbs = []
    nouns = []
    adverbs = []
    corpus = []
#     entities = []
    for i in range(0, len(dataset)):
        print("SpaCy now processing {} by {}".format(dataset.iloc[i]['Song Title'], dataset.iloc[i]['Artist']))
        song = re.sub(r'\n', '', dataset.iloc[i]["Lyrics"])
        doc = nlp(song)
        spacy_df = pd.DataFrame()
        for token in doc:
            row = {
                "word": token.text,
                "lemma": token.lemma_ if token.lemma_ != "-PRON-" else token.text ,
                "pos": token.pos_,
                "stop word": token.is_stop,
            }
            spacy_df = spacy_df.append(row, ignore_index=True)
        verbs.append(" ".join(spacy_df['lemma'][spacy_df["pos"] == "VERB"].values))
        nouns.append(" ".join(spacy_df['lemma'][spacy_df["pos"] == "NOUN"].values))
        adverbs.append(" ".join(spacy_df['lemma'][spacy_df["pos"] == "ADV"].values))
        corpus1 = " ".join(spacy_df['lemma'][spacy_df["stop word"] == False].values)
        corpus1 = re.sub(r'[^A-Za-z0-9]+', ' ', corpus1)
        corpus.append(corpus1)
#         entities = entities.append[doc.ents]
    dataset['Verbs'] = verbs
    dataset['Nouns'] = nouns
    dataset['Adverbs'] = adverbs
    dataset['Corpus'] = corpus
#     dataset['Entities'] = entities
    return dataset

In [None]:
add_spacy_data(loaded_song_data)

In [None]:
loaded_song_data.to_pickle(r"stored_song_data_with_spacy.pickle")

In [5]:
loaded_song_data_with_spacy =  pd.read_pickle("stored_song_data_with_spacy.pickle")

In [6]:
display(loaded_song_data_with_spacy.head())


Unnamed: 0,Album,Album URL,Artist,Charting Rank,Featured Artists,Lyrics,Media,Producers,Release Year,Song Title,URL,Writers,Year,Verbs,Nouns,Adverbs,Corpus
0,÷ (Divide),https://genius.com/albums/Ed-sheeran/Divide,Ed Sheeran,1,[],[Verse 1]\nThe club isn't the best place to fi...,"[{'provider': 'youtube', 'start': 0, 'type': '...","[{'api_path': '/artists/12418', 'header_image_...",2017-01-06,Shape Of You,https://genius.com/Ed-sheeran-shape-of-you-lyrics,"[{'api_path': '/artists/7393', 'header_image_u...",2017.0,find gome do talk come start trust will give n...,verse club place loverso bar friend table shot...,where fast then just then now now too now now ...,verse 1 The club good place find loverso bar ...
1,Google Translate Sings,https://genius.com/albums/Malinda/Google-trans...,Luis Fonsi & Daddy Yankee Featuring Justin Bieber,2,[],"[Intro]\nCome at me\nMany priests, the good th...","[{'provider': 'youtube', 'start': 0, 'type': '...",,2017-07-19,Despacito,https://genius.com/Malinda-google-translate-si...,"[{'api_path': '/artists/1460051', 'header_imag...",2017.0,make feel want put seem feel can spend make co...,priest thing condition dayHe night equality lo...,slowly slowly how how only very how soonsimply...,Intro Come memany priest good thing therechan...
2,24K Magic,https://genius.com/albums/Bruno-mars/24k-magic,Bruno Mars,3,[],"[Verse 1]\nHey, hey, hey\nI got a condo in Man...",[{'native_uri': 'spotify:track:0KKkJNfGyhkQ5aF...,"[{'api_path': '/artists/1012903', 'header_imag...",2016-01-30,That's What I Like,https://genius.com/Bruno-mars-thats-what-i-lik...,"[{'api_path': '/artists/1035208', 'header_imag...",2017.0,get pop pop drop drop rent -ami)wake serve get...,condo girl ass gon playerdrop beach house jamm...,around just alland so never never just alland ...,Verse 1 hey hey heyI get condo ManhattanBaby ...
3,DAMN.,https://genius.com/albums/Kendrick-lamar/Damn,Kendrick Lamar,4,[],[Intro]\nNobody pray for me\nIt's been that da...,"[{'provider': 'youtube', 'start': 6, 'type': '...","[{'api_path': '/artists/627151', 'header_image...",2017-03-30,Humble.,https://genius.com/Kendrick-lamar-humble-lyrics,"[{'api_path': '/artists/93851', 'header_image_...",2017.0,pray remember allowancesfinesse be live aid ca...,day meway syrup sandwich crime nigga counterfe...,now where way too then just still still just u...,intro nobody pray meIt day meway yeah yeah Ve...
4,Memories...Do Not Open,https://genius.com/albums/The-chainsmokers/Mem...,The Chainsmokers & Coldplay,5,[],[Verse 1: Chris Martin]\nI've been reading boo...,"[{'provider': 'youtube', 'start': 0, 'type': '...","[{'api_path': '/artists/1030536', 'header_imag...",2017-02-22,Something Just Like This,https://genius.com/The-chainsmokers-and-coldpl...,"[{'api_path': '/artists/150934', 'header_image...",2017.0,read see say look can turn can kiss[bridge wan...,verse book legend mythsachille goldhercule gif...,clearly where much just just just just where m...,verse 1 Chris martin i ve read book oldthe le...


In [71]:
importlib.reload(contractions)

<module 'contractions' from 'C:\\Users\\bhavi\\Documents\\Coding\\thrutheyears\\contractions.py'>

In [72]:
def expand_contractions(text, contraction_mapping=contractions.CONTRACTION_MAP):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
#     contractions_pattern = re.compile('|'.join(contraction_mapping.keys()), 
#                                       flags=re.IGNORECASE|re.DOTALL)
    print(contractions_pattern.pattern)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())  
        if first_char != "'":
            expanded_contraction = first_char+expanded_contraction[1:]
#         print(match + ": "+ expanded_contraction)
        return expanded_contraction
        
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [None]:
def remove_adlibs(text):
    

In [73]:
# testing expanding contractions.. will have to take place in cleaning lyrics after getting from genius api
ti =3 #3 = humble 5 = bad and boujee
song = loaded_song_data_with_spacy.iloc[ti]
# lyrics = re.sub(r'\n', ' ', song['Lyrics'])
lyrics = song['Lyrics']
lyrics_contr = expand_contractions(lyrics)
corpus = song['Corpus']
doc = nlp(lyrics_contr)
spacy_df = pd.DataFrame()
ents = [(i, i.label_, i.label) for i in doc.ents]
display(lyrics_contr)
display(lyrics)
# print(lyrics_hol)
# display(find_all_ing_contractions(lyrics_hol))
# print("{} by {}: {}\n".format(song['Song Title'], song['Artist'], ents))

(ain't|aren't|can't|can't've|'cause|could've|couldn't|couldn't've|didn't|doesn't|don't|hadn't|hadn't've|hasn't|haven't|he'd|he'd've|he'll|he'll've|he's|how'd|how'd'y|how'll|how's|I'd|I'd've|I'll|I'll've|I'm|I've|i'd|i'd've|i'll|i'll've|i'm|i've|isn't|it'd|it'd've|it'll|it'll've|it's|let's|ma'am|mayn't|might've|mightn't|mightn't've|must've|mustn't|mustn't've|needn't|needn't've|o'clock|oughtn't|oughtn't've|shan't|sha'n't|shan't've|she'd|she'd've|she'll|she'll've|she's|should've|shouldn't|shouldn't've|so've|so's|that'd|that'd've|that's|there'd|there'd've|there's|they'd|they'd've|they'll|they'll've|they're|they've|to've|wasn't|we'd|we'd've|we'll|we'll've|we're|we've|weren't|what'll|what'll've|what're|what's|what've|when's|when've|where'd|where's|where've|who'll|who'll've|who's|who've|why's|why've|will've|won't|won't've|would've|wouldn't|wouldn't've|y'all|y'all'd|y'all'd've|y'all're|y'all've|you'd|you'd've|you'll|you'll've|you're|you've|cali|cali'|smokin'|'em |gonna|gon'|nawf|nothin'|cookin

'[Intro]\nNobody pray for me\nIt is been that day for me\nWay (Yeah, yeah)\n\n[Verse 1]\nAyy, I remember syrup sandwiches and crime allowances\nFinesse a nigga with some counterfeits, but now I’m counting this\nParmesan where my accountant lives, in fact I am downin’ this\nDUSSÉ with my boo bae tastes like Kool-Aid for the analysts\nGirl, I can buy your ass the world with my paystub\nOoh, that pussy good, will not you sit it on my taste bloods?\nI get way too petty once you let me do the extras\nPull up on your block, then break it down: we playing Tetris\nAM to the PM, PM to the AM, funk\nPiss out your per diem, you just gotta hate em, funk\nIf I quit your BM, I still ride Mercedes, funk\nIf I quit this season, I still be the greatest, funk\nMy left stroke just went viral\nRight stroke put little baby in a spiral\nSoprano C, we like to keep it on a high note\nIt is levels to it, you and I know\n\n[Chorus]\nBitch, be humble (Hold up, bitch)\nSit down (Hol’ up, lil’, hold up, lil’ bitch

"[Intro]\nNobody pray for me\nIt's been that day for me\nWay (Yeah, yeah)\n\n[Verse 1]\nAyy, I remember syrup sandwiches and crime allowances\nFinesse a nigga with some counterfeits, but now I’m countin' this\nParmesan where my accountant lives, in fact I'm downin’ this\nD'USSÉ with my boo bae tastes like Kool-Aid for the analysts\nGirl, I can buy your ass the world with my paystub\nOoh, that pussy good, won't you sit it on my taste bloods?\nI get way too petty once you let me do the extras\nPull up on your block, then break it down: we playin' Tetris\nAM to the PM, PM to the AM, funk\nPiss out your per diem, you just gotta hate 'em, funk\nIf I quit your BM, I still ride Mercedes, funk\nIf I quit this season, I still be the greatest, funk\nMy left stroke just went viral\nRight stroke put lil' baby in a spiral\nSoprano C, we like to keep it on a high note\nIt's levels to it, you and I know\n\n[Chorus]\nBitch, be humble (Hol' up, bitch)\nSit down (Hol’ up, lil’, hol' up, lil’ bitch)\nBe 

In [None]:
t = nlp("Sandman")
ents = [(i, i.label_, i.label) for i in t.ents]
print(ents)