# Lyrics Analysis Attempt (Genius)

In [1]:
import lyricsgenius as genius
import pandas as pd
import string 

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('wordnet')      


def search_data(query,n,access_token):
    """
    This function uses the library lyricsgenius to extract the fields
    title, artist, album, date and lyrics and stores them into a pandas dataframe
    parameters:
    query = artist or band to search
    n = max numbers of songs
    access_token = your access token of the genius api
    """
    
    api = genius.Genius(access_token)

    list_lyrics = []
    list_title = []
    list_artist = []
    list_album = []
    list_year = []

    artist = api.search_artist(query,max_songs=n,sort='popularity')
    songs = artist.songs
    for song in songs:
        list_lyrics.append(song.lyrics)
        list_title.append(song.title)
        list_artist.append(song.artist)
        list_album.append(song.album)
        list_year.append(song.year)

    df = pd.DataFrame({'artist':list_artist,'title':list_title,'album':list_album,
                        'date':list_year,'lyric':list_lyrics})
    
    return df

def clean_lyrics(df,column):
    """
    This function cleans the words without importance and fix the format of the  dataframe's column lyrics 
    parameters:
    df = dataframe
    column = name of the column to clean
    """
    df = df
    df[column] = df[column].str.lower()
    df[column] = df[column].str.replace(r"verse |[1|2|3]|chorus|bridge|outro","").str.replace("[","").str.replace("]","")
    df[column] = df[column].str.lower().str.replace(r"instrumental|intro|guitar|solo","")
    df[column] = df[column].str.replace("\n"," ").str.replace(r"[^\w\d'\s]+","").str.replace("efil ym fo flah","")
    df[column] = df[column].str.strip()

    return df

def lyrics_to_words(document):
    """
    This function splits the text of lyrics to  single words, removing stopwords and doing the lemmatization to each word
    parameters:
    document: text to split to single words
    """
    stop_words = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    stopwordremoval = " ".join([i for i in document.lower().split() if i not in stop_words])
    punctuationremoval = ''.join(ch for ch in stopwordremoval if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punctuationremoval.split())
    return normalized

def create_decades(df):
    """
    This function creates a new column called decades used to group the songs and lyrics by decade based on the date released 
    for each song
    parameters:
    df = dataframe
    """
    years = []
    decades = []
    df['date'].fillna(0)
    df['date'] = df['date'].astype("str")
    for i in df.index:
        years.append(df['date'].str.split("-")[i][0])
    df['year'] = years
    df['year'] = df['year'].astype("int")

    for year in df['year']:
        if 1970 <= year < 1980:
            decades.append("70s")
        if 1980 <= year < 1990:
            decades.append("80s")
        if 1990 <= year < 2000:
            decades.append("90s")
        if 2000 <= year < 2010:
            decades.append("00s")
        if 2010 <= year :
            decades.append("10s")
    df['decade'] = decades
    df = df[['artist','title','album','decade','year','date','lyric']]
    return df

In [2]:
token = "1VHE7q4FnW4XcMobQtv2RLtN-hc2ZgWMXm0zOTsIHptb9dOeNzddpuFRm4kT_CcZ"

In [3]:
df0 = search_data('Metallica',5,token)

Searching for songs by Metallica...

Song 1: "Nothing Else Matters"
Song 2: "One"
Song 3: "Enter Sandman"
Song 4: "Master of Puppets"
Song 5: "The Unforgiven"

Reached user-specified song limit (5).
Done. Found 5 songs.


In [4]:
df0.head()

Unnamed: 0,artist,title,album,date,lyric
0,Metallica,Nothing Else Matters,Metallica,1991-08-12,"[Guitar Intro]\n\n[Verse 1]\nSo close, no matt..."
1,Metallica,One,...And Justice for All,1989-08-25,[Instrumental Intro]\n\n[Verse 1]\nI can't rem...
2,Metallica,Enter Sandman,Metallica,1991-08-12,[Instrumental Intro]\n\n[Verse 1]\nSay your pr...
3,Metallica,Master of Puppets,Master of Puppets,1986-03-03,"[Verse 1]\nEnd of passion play, crumbling away..."
4,Metallica,The Unforgiven,Metallica,1991-10-28,"[Verse 1]\nNew blood joins this Earth, and qui..."


In [6]:
genius = genius.Genius(token, remove_section_headers=True,
                 skip_non_songs=True, excluded_terms=["Remix", "Live", "Edit", "Mix", "Club"])

In [7]:
import time

In [8]:
#Create list of sample artists
sample_artists = ['Rihanna', 'Justin Timberlake']

#Starting the song search for the artists in question and seconds count
query_number = 0
time1 = time.time()
for artist in sample_artists:
    query_number += 1
    #Empty lists for artist, title, album and lyrics information
    artists = []
    titles = []
    albums = []
    years = []
    lyrics = []
    print('\nQuery number:', query_number)
    #Search for max_songs = n and sort them by popularity
    artist = genius.search_artist(artist, max_songs = 3, sort='popularity')
    songs = artist.songs
    song_number = 0
    #Append all information for each song in the previously created lists
    for song in songs:
        if song is not None:
            song_number += 1
            print('\nSong number:', song_number)
            print('\nNow adding: Artist')
            artists.append(song.artist)
            print('Now adding: Title')
            titles.append(song.title)
            print('Now adding: Album')
            albums.append(song.album)
            print('Now adding: Year')
            years.append(song.year[0:4])
            print('Now adding: Lyrics')
            lyrics.append(song.lyrics)
    time2 = time.time()
    print('\nQuery', query_number, 'finished in', round(time2-time1,2), 'seconds.')


Query number: 1
Searching for songs by Rihanna...

Song 1: "Work"
Song 2: "Love on the Brain"
Song 3: "Needed Me"

Reached user-specified song limit (3).
Done. Found 3 songs.

Song number: 1

Now adding: Artist
Now adding: Title
Now adding: Album
Now adding: Year
Now adding: Lyrics

Song number: 2

Now adding: Artist
Now adding: Title
Now adding: Album
Now adding: Year
Now adding: Lyrics

Song number: 3

Now adding: Artist
Now adding: Title
Now adding: Album
Now adding: Year
Now adding: Lyrics

Query 1 finished in 11.42 seconds.

Query number: 2
Searching for songs by Justin Timberlake...

Song 1: "Mirrors"
Song 2: "Suit & Tie"
Song 3: "Say Something"

Reached user-specified song limit (3).
Done. Found 3 songs.

Song number: 1

Now adding: Artist
Now adding: Title
Now adding: Album
Now adding: Year
Now adding: Lyrics

Song number: 2

Now adding: Artist
Now adding: Title
Now adding: Album
Now adding: Year
Now adding: Lyrics

Song number: 3

Now adding: Artist
Now adding: Title
Now addi

In [9]:
tracklist = pd.DataFrame({'artist':artists, 'title':titles, 'album':albums, 'year':years, 'lyrics':lyrics})

In [10]:
tracklist.head()

Unnamed: 0,artist,title,album,year,lyrics
0,Justin Timberlake,Mirrors,The 20/20 Experience: The Complete Experience,2013,Aren't you somethin' to admire?\n'Cause your s...
1,Justin Timberlake,Suit & Tie,The 20/20 Experience: The Complete Experience,2013,"Ooh-oh\nI be on my suit and tie, shit tied, sh..."
2,Justin Timberlake,Say Something,Man of the Woods,2018,"Mhmm, yeah, alright\nOoh\n\nEveryone knows all..."
