# Lyrics Using Genius API

In [2]:
pip install lyricsgenius

Collecting lyricsgenius
  Using cached lyricsgenius-3.0.1-py3-none-any.whl (59 kB)
Installing collected packages: lyricsgenius
Successfully installed lyricsgenius-3.0.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
#import libraries
import pandas as pd
import lyricsgenius as genius #used to interface with Genius API
import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [4]:
#token provided by Genius API
%store -r client_access_token

no stored variable or alias client_access_token


In [6]:
client_access_token = 'qqoXje06ERBkSJz9qtBBl1HzJyLoxuC51bU2bftPjMgS1XnkUeyVQ5_9ZNjvL6g7'

In [7]:
#initiate Genius
genius = genius.Genius(client_access_token)
genius.verbose = False #turn off status messages

### Functions

In [8]:
def get_lyrics(track,artist):
    '''
    function returns song's lyrics
    parameters:
        track-->str
        artist-->str
    '''
    track = re.sub(' - .+','',track) #remove text after '-'
    track = re.sub(' \(.*\)','',track) #remove text within parentheses
    track = re.sub(' \[.*\]','',track) #remove text within brackets
    
    try:
        return genius.search_song(track,artist).lyrics
    except:
        print(track + ' by ' + artist + ' is not available')

In [9]:
def get_df_songs(track_list,artist_list):
    '''
    function obtains lyrics and returns dataframe with columns for track, artist, lyrics
    parameters:
        track_list-->list of str 
        artist_list-->list of str
    '''
    lyrics_list = [get_lyrics(track_list[x],artist_list[x]) for x in range(len(track_list))] #get lyrics for each song
    
    return pd.DataFrame(data={'track':track_list,'artist':artist_list,'lyrics':lyrics_list})

In [10]:
def clean_lyrics(df,col,new_col):
    '''
    function returns dataframe with new column of cleaned text (song lyrics)
    parameters:
        df-->pandas dataframe
        col-->column to clean (str)
        new_col-->name of column with cleaned text (str)
    '''
    df[new_col] = df[col].str.lower() #make all text lowercase
    df[new_col] = df[new_col].str.replace(r'\n',' ') #replace '\n' character with space
    df[new_col] = df[new_col].str.replace(r'\[[^\[\]]*]','') #remove brackets and inside text
    df[new_col] = df[new_col].str.replace(r"\'\w*",'').str.replace(r'[^\w\d\s]+','') #remove extra characters
    df[new_col] = df[new_col].str.strip() #remove extra whitespace
    
    return df

In [11]:
def normalize_lyrics(df,col):
    '''
    function returns dataframe with column as list of words
        tokenizes, removes stopwords from, and lemmatizes lyrics
    parameters:
        df-->pandas dataframe
        col-->column to normalize
    '''

    df[col] = df[col].str.split() #tokenize lyrics
    
    stop_words = set(stopwords.words('english'))
    df[col] = df[col].apply(lambda row: [w for w in row if w not in stop_words]) #remove stopwords

    lemmatizer = WordNetLemmatizer()
    
    def lemmatize_text(text):
        '''
        function returns lemmatized text
        parameters:
            text-->str
        '''
        return [lemmatizer.lemmatize(w) for w in text]
    
    df[col] = df[col].apply(lemmatize_text) #lemmatize words

    return df

In [79]:
import re
def segmenting(s):
    s = s.replace('\n',' ')
    structures = re.findall(r'\[.*?\]', s)
    iter = re.finditer(r'\[.*?\]', s)
    indices = [m.start(0) for m in iter]
    splits = []
    for i in range(len(indices)):
        if i != len(indices)-1:
            splits.append((structures[i], s[indices[i]+len(structures[i]): indices[i+1]]))
        else:
            splits.append((structures[i], s[indices[i]+len(structures[i]):]))
    return splits

### Import Songs to Analyze

Read in the resulting dataframes from the spotify_analysis notebook, which were created as follows:

 - Started with the top five tracks for each of country, R&B/hip-hop, and rock/alternative as of the week of May 15, 2021, based on Billboard Top 100 charts (referred to as the "seed tracks")
 - Used Spotify's recommender algorithm to find the most similar songs to the seed tracks (returns a maximum of 100 songs per search)
 - Ranked the most similar songs by audio features using Euclidean distance
 - Fed the top ranking songs through Spotify's recommender algorithm until there were at least 1,000 songs per genre

In [12]:
#country
df_cty = pd.read_csv('../Data/df_cty.csv')
df_cty.drop(columns='Unnamed: 0',inplace=True)

In [13]:
#rock/alternative
df_rock = pd.read_csv('../Data/df_rock.csv')
df_rock.drop(columns='Unnamed: 0',inplace=True)

### Obtain Lyrics from Genius API

Pull lyrics from the Genius API with the lyricsgenius wrapper and put into dataframes.

In [14]:
#country
df_cty_lyrics = get_df_songs(df_cty['track'],df_cty['artist'])

ONE BEER by HIXTAPE is not available
Cowboy Killer by Ian Munsick is not available
Before He Cheats by Carrie Underwood is not available
Hotel Room by Do Or Die is not available
Dusa by FL Dusa is not available
Two Pina Coladas by Brooks Jefferson is not available


In [15]:
#rock
df_rock_lyrics = get_df_songs(df_rock['track'],df_rock['artist'])

Back Door Santa by The Black Crowes is not available
Welcome To The War by 7kingZ is not available
White Rabbit by Egypt Central is not available
2L8 by Ryan Oakes is not available
The Shower Scene by Ice Nine Kills is not available
Oh Betty by Fantastic Negrito is not available
Stand And Deliver by Goodbye June is not available
Moon Over the Castle by Bring Me The Horizon is not available
Ti**ies by Krizz Kaliko is not available


### Clean and Preprocess Lyrics

Prepare lyrics for analysis by cleaning and normalizing them.

In [16]:
pd.set_option('mode.chained_assignment', None)

In [17]:
#drop rows without lyrics
df_cty_lyrics2 = df_cty_lyrics.dropna(subset=['lyrics']) #country
df_rock_lyrics2 = df_rock_lyrics.dropna(subset=['lyrics']) #rock/alternative

In [18]:
#clean lyrics
df_cty_cleaned = clean_lyrics(df_cty_lyrics2,'lyrics','words') #country
df_rock_cleaned = clean_lyrics(df_rock_lyrics2,'lyrics','words') #rock/alternative

  df[new_col] = df[new_col].str.replace(r'\n',' ') #replace '\n' character with space
  df[new_col] = df[new_col].str.replace(r'\[[^\[\]]*]','') #remove brackets and inside text
  df[new_col] = df[new_col].str.replace(r"\'\w*",'').str.replace(r'[^\w\d\s]+','') #remove extra characters


In [19]:
#normalize lyrics
df_cty_norm = normalize_lyrics(df_cty_cleaned,'words') #country
df_rock_norm = normalize_lyrics(df_rock_cleaned,'words') #rock/alternative

In [83]:
rock_segments = []
for lyrics in df_rock_norm['lyrics']:
    rock_segments.append(segmenting(lyrics))

In [84]:
cty_segments = []
for lyrics in df_cty_norm['lyrics']:
    cty_segments.append(segmenting(lyrics))

In [85]:
df_rock_norm['segments'] = rock_segments

In [86]:
df_cty_norm['segments'] = cty_segments

### Write Dataframes to File

In [88]:
#country
df_cty_norm.to_csv('../Data/df_cty_lyrics.csv')

In [89]:
#rock/alternative
df_rock_norm.to_csv('../Data/df_rock_lyrics.csv')