In [1]:
import os
import time
from random import uniform

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [3]:
# Location of the chromedriver binary. It can be overridden per machine by
# setting the CHROMEDRIVER_PATH environment variable; when the variable is
# not set, the original local path is used so existing setups keep working.
CHROMEDRIVER_PATH = os.environ.get(
    'CHROMEDRIVER_PATH',
    '/Users/alisayanovski/Programming/drivers/chromedriver',
)

# Chrome options shared by every driver instance created in this notebook.
# NOTE(review): the two experimental options below presumably make the
# session look less like an automated browser - confirm against the
# ChromeDriver documentation.
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)

# Service object that tells Selenium which chromedriver executable to launch.
service = ChromeService(executable_path=CHROMEDRIVER_PATH)

In [75]:
# Load the song list from CSV into a DataFrame. Later cells read its
# 'artist_name', 'song', 'artist_id' and 'track_id' columns.

songs_data = pd.read_csv(filepath_or_buffer='track_artist_audio_features_cleaned.csv')

In [76]:
# extract artist name and song name information, save them as
# [artist_name, song] pairs to a list

# Select both columns once and convert them to nested Python lists. This
# avoids building a full row Series for every lookup and does not depend on
# the frame having a default 0..n-1 index.
songs_list = songs_data[['artist_name', 'song']].values.tolist()

In [13]:
# function for scraping each song

START_URL = 'https://www.azlyrics.com'
# CSS selector of the search input on the start page
QUERY_SELECTOR = '#q'
# CSS selector of the link(s) in the search results; the first match is clicked
SONG_LINK_SELECTOR = '.panel table > tbody:first-child a'
# CSS selector of the element whose text is returned as the lyrics
LYRICS_SELECTOR = '.col-xs-12.col-lg-8.text-center div:nth-of-type(5)'

def get_lyrics(artist_name, song):
    '''
    Search START_URL for a song and return the text of its lyrics.

    Uses the module-level ``driver`` (a Selenium WebDriver), which must be
    created before this function is called. Each call sleeps for random
    intervals between the steps (roughly 11-22 seconds in total) to imitate
    user behaviour.

    Parameters:
        artist_name (str): name of the artist
        song (str): title of the song

    Returns:
        str: the text of the element matched by LYRICS_SELECTOR on the page
        reached through the first search result, or '' when either argument
        is not a string, when no lyrics element is found, or when any step
        of the scraping fails.
    '''
    # Non-string values (e.g. NaN, which pandas uses for empty CSV cells)
    # cannot be turned into a search query; report "no lyrics" instead of
    # crashing the whole scraping loop on `.casefold()`.
    if not isinstance(artist_name, str) or not isinstance(song, str):
        return ''

    query = f'{artist_name.casefold()} {song.casefold()}'

    try:
        # go to the webpage
        driver.get(START_URL)

        # wait random time to imitate user behaviour
        time.sleep(uniform(2, 5))

        # write and send a query in a search field
        input_el = WebDriverWait(driver, 10).until(
                   EC.presence_of_element_located((By.CSS_SELECTOR, QUERY_SELECTOR)))
        input_el.send_keys(query + Keys.ENTER)

        # wait
        time.sleep(uniform(2, 5))

        # get first element of the search result and click the link
        song_link = WebDriverWait(driver, 10).until(
                   EC.presence_of_element_located((By.CSS_SELECTOR, SONG_LINK_SELECTOR)))

        song_link.click()

        # wait
        time.sleep(uniform(7, 12))

        # get the page with lyrics and save the text; the parser is named
        # explicitly so the result does not depend on which optional parsers
        # happen to be installed
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        matches = soup.select(LYRICS_SELECTOR)
        if not matches:
            return ''

        return matches[0].text

    except Exception:
        # Best-effort scraping: any failure (timeout, missing element, driver
        # error) yields an empty result. Unlike a bare `except:`, this lets
        # KeyboardInterrupt and SystemExit propagate so a long run can be
        # stopped.
        return ''

In [64]:
# initiate webdriver
driver = webdriver.Chrome(service=service, options=options)

# get the lyrics from the website and put into a list of
# (artist_name, song, text) tuples; text is '' when scraping failed
lyrics = []

try:
    for artist_name, song in songs_list:
        text = get_lyrics(artist_name, song)
        lyrics.append((artist_name, song, text))
finally:
    # quit the driver even when the loop is interrupted or raises, so the
    # browser process is not left running; results collected so far stay
    # available in `lyrics`
    driver.quit()

In [69]:
# Build a DataFrame from the scraped (artist_name, song, lyrics) tuples,
# keep only the rows whose lyrics are not the empty string, and write the
# result to a CSV file.

df_lyrics = pd.DataFrame(data=lyrics, columns=['artist_name', 'song', 'lyrics'])
cleaned_lyrics = df_lyrics.loc[df_lyrics['lyrics'].ne('')]
cleaned_lyrics.to_csv(path_or_buf='lyrics_selenium.csv', index=False)

In [None]:
# Attach artist and track ids to the scraped lyrics.

# Only the key columns and the id columns of the song table are needed.
ids_part = songs_data.loc[:, ['artist_name', 'artist_id', 'song', 'track_id']]

# Left join: every lyrics row is kept and receives the ids of each matching
# (artist_name, song) row in the song table.
ids_part_merged = cleaned_lyrics.merge(ids_part, how='left', on=['artist_name', 'song'])

# Drop rows with any missing value, then keep the first row per
# (artist_name, song) pair.
no_duplicates_df = (
    ids_part_merged
    .dropna()
    .drop_duplicates(subset=['artist_name', 'song'])
)

no_duplicates_df.to_csv(path_or_buf='final_csv.csv', index=False)