# Web Scraping for Song Lyrics

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Scraping-Genius.com-(requests/BeautifulSoup)" data-toc-modified-id="Scraping-Genius.com-(requests/BeautifulSoup)-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Scraping Genius.com (requests/BeautifulSoup)</a></span><ul class="toc-item"><li><span><a href="#Multithreading-for-Genius-Scrape" data-toc-modified-id="Multithreading-for-Genius-Scrape-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Multithreading for Genius Scrape</a></span></li></ul></li><li><span><a href="#Scraping-Musixmatch.com-(Selenium)" data-toc-modified-id="Scraping-Musixmatch.com-(Selenium)-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Scraping Musixmatch.com (Selenium)</a></span><ul class="toc-item"><li><span><a href="#Proxies-for-Selenium" data-toc-modified-id="Proxies-for-Selenium-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Proxies for Selenium</a></span></li></ul></li><li><span><a href="#Find-Kaggle-Lyrics-Not-Already-Added-to-FMA-Dataset" data-toc-modified-id="Find-Kaggle-Lyrics-Not-Already-Added-to-FMA-Dataset-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Find Kaggle Lyrics Not Already Added to FMA Dataset</a></span></li></ul></div>

In [None]:
# Start a Chrome session using the chromedriver binary in the working directory.
# selenium_musix() reads this module-level `driver`.
# NOTE(review): `webdriver` is imported in the following cell, so on a fresh
# kernel that cell has to run before this one.
driver = webdriver.Chrome('./chromedriver')

In [4]:
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from selenium import webdriver
from time import time
import pandas as pd
import numpy as np
import requests
import bs4
from bs4 import BeautifulSoup
import numpy as np
from tqdm import tqdm_notebook
import multiprocessing as mp
import sys
import threading
from queue import Queue


# Raise the pandas display limits to 500 rows and 500 columns.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

def selenium_musix(song_title, artist_name, wait=0.1):
    """Search Musixmatch for a track and return the HTML of the page it opens.

    Loads the Musixmatch search page for "<song_title> <artist_name>" in the
    module-level Selenium ``driver``, clicks the first result card whose title
    and artist both contain the requested strings (case-insensitive, ignoring
    surrounding whitespace) and returns ``driver.page_source`` right after the
    click.

    Parameters
    ----------
    song_title : str
        Title to search for.
    artist_name : str
        Artist to search for.
    wait : float, optional
        Seconds to pause after requesting the search page before reading the
        result cards. Defaults to 0.1, the delay that used to be hard-coded.

    Returns
    -------
    str or float
        The page source after clicking the matching card, or ``np.nan`` when
        no card matches or inspecting/clicking a card raises.
    """
    from time import sleep
    from urllib.parse import quote

    # Percent-encode the whole term: replacing only spaces left characters
    # such as '/', '#', '?' and '&' free to change the meaning of the URL.
    search_term = quote(song_title + ' ' + artist_name, safe='')
    search_url = "https://www.musixmatch.com/search/{}".format(search_term)
    driver.get(search_url)

    # Sleep instead of spinning in a `while` loop, which kept a CPU core busy.
    sleep(wait)

    results = driver.find_elements_by_class_name('media-card-text')
    try:
        for result in results:
            title_element = result.find_element_by_class_name('title')
            if song_title.lower().strip() not in title_element.text.lower().strip():
                continue
            artist_text = result.find_element_by_class_name('artist').text
            if artist_name.lower().strip() in artist_text.lower().strip():
                title_element.click()
                return driver.page_source
    except Exception:
        # Best effort: a card that cannot be inspected or clicked counts as
        # "not found". `Exception` (not a bare except) keeps Ctrl-C working.
        return np.nan

    return np.nan

### Scraping Genius.com (requests/BeautifulSoup)

In [None]:
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def request_song_info_json(song_title, artist_name):
    """Query the Genius search API for "<song_title> <artist_name>".

    Parameters
    ----------
    song_title : str
    artist_name : str

    Returns
    -------
    dict or bool
        The decoded JSON body of the search response, or ``False`` when the
        request cannot be built, fails, returns an HTTP error status, or the
        body is not valid JSON.
    """
    import os

    base_url = 'https://api.genius.com'
    # FIXME(security): this token is committed in the notebook and should be
    # rotated. Set GENIUS_ACCESS_TOKEN to override it; the literal is kept only
    # as a fallback so existing runs keep working.
    token = os.environ.get(
        'GENIUS_ACCESS_TOKEN',
        'bghsGyhlDAFazVYdOmmUhajrHwX4Qu0jm2mktQ6wyI8iYw9YExBdwn4G8giQRmvQ')
    headers = {'Authorization': 'Bearer ' + token}
    search_url = base_url + '/search'
    try:
        query = {'q': song_title + ' ' + artist_name}

        # The context manager closes the session's connection pool; the
        # original created a new session per call and never released it.
        with requests.Session() as session:
            retry = Retry(connect=3, backoff_factor=0.5)
            adapter = HTTPAdapter(max_retries=retry)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            # `params` puts the query in the URL, as a GET search expects
            # (`data=` sent it as a request body). The timeout stops a stalled
            # connection from blocking a worker thread forever.
            response = session.get(search_url, params=query, headers=headers,
                                   timeout=10)
            # Treat 4xx/5xx as a failed lookup rather than returning an error
            # payload that has no 'response' key.
            response.raise_for_status()
            return response.json()
    except Exception:
        # Best effort: any failure means "no result". `Exception` (not a bare
        # except) lets KeyboardInterrupt stop a long scrape.
        return False
    
    
def request_lyric_url(response_json, song_title, artist_name):
    """Pick the lyrics-page URL for a track out of a Genius search response.

    A hit matches when ``song_title`` is contained in the hit's title and
    ``artist_name`` is contained in the hit's primary artist name, both
    compared case-insensitively with surrounding whitespace ignored.

    Parameters
    ----------
    response_json : dict or bool
        Value returned by ``request_song_info_json`` (``False`` on failure).
    song_title : str
    artist_name : str

    Returns
    -------
    str or bool
        The ``url`` of the first matching hit, or ``False`` when there is no
        response, no hits, or no matching hit.
    """
    if not response_json:
        return False

    # An error payload may carry no 'response'/'hits'; treat it as "no hits"
    # instead of raising KeyError.
    hits = (response_json.get('response') or {}).get('hits') or []
    if not hits:
        return False

    wanted_title = song_title.lower().strip()
    wanted_artist = artist_name.lower().strip()
    for hit in hits:
        result = hit['result']
        if wanted_title in result['title'].lower().strip() \
                and wanted_artist in result['primary_artist']['name'].lower().strip():
            # Stop at the first match, as request_trackid_musix does; the
            # original kept scanning and returned the last match.
            return result['url']

    # Always report a miss as False (the original fell through to None here).
    return False


def scrape_song_url(url):
    """Download a lyrics page and return the text of its ``div.lyrics`` element.

    Parameters
    ----------
    url : str or bool
        Page URL; any falsy value means "no page was found".

    Returns
    -------
    str or float
        The text of the first ``<div class="lyrics">`` element, or ``np.nan``
        when ``url`` is falsy, the download or parse fails, or the page has no
        such element.
    """
    if not url:
        return np.nan

    try:
        # The timeout stops a stalled connection from blocking the caller
        # (and, in the threaded scrape, the whole queue) indefinitely.
        page = requests.get(url, timeout=10)
        html = BeautifulSoup(page.text, 'html.parser')
        lyrics_node = html.find('div', class_='lyrics')
        if lyrics_node is None:
            # Page loaded but has no lyrics container.
            return np.nan
        return lyrics_node.get_text()
    except Exception:
        # Best effort: any failure counts as missing lyrics. `Exception` (not
        # a bare except) lets KeyboardInterrupt stop a long scrape.
        return np.nan


def get_lyrics(song_title, artist_name):
    """Look a track up on Genius and return its scraped lyrics.

    Chains the three Genius helpers: search the API, pick the matching
    lyrics-page URL from the response, then scrape that page. The return
    value is whatever ``scrape_song_url`` returns for the chosen URL.
    """
    return scrape_song_url(
        request_lyric_url(
            request_song_info_json(song_title, artist_name),
            song_title,
            artist_name,
        )
    )







#### Multithreading for Genius Scrape

In [None]:
def get_lyrics_mthread_musix(q, return_dict):
    """Worker loop: fetch Musixmatch lyrics for every track left in ``q``.

    Each queue item is indexed as ``track[0]`` (the key stored in
    ``return_dict``) and ``track[1][0]`` / ``track[1][1]`` (the song title and
    artist name passed to ``get_lyrics_musix``).

    Parameters
    ----------
    q : queue.Queue
        Queue of pending tracks, shared between worker threads.
    return_dict : dict-like
        Mapping that receives ``track[0] -> lyrics``.

    Returns
    -------
    None
        Returns once the queue has no more items to hand out.
    """
    from queue import Empty

    while True:
        # get_nowait() checks and takes in one step. The previous
        # `while not q.empty(): q.get()` could block forever when another
        # thread took the last item between the two calls.
        try:
            track = q.get_nowait()
        except Empty:
            return

        try:
            return_dict[track[0]] = get_lyrics_musix(track[1][0], track[1][1])
        except Exception:
            # One bad track must not kill the worker; record it as missing.
            return_dict[track[0]] = np.nan
        finally:
            # Always mark the item done, otherwise queue.join() never returns.
            q.task_done()

def get_lyrics_mthread(q, return_dict):
    """Worker loop: fetch Genius lyrics for every track left in ``q``.

    Each queue item is indexed as ``track[0]`` (the key stored in
    ``return_dict``) and ``track[1][0]`` / ``track[1][1]`` (the song title and
    artist name passed to ``get_lyrics``).

    Parameters
    ----------
    q : queue.Queue
        Queue of pending tracks, shared between worker threads.
    return_dict : dict-like
        Mapping that receives ``track[0] -> lyrics``.

    Returns
    -------
    None
        Returns once the queue has no more items to hand out.
    """
    from queue import Empty

    while True:
        # get_nowait() checks and takes in one step. The previous
        # `while not q.empty(): q.get()` could block forever when another
        # thread took the last item between the two calls.
        try:
            track = q.get_nowait()
        except Empty:
            return

        try:
            # get_lyrics runs the same search -> URL -> scrape chain that was
            # spelled out inline here.
            return_dict[track[0]] = get_lyrics(track[1][0], track[1][1])
        except Exception:
            # One bad track (e.g. a non-string title) must not kill the
            # worker; record it as missing.
            return_dict[track[0]] = np.nan
        finally:
            # Always mark the item done, otherwise queue.join() never returns.
            q.task_done()
    

def request_concurrent(queue, num_threads = 20, function=get_lyrics_mthread_musix):
    """Drain a queue of tracks with a pool of worker threads.

    Parameters
    ----------
    queue : queue.Queue
        Queue already loaded with the work items that ``function`` consumes.
    num_threads : int, optional
        Number of worker threads to start (default 20).
    function : callable, optional
        Worker started in each thread as ``function(queue, return_dict)``. It
        must call ``queue.task_done()`` for every item it takes, because this
        function waits on ``queue.join()``.

    Returns
    -------
    return_dict : dict proxy from ``multiprocessing.Manager().dict()``
        The shared mapping the workers filled in.
    """
    sys.stdout.write("Requesting in parallel...\n")

    # NOTE(review): a Manager dict is kept so the return type is unchanged; a
    # plain dict would presumably be enough for threads.
    manager = mp.Manager()
    return_dict = manager.dict()

    for _ in tqdm_notebook(range(num_threads)):
        # Every thread runs the same worker against the shared queue and
        # result mapping.
        thread = threading.Thread(
                                  target=function,
                                  args=(queue, return_dict))
        thread.start()

    # Announce the wait before blocking; the message used to print only after
    # the queue had drained. queue.join() returns None, so it is no longer
    # wrapped in tqdm_notebook.
    print("Waiting for queue to finish execution.")
    queue.join()

    return return_dict

# lyrics_dict = {}
# for track in tqdm_notebook(id_song_and_artist):
    
#     lyrics = get_lyrics_musix(track[1][0], track[1][1])

#     lyrics_dict[track[0]] = lyrics

# q = Queue()

# results = [{} for x in id_song_and_artist];
# #load up the queue with the urls to fetch and the index for each job (as a tuple):
# for i in tqdm_notebook(range(len(id_song_and_artist))):
#     #need the index and the url in each queue item.
#     q.put((id_song_and_artist[i][0], id_song_and_artist[i][1]))
    
# lyrics = request_concurrent(q)



# NOTE(review): in the visible notebook `lyrics` has no uncommented assignment
# above this point (only the commented-out request_concurrent call) and
# `tracks` is not defined, so these cells rely on earlier kernel state.
len(lyrics)

# One row per key of `lyrics`, with the values in a single 'lyrics' column.
lyrics_df = pd.DataFrame(data = lyrics.values(), index = lyrics.keys(), columns=['lyrics'])

tracks.head(58)

# Sorts lyrics_df in place by its index before previewing it.
lyrics_df.sort_index(inplace=True)
lyrics_df.head()

# Number of non-missing entries in the 'lyrics' column.
lyrics_df.count()

lyrics_df.describe()

#lyrics_df.to_csv('lyrics')

### Scraping Musixmatch.com (Selenium)

In [None]:
from selenium import webdriver
from time import time

#driver = webdriver.Chrome('./chromedriver')

def selenium_musix(song_title, artist_name, wait=0.1):
    """Search Musixmatch for a track and return the HTML of the page it opens.

    Loads the Musixmatch search page for "<song_title> <artist_name>" in the
    module-level Selenium ``driver``, clicks the first result card whose title
    and artist both contain the requested strings (case-insensitive, ignoring
    surrounding whitespace) and returns ``driver.page_source`` right after the
    click.

    Parameters
    ----------
    song_title : str
        Title to search for.
    artist_name : str
        Artist to search for.
    wait : float, optional
        Seconds to pause after requesting the search page before reading the
        result cards. Defaults to 0.1, the delay that used to be hard-coded.

    Returns
    -------
    str or float
        The page source after clicking the matching card, or ``np.nan`` when
        no card matches or inspecting/clicking a card raises.
    """
    from time import sleep
    from urllib.parse import quote

    # Percent-encode the whole term: replacing only spaces left characters
    # such as '/', '#', '?' and '&' free to change the meaning of the URL.
    search_term = quote(song_title + ' ' + artist_name, safe='')
    search_url = "https://www.musixmatch.com/search/{}".format(search_term)
    driver.get(search_url)

    # Sleep instead of spinning in a `while` loop, which kept a CPU core busy.
    sleep(wait)

    results = driver.find_elements_by_class_name('media-card-text')
    try:
        for result in results:
            title_element = result.find_element_by_class_name('title')
            if song_title.lower().strip() not in title_element.text.lower().strip():
                continue
            artist_text = result.find_element_by_class_name('artist').text
            if artist_name.lower().strip() in artist_text.lower().strip():
                title_element.click()
                return driver.page_source
    except Exception:
        # Best effort: a card that cannot be inspected or clicked counts as
        # "not found". `Exception` (not a bare except) keeps Ctrl-C working.
        return np.nan

    return np.nan




# lyrics_html_dict = {}
# for track in tqdm_notebook(id_song_and_artist[:10000]):
    
#     lyrics_html = selenium_musix(track[1][0], track[1][1])

#     lyrics_html_dict[track[0]] = lyrics_html
    
# pd.DataFrame.from_dict(lyrics_html_dict, orient='index').to_csv('lyrics_html_1')


#selenium_musix('Gopacapulco',"Ariel Pink's Haunted Graffiti")

#pd.DataFrame.from_dict(lyrics_html_dict, orient='index').to_csv('test')


#     for result in results:
#         if song_title in result.find_element_by_class_name('title').text.lower().strip() \
#             and artist_name in result.find_element_by_class_name('artist').text.lower().strip():
#         else:
#             print('no match')

def scrape_lyrics_musix(song_title, artist_name):
    """Fetch the Musixmatch search page for a track and return its result cards.

    Parameters
    ----------
    song_title : str
    artist_name : str

    Returns
    -------
    list or float
        The ``<div class="media-card-text">`` elements found on the search
        page (as returned by BeautifulSoup ``find_all``), or ``np.nan`` when
        the download or parse fails.
    """
    from urllib.parse import quote

    # Percent-encode the whole term: replacing only spaces left characters
    # such as '/', '#', '?' and '&' free to change the meaning of the URL.
    search_term = quote(song_title + ' ' + artist_name, safe='')
    search_url = "https://www.musixmatch.com/search/{}".format(search_term)

    try:
        # The timeout stops a stalled connection from blocking the caller.
        page = requests.get(search_url, timeout=10)
        html = BeautifulSoup(page.text, 'html.parser')
        return html.find_all('div', class_='media-card-text')
    except Exception:
        # Best effort: any failure counts as "no results". `Exception` (not a
        # bare except) lets KeyboardInterrupt through.
        return np.nan

def request_song_info_json_musix(song_title, artist_name):
    """Call the Musixmatch ``track.search`` endpoint for a title and artist.

    Parameters
    ----------
    song_title : str
        Sent as the ``q_track`` query parameter.
    artist_name : str
        Sent as the ``q_artist`` query parameter.

    Returns
    -------
    dict or bool
        The decoded JSON body, or ``False`` when the request fails or the body
        is not valid JSON.
    """
    import os

    # FIXME(security): this key is committed in the notebook and should be
    # rotated. Set MUSIXMATCH_API_KEY to override it; the literal is kept only
    # as a fallback so existing runs keep working.
    api_key = os.environ.get('MUSIXMATCH_API_KEY',
                             '3db96e8eff58fe05ad464f9e750e9bab')
    base_url = 'http://api.musixmatch.com/ws/1.1/'
    method = 'track.search'
    url = base_url + method
    # The key travels with the other query parameters instead of being
    # concatenated into the URL string by hand.
    params = {
        'apikey': api_key,
        'q_track': song_title,
        'q_artist': artist_name
             }
    try:
        # The context manager closes the session's connection pool; the
        # original created a new session per call and never released it.
        with requests.Session() as session:
            retry = Retry(connect=3, backoff_factor=0.5)
            adapter = HTTPAdapter(max_retries=retry)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            # The timeout stops a stalled connection from blocking a worker.
            response = session.get(url, params=params, timeout=10)
            return response.json()
    except Exception:
        # Best effort: any failure means "no result". `Exception` (not a bare
        # except) lets KeyboardInterrupt stop a long scrape.
        return False
    
    
def request_trackid_musix(response_json, song_title, artist_name):
    """Find the Musixmatch track id for an exact title/artist match.

    A hit matches when its ``track_name`` and ``artist_name`` equal the
    requested strings after lower-casing and stripping surrounding whitespace.

    Parameters
    ----------
    response_json : dict or bool
        Value returned by ``request_song_info_json_musix`` (``False`` on
        failure).
    song_title : str
    artist_name : str

    Returns
    -------
    int or bool
        ``track_id`` of the first matching hit, or ``False`` when there is no
        response, the payload does not have the expected structure, or no hit
        matches.
    """
    if not response_json:
        return False

    try:
        wanted_title = song_title.lower().strip()
        wanted_artist = artist_name.lower().strip()
        for hit in response_json['message']['body']['track_list']:
            track = hit['track']
            if wanted_title == track['track_name'].lower().strip() \
                    and wanted_artist == track['artist_name'].lower().strip():
                return track['track_id']
    except Exception:
        # Unexpected payload shape or non-string fields: treat as "not found".
        return False

    # Always report a miss as False (the original fell through to None here).
    return False


def request_lyrics_musix(track_id):
    """Fetch the lyrics body for a Musixmatch track id via ``track.lyrics.get``.

    Parameters
    ----------
    track_id : int or bool
        Id returned by ``request_trackid_musix``; any falsy value means the
        track was not found.

    Returns
    -------
    str or float
        The ``lyrics_body`` string from the response; the string ``'None'``
        when the request fails or the response has no lyrics; ``np.nan`` when
        ``track_id`` is falsy.
    """
    if not track_id:
        return np.nan

    import os

    # FIXME(security): this key is committed in the notebook and should be
    # rotated. Set MUSIXMATCH_API_KEY to override it; the literal is kept only
    # as a fallback so existing runs keep working.
    api_key = os.environ.get('MUSIXMATCH_API_KEY',
                             '3db96e8eff58fe05ad464f9e750e9bab')
    base_url = 'http://api.musixmatch.com/ws/1.1/'
    method = 'track.lyrics.get'
    url = base_url + method
    params = {'apikey': api_key, 'track_id': track_id}

    try:
        # The context manager closes the session's connection pool; the
        # original created a new session per call and never released it.
        with requests.Session() as session:
            retry = Retry(connect=3, backoff_factor=0.5)
            adapter = HTTPAdapter(max_retries=retry)
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            # The timeout stops a stalled connection from blocking a worker.
            response = session.get(url, params=params, timeout=10)
            return response.json()['message']['body']['lyrics']['lyrics_body']
    except Exception:
        # NOTE(review): the failure marker is the *string* 'None', not NaN, so
        # DataFrame.count() treats it as a value. Kept as-is for callers —
        # confirm whether NaN was intended.
        return 'None'


def get_lyrics_musix(song_title, artist_name):
    """Look a track up through the Musixmatch API and return its lyrics.

    Chains the three Musixmatch helpers: search for the track, resolve the
    matching track id from the response, then request the lyrics for that id.
    The return value is whatever ``request_lyrics_musix`` returns.
    """
    search_json = request_song_info_json_musix(song_title, artist_name)
    return request_lyrics_musix(
        request_trackid_musix(search_json, song_title, artist_name)
    )



#test = request_song_info_json_musix('sultans of swing', 'dire straits')
#test = get_lyrics_musix('sultans of swing', 'dire straits')
#test = request_trackid_musix(request_song_info_json_musix('sultans of swing', 'dire straits'),'sultans of swing', 'dire straits')
#test

# Quick manual check of the Genius pipeline with a throwaway title/artist pair.
song, artist = 'g', 'ba'

print(get_lyrics(song, artist))

#### Proxies for Selenium

In [3]:
from lxml.html import fromstring
import requests
from itertools import cycle
import traceback

def get_proxies():
    """Scrape free-proxy-list.net and return a set of ``'ip:port'`` strings.

    Only the first 150 table rows are examined, and a row is kept only when
    its seventh cell contains the text "yes" (presumably the HTTPS column —
    confirm against the site). The address is built from the text of the
    row's first and second cells.
    """
    page = requests.get('https://free-proxy-list.net/')
    table_rows = fromstring(page.text).xpath('//tbody/tr')[:150]
    return {
        ":".join((row.xpath('.//td[1]/text()')[0], row.xpath('.//td[2]/text()')[0]))
        for row in table_rows
        if row.xpath('.//td[7][contains(text(),"yes")]')
    }


# If you are copy-pasting proxy IPs, put them in the list below
#proxies = ['121.129.127.209:80', '124.41.215.238:45169', '185.93.3.123:8080', '194.182.64.67:3128', '106.0.38.174:8080', '163.172.175.210:3128', '13.92.196.150:8080']
proxies = get_proxies()
# cycle() lets the loop below keep drawing proxies, wrapping around at the end.
proxy_pool = cycle(proxies)

url = 'https://httpbin.org/ip'
for i in range(1,11):
    response = False

    print("Request #%d"%i)
    # Keep taking the next proxy from the pool until one yields a truthy
    # response for this request.
    while not response:
        proxy = next(proxy_pool)
        try:
            # The timeout keeps a dead proxy from stalling the loop.
            response = requests.get(url,proxies={"http": proxy, "https": proxy}, timeout=10)
            print(response.json())
        except Exception:
            # Free proxies often fail with connection errors; fall through and
            # retry the same request with the next proxy. Catching `Exception`
            # (not a bare except) lets KeyboardInterrupt break out of this
            # otherwise unbounded retry loop.
            pass

Request #1
{'origin': '103.194.88.77'}
Request #2
{'origin': '128.199.217.178'}
Request #3
{'origin': '1.20.99.85'}
Request #4
{'origin': '125.27.251.59'}
Request #5
{'origin': '177.37.160.60'}
Request #6
{'origin': '182.53.197.202'}
Request #7
{'origin': '192.99.167.179'}
Request #8
{'origin': '179.127.240.254'}
Request #9
{'origin': '190.147.251.222'}
Request #10
{'origin': '78.36.6.173'}


In [3]:
# Display the proxy addresses collected by get_proxies().
proxies

{'1.10.186.203:54298',
 '1.10.188.140:36576',
 '1.10.188.194:30967',
 '1.10.189.84:43051',
 '1.20.103.127:37533',
 '1.20.99.176:49435',
 '1.20.99.83:53738',
 '1.20.99.85:54519',
 '101.255.40.18:61835',
 '101.255.64.246:59748',
 '102.177.96.122:54192',
 '103.102.73.74:46878',
 '103.194.251.43:53142',
 '103.194.88.89:34146',
 '103.209.178.185:42683',
 '103.219.43.72:52287',
 '103.247.217.117:57756',
 '103.44.139.111:55304',
 '103.76.190.178:39514',
 '103.8.115.27:38593',
 '103.9.188.135:46936',
 '105.235.202.54:51113',
 '105.27.154.210:54580',
 '109.207.59.70:53411',
 '109.235.177.2:38676',
 '109.237.92.86:34432',
 '109.69.1.72:45959',
 '110.5.100.130:31444',
 '113.11.156.42:58287',
 '114.30.75.218:51821',
 '117.212.94.233:35114',
 '118.172.227.89:30491',
 '118.173.232.21:40065',
 '118.174.220.42:36237',
 '118.174.220.70:32891',
 '118.174.232.202:40628',
 '118.174.233.45:56444',
 '118.174.233.61:52875',
 '118.174.233.86:37009',
 '118.174.65.131:32533',
 '118.175.207.169:57922',
 '118.175

In [20]:
from selenium import webdriver
from selenium.webdriver.common.proxy import Proxy, ProxyType

# Take the next address from the rotating proxy pool built earlier.
PROXY = next(proxy_pool)
print(PROXY)

# Hand the proxy to Chrome through its --proxy-server command-line switch.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--proxy-server=%s' % PROXY)

# NOTE(review): unlike the `driver` cell at the top of the notebook, no
# chromedriver path is passed here, and this session is never closed.
chrome = webdriver.Chrome(options=chrome_options)
chrome.get("http://google.com")


118.174.220.70:32891


In [7]:
# Display the last proxy assigned inside the request loop above.
proxy

'78.36.6.173:46312'

### Find Kaggle Lyrics Not Already Added to FMA Dataset

In [40]:
def _normalize_band_song(frame):
    """Return a copy of `frame` with Band and Song lower-cased and stripped."""
    # The .str accessor leaves missing values as NaN; the previous
    # `map(lambda x: x.lower().strip())` raised AttributeError on them.
    return frame.assign(
        Band=frame['Band'].str.lower().str.strip(),
        Song=frame['Song'].str.lower().str.strip(),
    )


# l1 drops every row with a missing value before normalizing, as before;
# l2 keeps all of its rows.
l1 = _normalize_band_song(
    pd.read_csv('../../../../fma_metadata/every-song-you-have-heard-almost/Lyrics1.csv').dropna()
)
l2 = _normalize_band_song(
    pd.read_csv('../../../../fma_metadata/every-song-you-have-heard-almost/Lyrics2.csv')
)



In [41]:
# Load the track listing; the preview below shows the columns track_id,
# artist_name and song_title.
track_ids = pd.read_csv('./Data/track_ids')
track_ids.head()

Unnamed: 0,track_id,artist_name,song_title
0,2,AWOL,Food
1,3,AWOL,Electric Ave
2,5,AWOL,This World
3,10,Kurt Vile,Freeway
4,20,Nicky Cook,Spiritual Level


In [42]:
# First five rows as a plain array: the matching loop below indexes each row
# positionally as [track_id, artist_name, song_title].
track_ids.values[:5]

array([[2, 'AWOL', 'Food'],
       [3, 'AWOL', 'Electric Ave'],
       [5, 'AWOL', 'This World'],
       [10, 'Kurt Vile', 'Freeway'],
       [20, 'Nicky Cook', 'Spiritual Level']], dtype=object)

In [43]:
def _first_lyrics_by_band_song(lyrics_frame):
    """Map (Band, Song) to the Lyrics of the first row with that pair."""
    first_rows = lyrics_frame.drop_duplicates(subset=['Band', 'Song'], keep='first')
    return dict(zip(zip(first_rows['Band'], first_rows['Song']), first_rows['Lyrics']))


# Build the lookup once. The previous loop re-filtered all of l1 twice for
# every track, which is quadratic over the 100k+ rows of track_ids.
kaggle_lookup = _first_lyrics_by_band_song(l1)

lyrics_dict = {}
for track in tqdm_notebook(track_ids.values):
    artist, title = track[1], track[2]
    # Rows whose artist or title is not a string (e.g. NaN) cannot match. The
    # original skipped them through a bare `except: pass`, which also hid
    # every other error and swallowed KeyboardInterrupt.
    if not (isinstance(artist, str) and isinstance(title, str)):
        continue
    key = (artist.lower().strip(), title.lower().strip())
    if key in kaggle_lookup:
        lyrics_dict[track[0]] = kaggle_lookup[key]

kaggle_lyrics = pd.DataFrame.from_dict(lyrics_dict, orient='index')

HBox(children=(IntProgress(value=0, max=106574), HTML(value='')))




In [44]:
# Number of tracks for which a lyric was found in l1.
len(kaggle_lyrics)

102

In [45]:
# Display the matched lyrics, indexed by track id.
kaggle_lyrics

Unnamed: 0,0
751,"Love is real, real is love\r\nLove is feeling,..."
763,"Love is real, real is love\r\nLove is feeling,..."
765,"Love is real, real is love\r\nLove is feeling,..."
3581,Stranded in the storm trying to see through th...
3586,You live in Phnom Penh\r\nYou live in New York...
4032,Nothing\r\nAnd nothing's where you're at\r\nWh...
4760,Candy girl\r\nYou sing that song so well\r\nCa...
4761,One more thing before I go\nOne more thing I'l...
4762,All soldiers\nThey're all gonna die\nAnd all t...
4767,Trying to keep time\r\nCloser than we like\r\n...


In [46]:
# Save the matched lyrics to 'kaggle_lyrics_1' in the working directory.
kaggle_lyrics.to_csv('kaggle_lyrics_1')