In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
import time
from tqdm import tqdm
from multiprocessing.dummy import Pool as ThreadPool
import random
from itertools import cycle
from requests.auth import HTTPProxyAuth
import unicodedata
import re

Component C:

Lyrics scraping from Musixmatch.com

Step1: 
Use ready csv file with list of artists and songs that need to be scraped
Use own list of proxies to avoid ban by musixmatch.com
Use own list of user agents to avoid ban by musixmatch.com

In [2]:
CSV_FILE_NAME = 'track_artist_audio_features_cleaned.csv'
PROXIES = 'proxies.txt'
USER_AGENT = 'user_agents.txt'

# Getting list of proxies (purchased from webshare.com)
with open(PROXIES, 'r') as f: 
    list_proxy = f.read().splitlines()

with open(USER_AGENT, 'r') as f:
    user_agents = f.read().splitlines()

Step2:
Preprocess song/artist names, so that they can be used for musixmatch URL generation

In [3]:
def process_str(str):
  '''
  Removes all the special characters from the string, such as $, --
  '''
  
  return str.replace('--', '').replace("'", '').replace('(', '') \
          .replace(')', '').replace('.', '').replace('!','') \
          .replace('/', '-').replace('[', '').replace(']', '') \
          .replace(',', '').replace('"', '').replace('*', '') \
          .replace('$', '-')

def slugify(value, allow_unicode=False):
    '''
    Formats the string to be suitable for naming .txt file
    Convert to ASCII. 
    Convert spaces or repeated dashes to single dashes. 
    Remove characters that aren't alphanumerics,
    underscores, or hyphens. Convert to lowercase. 
    Strips leading and trailing whitespace, dashes, and underscores.
    '''
    value = str(value)
    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
    else:
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value.lower())
    return re.sub(r'[-\s]+', '-', value).strip('-_')

def get_lyrics(data):
  '''
  Accepts list containing artist_name and song
  Creates multiple threads
  Initiates web scraping
  Displays data retrieving completion status
  Returns lyrics of this song retrieved from musixmatch.com

  '''
  lyrics = []

  def return_music_info(data):
    '''
    Helper function to utilize multithreading and display progress in real time using tqdm and imap

    '''
    artist_name, song = data
    sleep_interval = (4 + 2 * random.random()) # random float number between 1 and 3
    time.sleep(sleep_interval) # random pause to avoid IP banning by website
    output = [song, artist_name, scrape_lyrics(artist_name, song)]
    return output

  pool = ThreadPool(20) # 20 threads to speed up the process
  lyrics = tqdm(pool.imap(return_music_info, data), total=len(data)) #display progress of process

  return lyrics


def write_to_file(lyrics, song_name, artist_name):
    '''
    Accepts lyrics (str), song_name (str), artist_name (str)
    Writes lyrics to .txt file

    '''
    with open(slugify(f'{song_name} - {artist_name}') + '.txt', 'w', encoding="utf-8-sig") as f:
        f.writelines(lyrics)

def write_failed(song_name, artist_name):
    '''
    Accepts lyrics (str), song_name (str), artist_name (str)
    Writes lyrics which could not be retrieved to .txt file

    '''
    with open(f'__failed__.txt', 'a', encoding="utf-8-sig") as f:
        f.write(f'{song_name} - {artist_name}\n')


error_artist = []

def scrape_lyrics(artist_name, song):
  '''
  Accepts artist name and song
  Scrapes musixmatch website
  Returns lyrics of given (artist, song)

  '''

  song = process_str(song)
  artist_name = process_str(artist_name)
  url = f'https://www.musixmatch.com/lyrics/{artist_name}/{song}'
  headers = {'User-Agent': random.choice(user_agents)} #imitating real browser request. randomly selected browser agent

  proxy_data = random.choice(list_proxy) #take random proxy from the list
  proxy = ':'.join(proxy_data.split(':')[:2]) #retract proxy
  username, password = proxy_data.split(':')[2], proxy_data.split(':')[3] #retract username, password
  proxies = {
      "http": 'http://' + proxy,
    }
  auth = HTTPProxyAuth(username, password)
  r = requests.post(url, headers=headers, proxies=proxies, auth=auth)
  if r:
      
    soup = bs(r.content, 'html5lib') #getting webpage content using BeautifulSoup and html5lib decoding 
    lyrics_parts = soup.select('.mxm-lyrics__content ') #selecting lyrics parts from webpage 

    #join parts of lyrics
    result = []
    for part in lyrics_parts:   
      result.append(part.text)
    lyrics = ' '.join(result)
    
    #writing lyrics into separate file
    if lyrics:
      write_to_file(lyrics, song, artist_name)
    else:
      print('Warning! No lyrics!', url)
      write_failed(song, artist_name)

    return lyrics
    
  else:
    print('Error! ', r.status_code, artist_name, song)
    error_artist.append((artist_name, song))


def get_songs_data(csv_file_name):
  '''
  Generates list of [artist, song] pairs from the csv file
  '''

  data_list = []
  
  df = pd.read_csv(csv_file_name) #reading song name, artist from the csv file
  
  for i in range(len(df)):
    artist_name = df.loc[i]['artist_name'].casefold().replace(' ', '-')
    song = df.loc[i]['song'].casefold().replace(' ', '-')   
    
    data_list.append([artist_name, song])

  return data_list



In [6]:
# This cell initiates the whole lyrics scraping process and gets the lyrics from musixmatch.com
# All the lyrics are going to be saved on your device in separate .txt files
# You can get blocked by the musixmatch.com after several requests
# To stop the process press RESTART on the toolbar
data = get_songs_data(CSV_FILE_NAME)
get_lyrics(data) 

  0%|          | 0/7669 [00:00<?, ?it/s]

<tqdm.std.tqdm at 0x2428ecad760>

 https://www.musixmatch.com/lyrics/bill-conti/gonna-fly-now-theme-from-rocky---remastered-2006
 https://www.musixmatch.com/lyrics/train/hey-soul-sister
 https://www.musixmatch.com/lyrics/taylor-swift/style
 https://www.musixmatch.com/lyrics/ed-sheeran/perfect
