In [105]:
import requests
from bs4 import BeautifulSoup

import numpy as np

import pandas as pd

import re

import time

import tqdm
import threading

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# if set to true the lyrics are extracted again and csv files
# are refreshed with the extracted lyrics
g_refresh_lyrics = False

def extract_lyrics_from_url(url, songs, i):

    # time.sleep(10)
    print("Extrating from ", url)
    soup = BeautifulSoup(requests.get(url).text)
    lyrics = ""
    lyrics_tag = soup.find('pre', attrs={'id' : 'lyric-body-text'})
    if lyrics_tag :
        for child in lyrics_tag.children:
            lyrics += child.text
    songs[i] = lyrics


def extract_songs(artist):
    artist_url = 'https://www.lyrics.com/artist/' + artist
    artist_html = requests.get(artist_url).text

    soup = BeautifulSoup(artist_html)
    songs = dict()
    square_bracket_pattern = ' [\[].*[\]]'
    link_constant = 'https://www.lyrics.com/'
    # for song in soup.find_all('strong'):
    for song in soup.find_all('td', attrs={'class':'tal qx'}):
        a = song.find('strong').find('a')
        if a and not (re.findall(square_bracket_pattern, a.text)) :
            songs[a.text.lower()] = link_constant + a.get('href')
    songs_df = pd.DataFrame(columns=["Title", "Link"])
    songs_df['Title'] = songs.keys()
    songs_df['Link'] = songs.values()

    # each thread extracts lyrics from each url
    all_lyrics = [None] * songs_df['Link'].shape[0]
    threads = []
    for index, url in enumerate(songs_df['Link'].values) :
        t = threading.Thread(target=extract_lyrics_from_url, args=[url,all_lyrics,index])
        t.start()
        threads.append(t)

    for thread in threads:
        thread.join()

    songs_df["Lyrics"] = all_lyrics

    return songs_df

In [106]:
if g_refresh_lyrics :
    imagine_dragons_df = extract_songs('Imagine-Dragons')
    imagine_dragons_df.to_csv("../data/imagine_dragons_songs.csv")

    linkin_park_df = extract_songs('Linkin-Park')
    linkin_park_df.to_csv("../data/linkin_park_songs.csv")
else :
    imagine_dragons_df = pd.read_csv("../data/imagine_dragons_songs.csv")
    linkin_park_df = pd.read_csv("../data/linkin_park_songs.csv")

In [107]:
# Create the lyrics data base
df = pd.concat([imagine_dragons_df,linkin_park_df])
X = df[["Lyrics"]]

# convert the lyrics column type to string otherwise it is considered
# as float
X = X.assign(Lyrics = X["Lyrics"].astype(str))
# remove all the \r from the lyrics
X['Lyrics'] = X['Lyrics'].apply(lambda x : x.replace("\r", ""))

y_true = pd.Series([1] * imagine_dragons_df.shape[0] + [0] * linkin_park_df.shape[0])


In [108]:


X_train, X_test, y_train, y_test = train_test_split(X, y_true, random_state=42)


In [109]:

vectorizer = CountVectorizer(lowercase=True, stop_words='english', token_pattern='[A-Za-z]+', ngram_range=(1,1))

X_train_cv = vectorizer.fit_transform(X_train["Lyrics"])
# df_bow_sklearn = pd.DataFrame(X_train_cv.toarray(), columns=vectorizer.get_feature_names_out())
# df_bow_sklearn.head()


logreg_cv = LogisticRegression(max_iter=6000)
logreg_cv.fit(X_train_cv, y_train)

X_test_cv = vectorizer.transform(X_test["Lyrics"])

logreg_cv.score(X_test_cv, y_test)

0.7692307692307693