In [40]:
import requests
from bs4 import BeautifulSoup

import numpy as np

import pandas as pd

import re

import time

import tqdm
import threading

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Scrape/cache switch read by the data-loading cell below.
# True  -> lyrics are scraped again and the CSV files are overwritten
#          with the freshly extracted lyrics.
# False -> the previously saved CSV files are loaded instead.
g_refresh_lyrics = False

def extract_lyrics_from_url(url, songs, i, timeout=10):
    """Download one lyrics page and store its lyrics text in ``songs[i]``.

    Written to be used as a thread target: the result is written into the
    shared ``songs`` container instead of being returned.

    Parameters
    ----------
    url : str
        Address of the lyrics page to download.
    songs : mutable container
        Any object supporting ``songs[i] = value``; receives the lyrics.
    i : index or key
        Slot of ``songs`` that belongs to this URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the worker (and whoever joins it)
        indefinitely.

    Notes
    -----
    ``songs[i]`` is always set to a string: the text of the
    ``<pre id="lyric-body-text">`` element, or ``""`` when that element is
    missing or the request fails.
    """
    print("Extrating from ", url)
    lyrics = ""
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Report the failure but still fill the slot, so the caller never
        # ends up with a None entry for this song.
        print("Failed to fetch ", url, ":", error)
    else:
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(response.text, 'html.parser')
        lyrics_tag = soup.find('pre', attrs={'id': 'lyric-body-text'})
        if lyrics_tag:
            # get_text() concatenates every string inside the tag, which is
            # what the per-child ".text" loop was doing by hand.
            lyrics = lyrics_tag.get_text()
    songs[i] = lyrics


def extract_songs(artist, timeout=10):
    """Scrape an artist's song list from lyrics.com and fetch every song's lyrics.

    Parameters
    ----------
    artist : str
        Artist slug appended to ``https://www.lyrics.com/artist/``
        (for example ``'Imagine-Dragons'``).
    timeout : float, optional
        Seconds to wait for the artist page before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per song with the columns ``Title`` (lower-cased link text),
        ``Link`` (absolute URL of the lyrics page) and ``Lyrics`` (whatever
        ``extract_lyrics_from_url`` stored for that link).

    Notes
    -----
    Titles matching the ``' [...]'`` pattern (a space followed by a bracketed
    segment) are skipped. Songs are keyed by lower-cased title, so a repeated
    title keeps only the last link seen.
    """
    from urllib.parse import urljoin

    artist_url = 'https://www.lyrics.com/artist/' + artist
    artist_html = requests.get(artist_url, timeout=timeout).text

    # Name the parser explicitly so parsing is the same on every machine.
    soup = BeautifulSoup(artist_html, 'html.parser')
    # Raw string: '\[' and '\]' are invalid escapes in a normal str literal.
    square_bracket_pattern = r' [\[].*[\]]'
    link_constant = 'https://www.lyrics.com/'

    songs = dict()
    for song in soup.find_all('td', attrs={'class': 'tal qx'}):
        # Not every matching cell is guaranteed to hold <strong><a href=...>.
        strong = song.find('strong')
        a = strong.find('a') if strong else None
        href = a.get('href') if a else None
        if not href:
            continue
        if re.search(square_bracket_pattern, a.text):
            continue
        # urljoin copes with both 'lyric/...' and '/lyric/...' hrefs without
        # producing a double slash.
        songs[a.text.lower()] = urljoin(link_constant, href)

    songs_df = pd.DataFrame(list(songs.items()), columns=["Title", "Link"])

    # each thread extracts lyrics from each url and writes into its own slot
    all_lyrics = [None] * len(songs_df)
    threads = []
    for index, url in enumerate(songs_df['Link'].values):
        t = threading.Thread(target=extract_lyrics_from_url, args=[url, all_lyrics, index])
        t.start()
        threads.append(t)

    for thread in threads:
        thread.join()

    songs_df["Lyrics"] = all_lyrics

    return songs_df

def print_hypermaters_search_results(results):
    """Print a hyper-parameter search summary.

    Writes the winning parameter set (``results.best_params_``) followed by
    one line per candidate showing its mean test score, rounded to four
    decimals, and the parameters that produced it (both read from
    ``results.cv_results_``).
    """
    header = 'BEST MODEL PARAMETERS: {}\n'.format(results.best_params_)
    print(header)

    mean_scores = results.cv_results_['mean_test_score']
    candidates = results.cv_results_['params']
    for score, candidate in zip(mean_scores, candidates):
        rounded = round(score, 4)
        print('{}  for {}'.format(rounded, candidate))

In [41]:
# Either scrape both artists again and overwrite the CSV caches, or load the
# lyrics that were saved by an earlier run.
if g_refresh_lyrics:
    imagine_dragons_df = extract_songs('Imagine-Dragons')
    # index=False: do not persist the row index, otherwise reading the file
    # back adds a spurious "Unnamed: 0" column.
    imagine_dragons_df.to_csv("../data/imagine_dragons_songs.csv", index=False)

    linkin_park_df = extract_songs('Linkin-Park')
    linkin_park_df.to_csv("../data/linkin_park_songs.csv", index=False)
else:
    # usecols keeps the schema identical to the freshly scraped frames, even
    # for cache files that were written with the index column included.
    lyrics_columns = ["Title", "Link", "Lyrics"]
    imagine_dragons_df = pd.read_csv("../data/imagine_dragons_songs.csv", usecols=lyrics_columns)
    linkin_park_df = pd.read_csv("../data/linkin_park_songs.csv", usecols=lyrics_columns)

In [42]:
# Create the lyrics database: Imagine Dragons rows first, then Linkin Park.
# ignore_index gives the combined frame a fresh 0..n-1 index, so its labels
# are unique and match the index of y_true built below.
df = pd.concat([imagine_dragons_df, linkin_park_df], ignore_index=True)

# Missing lyrics (NaN after a CSV round-trip, or None straight from the
# scraper) must become empty strings *before* the cast to str; otherwise
# astype(str) turns them into the literal words "nan" / "None", which the
# vectorizer would then count as real tokens.
df = df.assign(Lyrics=df["Lyrics"].fillna("").astype(str))
# remove all the \r from the lyrics
X = df["Lyrics"].str.replace("\r", "", regex=False)

# create targets: 1 = Imagine Dragons, 0 = Linkin Park (same order as the concat)
y_true = pd.Series([1] * len(imagine_dragons_df) + [0] * len(linkin_park_df))


In [43]:


# Split lyrics and labels into train and test parts; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_true,
    random_state=42,
)


In [44]:

# Bag-of-words settings shared by the standalone vectorizer and the pipeline
# step: lower-cased alphabetic unigrams with English stop words removed.
count_vectorizer_kwargs = dict(
    lowercase=True,
    stop_words='english',
    token_pattern='[A-Za-z]+',
    ngram_range=(1, 1),
)
vectorizer = CountVectorizer(**count_vectorizer_kwargs)

# Baseline model: word counts fed into a multinomial Naive Bayes classifier.
nb_classifying_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(**count_vectorizer_kwargs)),
    ('model', MultinomialNB()),
])

nb_classifying_pipeline.fit(X_train, y_train)
y_pred = nb_classifying_pipeline.predict(X_test)
# Accuracy of the baseline on the held-out split (displayed as the cell output).
accuracy_score(y_true=y_test, y_pred=y_pred)


0.8307692307692308

In [45]:
# Second candidate: the same bag-of-words features, classified by an
# SGD-trained linear model with hinge loss and L2 penalty.
# The pipeline is only defined here; it is fitted by the grid search that
# wraps it in the following cell.
sgd_classifying_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(
        lowercase=True,
        stop_words='english',
        token_pattern='[A-Za-z]+',
        ngram_range=(1, 1),
    )),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])

In [46]:
# Search space: unigram vs. unigram+bigram features, and two regularisation
# strengths for the SGD classifier (keys follow the pipeline step names).
sgd_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'clf__alpha': (1e-2, 1e-3),
}

# 5-fold cross-validated grid search scored on accuracy, using all cores.
sgd_grid_search_clf = GridSearchCV(
    estimator=sgd_classifying_pipeline,
    param_grid=sgd_parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
sgd_grid_search_clf.fit(X_train, y_train)

print_hypermaters_search_results(sgd_grid_search_clf)

BEST MODEL PARAMETERS: {'clf__alpha': 0.001, 'vect__ngram_range': (1, 2)}

0.7217  for {'clf__alpha': 0.01, 'vect__ngram_range': (1, 1)}
0.7677  for {'clf__alpha': 0.01, 'vect__ngram_range': (1, 2)}
0.7424  for {'clf__alpha': 0.001, 'vect__ngram_range': (1, 1)}
0.7729  for {'clf__alpha': 0.001, 'vect__ngram_range': (1, 2)}


In [47]:

# Evaluate the tuned SGD pipeline on the held-out test split; the accuracy is
# displayed as the cell output.
y_pred = sgd_grid_search_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8615384615384616