In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

import re
import time
import random
import os

In [3]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
url_base = 'http://www.lyrics.com'
radiohead_path = '/artist/Radiohead/41092'
tool_path = '/artist/Tool/23076'

In [154]:
def get_artist(url_path, directory):
    response_artist = requests.get(url_base + url_path, headers=headers)
    soup_artist = BeautifulSoup(response_artist.text, 'html.parser')
    song_links = get_artist_song_links(soup_artist)
    if not os.path.exists(directory):
        os.makedirs(directory)
        print(f'**Created directory: {directory}**')
    save_lyrics(song_links, directory)
    
def get_artist_song_links(soup):
    song_links = {}
    for link in soup.find_all('a'):
        if (link['href'].startswith('/lyric')):
            if song_links.get(link.text):
                song_links[link.text].append(url_base + link['href'])
            else:
                song_links[link.text] = [url_base + link['href']]
    return song_links

def save_lyrics(song_links, directory):  
    for i, title in enumerate(song_links):
        print(f'**Looking for {title}** {i - 1}/{len(song_links)}')
        for title_link in song_links[title]:
            filename = re.sub(r'[/:]', '_', title + '.txt')
            if filename in os.listdir(f'{directory}/'):
                print('**File already in directory**')
                break
            else:
                time.sleep(random.random())
                response_song = requests.get(title_link, headers=headers)
                song_soup = BeautifulSoup(response_song.text, 'html.parser')
                if song_soup.find('pre', id='lyric-body-text'):
                    print('**Found Lyrics**')
                    lyrics = song_soup.find('pre', id='lyric-body-text').text
                    with open(f'{directory}/' + filename, 'w') as song_file:
                        song_file.write(lyrics)
                    break

In [150]:
get_artist(tool_path, 'tool')

**Created directory: tool**
**Looking for Fear Inoculum** 0/83
**Found Lyrics**
**Looking for Pneuma** 1/83
**Found Lyrics**
**Looking for Invincible** 2/83
**Found Lyrics**
**Looking for Descending** 3/83
**Found Lyrics**
**Looking for 7empest** 4/83
**Found Lyrics**
**Looking for Culling Voices** 5/83
**Found Lyrics**
**Looking for Parabola** 6/83
**Found Lyrics**
**Looking for Sweat** 7/83
**Found Lyrics**
**Looking for Vicarious** 8/83
**Found Lyrics**
**Looking for Jambi** 9/83
**Found Lyrics**
**Looking for Wings for Marie, Pt. 1** 10/83
**Found Lyrics**
**Looking for 10,000 Days (Wings, Pt. 2)** 11/83
**Found Lyrics**
**Looking for The Pot** 12/83
**Found Lyrics**
**Looking for Lipan Conjuring** 13/83
**Found Lyrics**
**Looking for Lost Keys (Blame Hofmann)** 14/83
**Found Lyrics**
**Looking for Roseta Stoned** 15/83
**Found Lyrics**
**Looking for Intension** 16/83
**Found Lyrics**
**Looking for Right in Two** 17/83
**Found Lyrics**
**Looking for Viginti Tres** 18/83
**Looking f

In [4]:
def build_corpus(directory):
    corpus = []

    for filename in os.listdir(f'{directory}/'):
        if (not filename.startswith('.')) & os.path.isfile(f'{directory}/{filename}'):
            with open(f'{directory}/' + filename, 'r') as f:
                text = f.read() # split and filter out empty lines
                corpus.append(text)
        
    return corpus

In [5]:
radiohead_corpus = build_corpus('radiohead')
tool_corpus = build_corpus('tool')

combined_corpus = radiohead_corpus + tool_corpus

l1,l2 = len(radiohead_corpus), len(tool_corpus)
labels = [f'radiohead_{i}' for i in range(l1)] + [f'tool_{i}' for i in range(l2)] 

In [6]:
radiohead_corpus[0:5]

["That there\nThat's not me\nI go\nWhere I please\n\nI walk through walls\nI float down the Liffey\nI'm not here\nThis isn't happening\n\nI'm not here\nI'm not here\n\nIn a little while\nI'll be gone\nThe moment's already passed\nYeah it's gone\n\nAnd I'm not here\nThis isn't happening\nI'm not here\nI'm not here\n\nStrobe lights and blown speakers\nFireworks and hurricanes\nI'm not here\nThis isn't happening\nI'm not here\nI'm not here",
 "Are you such a dreamer,\nTo put the world to rights?\nI'll stay home forever,\nWhere two and two always makes a five\n\nI'll lay down the tracks,\nSandbag and hide,\nJanuary has April showers,\nAnd two and two always makes a five\n\nIt's the Devil's way now,\nThere is no way out,\nYou can scream and you can shout,\nIt is too late now\n\nBecause you have not been\nPayin' attention,\nPayin' attention,\nPayin' attention,\nPayin' attention\n\nYeah I feel it, I needed attention,\nPayin' attention,\nPayin' attention,\nPayin' attention\n\nYeah I need it, I

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
vector = vectorizer.fit_transform(combined_corpus)

feature_matrix = pd.DataFrame(
    vector.todense(), 
    columns=vectorizer.get_feature_names_out(),
    index=labels
)

In [8]:
feature_matrix['radiohead'] = [int(index.split('_')[0] == 'radiohead') for index in feature_matrix.index]
feature_matrix['radiohead'].value_counts()

1    158
0     59
Name: radiohead, dtype: int64

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [10]:
X_lr = feature_matrix.drop(columns='radiohead')
y_lr = feature_matrix['radiohead']
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X_lr, y_lr, test_size = 0.2, random_state=42, stratify=y_lr)

m = LogisticRegression()
m.fit(X_train_lr, y_train_lr)

In [11]:
m.score(X_test_lr, y_test_lr)

0.7272727272727273

In [12]:
y_train_lr.value_counts(normalize=True)

1    0.728324
0    0.271676
Name: radiohead, dtype: float64

In [13]:
y_test_lr.value_counts(normalize=True)

1    0.727273
0    0.272727
Name: radiohead, dtype: float64

# Naive Bayes

In [14]:
import nltk
#nltk.download('omw-1.4')
#nltk.download("wordnet")
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer

In [15]:
CORPUS = [s.lower() for s in combined_corpus]

In [16]:
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()

CLEAN_CORPUS = []

for doc in CORPUS:
    tokens = tokenizer.tokenize(text=doc)
    clean_doc = " ".join(lemmatizer.lemmatize(token) for token in tokens)
    CLEAN_CORPUS.append(clean_doc)

In [17]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

STOPWORDS = stopwords.words('english')

In [18]:
LABELS = [label.split('_')[0] for label in labels]

In [19]:
vectorizer = TfidfVectorizer(stop_words=STOPWORDS)

CLEAN_CORPUS_train, CLEAN_CORPUS_test, LABELS_train, LABELS_test = train_test_split(CLEAN_CORPUS, LABELS, test_size = 0.2, random_state=42, stratify=LABELS)

vectors_train = vectorizer.fit_transform(CLEAN_CORPUS_train)
vectors_test = vectorizer.transform(CLEAN_CORPUS_test)

In [20]:
pd.DataFrame(vectors_train.todense(), columns=vectorizer.get_feature_names_out(), index=LABELS_train)

Unnamed: 0,180,187,1994,21st,3018,58,900,abandon,ability,absolutely,...,yesterday,yet,yiddish,young,younger,yuppie,zauberworter,zion,zugeben,zweihundert
radiohead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
radiohead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
radiohead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
radiohead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
radiohead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
radiohead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
radiohead,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tool,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.060701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
tool,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
from sklearn.naive_bayes import MultinomialNB

In [22]:
model_nb = MultinomialNB()
model_nb.fit(vectors_train, LABELS_train)

In [23]:
model_nb.score(vectors_train, LABELS_train)

0.791907514450867

In [24]:
model_nb.score(vectors_test, LABELS_test)

0.7272727272727273

In [55]:
new_lyrics = ['a roll', 'something has to change']
new_vectors = vectorizer.transform(new_lyrics)
model_nb.predict(new_vectors)

array(['radiohead', 'radiohead'], dtype='<U9')

In [56]:
model_nb.predict_proba(new_vectors)

array([[0.75542138, 0.24457862],
       [0.71550424, 0.28449576]])

In [57]:
model_nb.classes_

array(['radiohead', 'tool'], dtype='<U9')

# Class Balancing

In [58]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

In [85]:
sm = SMOTE(sampling_strategy={'tool': 125}, random_state=42)
vectors_train_smote, LABELS_train_smote = sm.fit_resample(np.asarray(vectors_train.todense()), LABELS_train)

In [60]:
tl = TomekLinks(sampling_strategy='all')
vectors_train_tl, LABELS_train_tl = tl.fit_resample(vectors_train_smote, LABELS_train_smote)

pd.Series(LABELS_train_tl).value_counts()

radiohead    126
tool         125
dtype: int64

In [89]:
model_nb_tl = MultinomialNB()
model_nb_tl.fit(vectors_train_tl, LABELS_train_tl)

In [90]:
model_nb_tl.score(vectors_train_tl,LABELS_train_tl)

0.9880478087649402

In [88]:
model_nb_tl.score(vectors_test,LABELS_test)

0.6363636363636364

In [64]:
model_nb_tl.predict(new_vectors)

array(['radiohead', 'tool'], dtype='<U9')

In [65]:
model_nb_tl.predict_proba(new_vectors)

array([[0.57157327, 0.42842673],
       [0.32147617, 0.67852383]])

In [66]:
model_nb_tl.classes_

array(['radiohead', 'tool'], dtype='<U9')

In [119]:
import pickle

with open('lyrics_classifier.pkl', 'wb') as my_file:
	pickle.dump(model_nb_tl, my_file)

with open('lyrics_classifier_vectorizer.pkl', 'wb') as my_file:
	pickle.dump(vectorizer, my_file)

# Pipeline

In [94]:
from imblearn.pipeline import Pipeline as imbPipeline

pipeline = imbPipeline([
    ('smote', SMOTE(sampling_strategy={'tool': 125}, random_state=42)),
    ('tl', TomekLinks(sampling_strategy='all')),
    ('model_mnb', MultinomialNB())
])

In [95]:
pipeline.fit(vectors_train, LABELS_train)
pipeline.score(vectors_train, LABELS_train)

0.9826589595375722

In [96]:

pipeline.score(vectors_test, LABELS_test)

0.6363636363636364