# Project Text Classification: Lyrics model

Build a text classification model to predict the artist from a piece of text.

- Download HTML pages
- Get a list of song urls
- Extract lyrics from song urls
- Convert text to numbers by applying the Bag Of Words method
- Build and train a Naive Bayes classifier
- Balance out your dataset
- Write a command-line interface
- Give a 5-minute lightning talk by the end of the week

In [1]:
import os
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, CategoricalNB

from sklearn.pipeline import make_pipeline

import sys

# Download HTML pages

## Find and save all links for lyrics as text files

In [2]:
def save_all_lyrics(url, directory):
    """
    Download each song lyrics from every lyric html page from the artist page.
    
    Parameters
    ----------
    url = url link of the artist page
    directory = directory where you want to save all the file
    both parameters need to be strings
    
    Returns
    -------
    download song files into chosen directory.
    
    """
    #DOWNLOAD ARTIST URL AS TEXT FILE
    request = requests.get(url)         # Send the request
    with open(str(url.split('/')[-2]) + '.txt', 'w') as file:     # Save the html in a txt file and search in an editor
        file.write(request.text)
        
    #CREATE BEAUTIFULSOUP FOR PARSING AND SELECTING LYRIC LINKS
    text = request.text                # Get donwloaded text file from original url
    artist_soup = BeautifulSoup(text, 'html.parser')    #Use beautifulsoup for parsing
    
    
    #LOOP FOR ADDING ALL LYRIC LINKS INTO A LIST
    links = []   # Create list with links
    for td in artist_soup.find_all('td'):
        if "tal" in td.get('class',[]):                                      # selection according to parsing
              links.append('https://www.lyrics.com' + td.find('a')['href'])  # append each link into the list with complete url
                
    
    #LOOP TO CREATE LYRIC TEXT FILES FOR EACH LYRIC LINK
    for i in range(len(links)):
        temp_url = links[i]              #create temporary links for each link on the links list
        title = temp_url.split('/')[-1]  #create title based on the temp lyric url

        temp_req = requests.get(temp_url) #request each temp lyric link

        with open(directory + title + '.txt',  'w') as file:
            soup_artist = BeautifulSoup(temp_req.text)    #create a bsoup out of each lyric file
            lyrics = soup_artist.pre.get_text()           #get only the text from lyrics
            file.writelines(lyrics)

In [3]:
save_all_lyrics('https://www.lyrics.com/artist/Black-Sabbath/3693', 'BLACKSabbath/')

In [4]:
save_all_lyrics('https://www.lyrics.com/artist/Led-Zeppelin/4739', 'LEDZeppelin/')

In [5]:
save_all_lyrics('https://www.lyrics.com/artist/Funkadelic/4323', 'FUNKADELIC/')

## Create a list with every lyrics corpus and a list with the artist name

In [6]:
corpus = []
label = []

In [7]:
def create_corpuslist(directory, artist_name):
    """
    Create a list out of every song lyrics downloaded.
    
    Parameters
    ----------
    Directory = the directory where you want to get all files from and storage the text
    in the corpus list.Directory must be passed as a string,
    artist_name = name of the artist, must be passed as string
    
    Returns
    -------
    a list, in which each item is a song lyric corpus, and a list with the artist name.

    """
    #LOOP FOR ADDING LYRIC FILES INTO A LIST
    list = os.listdir(directory)
    list_size = len(label)  #original size
    
    for i in range(len(list)):
        title = list[i]
        label.append(artist_name)

        with open(directory + title,'r') as reader:

            doc= reader.read()
            doc.lower()
            doc.split()
            reader.close
            corpus.append(doc)
            
            
    print (artist_name, (len(label) - list_size))
    print ('Do we have as many song lyrics as artist indices?: '
               + str(len(corpus) == len(label)))

In [8]:
create_corpuslist('BLACKSabbath/', 'Black Sabbath')

Black Sabbath 296
Do we have as many song lyrics as artist indices?: True


In [9]:
create_corpuslist('LEDZeppelin/', 'Led Zeppelin')

Led Zeppelin 224
Do we have as many song lyrics as artist indices?: True


In [10]:
create_corpuslist('FUNKADELIC/', 'Funkadelic')

Funkadelic 54
Do we have as many song lyrics as artist indices?: True


# Create vectors

In [11]:
def vectors_and_df(corpus, label):
    """creates vectors for songs and returns dataframe with songs as word vectors 
    by all artists"""
    
    cv = TfidfVectorizer(stop_words="english")
    cv.fit(corpus)
    corpus_vecs = cv.transform(corpus)
    
    return pd.DataFrame(corpus_vecs.todense(), index=label, 
                        columns=cv.get_feature_names()), cv

In [12]:
# Store results into dataframe, keep cv for later prediction
df, cv = vectors_and_df(corpus, label)

## Train/test split

In [13]:
# Define features and target column
X = df
y = df.index

In [14]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2)

# Test for models scores

In [15]:
models_params = {
    "MultinomialNB": {"alpha": 0.005},
    "CategoricalNB": {"alpha": 0.01},
    "RandomForestClassifier": {
        "n_estimators": 500,
        "max_depth": 200,
        "max_features": "auto",
        "n_jobs": -1,
        "random_state": 1,
    },
    "LogisticRegression": {"C": 1e6},
}

def train_models(models_params):
    """trains models on corpus and returns dataframe with scores"""
    
    scores = {}
    for model in models_params:
        if model == "LogisticRegression":
            m = LogisticRegression(**models_params[model])
        elif model == "RandomForestClassifier":
            m = RandomForestClassifier(**models_params[model])
        elif model == "MultinomialNB":
            m = MultinomialNB(**models_params[model])
        elif model == "CategoricalNB":
            m = MultinomialNB(**models_params[model])

        m.fit(Xtrain, ytrain)
        score_train = m.score(Xtrain, ytrain)
        score_test = m.score(Xtest, ytest)
        scores[f"{model}"] = {
            "params": models_params[model],
            "train score": score_train,
            "test score": score_test,
            }
    return pd.DataFrame(scores).T

In [16]:
df_scores = train_models(models_params)
df_scores

Unnamed: 0,params,train score,test score
MultinomialNB,{'alpha': 0.005},1.0,0.930435
CategoricalNB,{'alpha': 0.01},0.997821,0.93913
RandomForestClassifier,"{'n_estimators': 500, 'max_depth': 200, 'max_f...",1.0,0.930435
LogisticRegression,{'C': 1000000.0},1.0,0.913043


# Train data with best model

In [17]:
# Train on most promising model
model = "MultinomialNB"
m = MultinomialNB(**models_params[model])
m.fit(X, y)
m.score(X, y)

0.9930313588850174

## Prediction model

In [18]:
def predict(new_text):
    
    """
    Takes the pre-trained model pipeline and predicts new artist based on unseen text.
    
    Parameters
    ----------
    model : Trained scikit-learn model pipeline.
    new_text : str
    
    Returns
    ---------
    prediction : str
    
    """
    songlyrics = [new_text]
    # transform song into vector matrix
    new_song_vecs = cv.transform(songlyrics)
    ynew = new_song_vecs.todense()
    
    prediction = m.predict(ynew)
    
    return prediction[0]

In [20]:
if __name__ == '__main__':
    # Whatever happens after this line, execute it when running "python lyrics_classifier.py"
    # and DO NOT execute these lines of code if things from this script are imported from other scripts.
    
    user_input = input('Please Enter Some Text: ')

    prediction = predict(user_input)
    print('Here is your prediction!')
    print(prediction)

Please Enter Some Text: Now I've got that feeling once again
Here is your prediction!
Black Sabbath
