## Merging the dataset with genre column

In [None]:
# import libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import stopwords
# Download the NLTK data packages that the functions in this notebook rely on.
nltk.download('wordnet')  # WordNet data used by WordNetLemmatizer
nltk.download('punkt')  # tokenizer models used by sent_tokenize / word_tokenize
nltk.download('averaged_perceptron_tagger')  # tagger model used by nltk.pos_tag
nltk.download('stopwords')  # word lists used by stopwords.words

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Load the training data. Its 'Lyric' column holds the raw song lyrics that
# the pre-processing pipeline defined below is applied to.
df = pd.read_csv('data/training_data.csv')

### Defining functions for pre-processing

In [None]:
def convert_to_lowercase(song_lyrics):
    '''
      Convert the lyrics of one song to lower case.

      song_lyrics: lyrics of a single song. A value that is not already a
      string is converted with str() first.
      Returns the lower-cased text as a str.
    '''
    # The pipeline is applied per cell, so the argument is a plain str; the
    # `.str` accessor only exists on a pandas Series, not on str.
    return str(song_lyrics).lower()

In [None]:
# lemmatization 
def lemmatize(song_lyrics):
    '''
      Lemmatize every word of every sentence.

      song_lyrics: iterable of sentence strings; each one is tokenized with
      nltk.word_tokenize.
      Returns one list of lemmas per sentence, in input order.
    '''
    wn_lemmatizer = WordNetLemmatizer()

    def get_wordnet_pos(word):
      '''
        Tag the word on its own and map the first letter of its POS tag to
        the matching WordNet constant (adjective, verb or adverb). Every
        other tag is treated as a noun.
      '''
      first_letter = nltk.pos_tag([word])[0][1][0].upper()
      if first_letter == "J":
        return wordnet.ADJ
      if first_letter == "V":
        return wordnet.VERB
      if first_letter == "R":
        return wordnet.ADV
      return wordnet.NOUN

    lemmatized_sents = []
    for sent in song_lyrics:
      tokens = nltk.word_tokenize(sent)
      lemmatized_sents.append(
          [wn_lemmatizer.lemmatize(tok, get_wordnet_pos(tok)) for tok in tokens])
    return lemmatized_sents

In [None]:
def remove_end_of_line(song_lyrics):
  '''
    Turn line breaks into sentence boundaries and remove special characters.

    song_lyrics: lyrics as a single string.
    Returns the text with every character other than word characters,
    whitespace and periods removed, and with each run of line breaks
    replaced by a single '. '.
  '''
  # Strip punctuation first so that "ya,\n" and "ya\n" are handled alike.
  song_lyrics = re.sub(r'[^\w\s\.]', '', song_lyrics)
  # Also consume periods/blanks right before the break and any blank lines
  # after it, so a line that already ends in '.' does not turn into '.. '
  # and an empty line does not leave a stray '. . ' behind.
  return re.sub(r'[ \t\r.]*\n\s*', '. ', song_lyrics)

In [None]:
# Build the stop-word lookup: NLTK's English stop words plus a copy of each
# one passed through remove_end_of_line. The lyrics go through the same
# character stripping before stop words are filtered, so both spellings of a
# word must be present in the set.
stop_words = set(stopwords.words('english'))
stop_words_without_punct = set(stop_words)
for word in stop_words:
  stop_words_without_punct.add(remove_end_of_line(word))

In [None]:
def remove_stop_words_and_lemmatize(song_lyrics):
  '''
    Drop stop words from each sentence and lemmatize the remaining words.

    song_lyrics: iterable of sentence strings; each one is split on
    whitespace.
    Returns one list of lemmas per sentence, in input order. A sentence that
    consists only of stop words yields an empty list.

    Stop words are looked up in the module-level set
    stop_words_without_punct.
  '''
  # Built once per call instead of once per word.
  tag_dict = {"J": wordnet.ADJ,
              "N": wordnet.NOUN,
              "V": wordnet.VERB,
              "R": wordnet.ADV}

  def get_wordnet_pos(word):
    '''
      Tag the word on its own and map the first letter of its POS tag to a
      WordNet constant; any tag not in tag_dict is treated as a noun.
    '''
    tag = nltk.pos_tag([word])[0][1][0].upper()
    return tag_dict.get(tag, wordnet.NOUN)

  lemmatizer = WordNetLemmatizer()
  return [
      [lemmatizer.lemmatize(word, get_wordnet_pos(word))
       for word in sent.split()
       if word not in stop_words_without_punct]
      for sent in song_lyrics
  ]

In [None]:
def tokenize_each_sent(song_lyrics):
  '''
    Split every sentence into its word tokens.

    song_lyrics: iterable of sentence strings.
    Returns one token list per sentence, in input order.
  '''
  return [word_tokenize(sentence) for sentence in song_lyrics]

In [None]:
def remove_square_brackets(song_lyrics):
  '''
    Strip section markers such as [Intro] from the lyrics.

    Each '[' up to the nearest following ']' on the same line is deleted,
    brackets included. The rest of the text is returned unchanged.
  '''
  return re.sub(r'\[.*?\]', '', song_lyrics)

In [None]:
def remove_period_at_end(song_lyrics):
  '''
    Remove the period at the end of each sentence.

    song_lyrics: iterable of sentence strings.
    Returns a list with one string per sentence. Only a trailing '.' is
    removed; a sentence that does not end in a period is returned unchanged.
  '''
  # Check before slicing: a sentence may have no final period (e.g. the last
  # line of a song without a trailing line break), and slicing blindly would
  # cut off its last letter.
  return [sent[:-1] if sent.endswith('.') else sent for sent in song_lyrics]

### Combine all functions in order

In [None]:
def preprocessing(song_lyrics):
  '''
    Run the full pre-processing pipeline on the lyrics of one song.

    Steps, in order: lower-case the text, remove [..] section markers, turn
    line breaks into sentence boundaries and strip special characters, split
    into sentences, drop the trailing period of each sentence, then remove
    stop words and lemmatize.

    Returns a list of sentences, each of them a list of lemmatized words.
  '''
  song_lyrics = convert_to_lowercase(song_lyrics)
  # Must run before remove_end_of_line: that step deletes the '[' and ']'
  # characters themselves, after which a marker such as [Intro] can no longer
  # be recognised and its text would stay in the lyrics.
  song_lyrics = remove_square_brackets(song_lyrics)
  song_lyrics = remove_end_of_line(song_lyrics)
  sentences = sent_tokenize(song_lyrics)
  sentences = remove_period_at_end(sentences)
  return remove_stop_words_and_lemmatize(sentences)

### Apply the preprocessing function to the Lyric column

In [None]:
# Pre-process every song; the result (a list of word lists per song) is stored
# in a new column next to the raw lyrics.
df['LyricProcessed'] = df['Lyric'].apply(preprocessing)

# Checkpoint the processed frame in the working directory.
# NOTE(review): the row index is written as an extra leading column here.
df.to_csv('training_data_processed.csv')

print("SAVING COMPLETE!")

df.head(10)

SAVING COMPLETE!


Unnamed: 0.1,Unnamed: 0,Artist,SName,Lyric,Genre,LyricProcessed
0,0,Ivete Sangalo,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,pop,"[[feel, unsure], [take, hand, lead, dance, flo..."
1,1,Ivete Sangalo,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",pop,"[[let, fool, ya], [even, try, school, ya, oh],..."
2,2,Ivete Sangalo,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",pop,"[[baby, let, cruise, away], [confuse, way, cle..."
3,3,Ivete Sangalo,Easy,"Know it sounds funny\nBut, I just can't stand ...",pop,"[[know, sound, funny], [cant, stand, pain], [g..."
4,4,Ivete Sangalo,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,pop,"[[get, look], [one, hop, lad], [face, beam], [..."
5,5,Ivete Sangalo,Human Nature,Looking out\nAcross the night time\nThe city w...,pop,"[[look], [across, night, time], [city, wink, s..."
6,6,Ivete Sangalo,Losing Control (Miss Cady feat. Ivete Sangalo),"Uh, yeah.\nGo, go, go.\nUh, yeah.\nUh, Uh, Uhh...",pop,"[[uh, yeah.., go, go, go.., uh, yeah.., uh, uh..."
7,7,Ivete Sangalo,Master Blaster (Jammin'),Everyone's feeling pretty\nIt's hotter than Ju...,pop,"[[everyones, feel, pretty], [hotter, july], [t..."
8,8,Ivete Sangalo,More Than Words,Saying 'I Love you'\nIs not the words I want t...,pop,"[[say, love], [word, want, hear], [want, say],..."
9,9,Ivete Sangalo,Natural Collie,Been down in the valley\nSmoking natural colli...,spanish,"[[valley], [smoking, natural, collie], [get, i..."


In [None]:
# Drop the first column (the leftover 'Unnamed: 0.1' index copy). drop()
# returns a new frame, so df itself keeps all of its columns.
# NOTE(review): a second leftover column, 'Unnamed: 0', is still present.
final_df = df.drop(columns=[df.columns[0]])
final_df.head(10)

Unnamed: 0,Artist,SName,Lyric,Genre,LyricProcessed
0,Ivete Sangalo,Careless Whisper,I feel so unsure\nAs I take your hand and lead...,pop,"[[feel, unsure], [take, hand, lead, dance, flo..."
1,Ivete Sangalo,Could You Be Loved / Citação Musical do Rap: S...,"Don't let them fool, ya\nOr even try to school...",pop,"[[let, fool, ya], [even, try, school, ya, oh],..."
2,Ivete Sangalo,Cruisin' (Part. Saulo),"Baby, let's cruise, away from here\nDon't be c...",pop,"[[baby, let, cruise, away], [confuse, way, cle..."
3,Ivete Sangalo,Easy,"Know it sounds funny\nBut, I just can't stand ...",pop,"[[know, sound, funny], [cant, stand, pain], [g..."
4,Ivete Sangalo,For Your Babies (The Voice cover),You've got that look again\nThe one I hoped I ...,pop,"[[get, look], [one, hop, lad], [face, beam], [..."
5,Ivete Sangalo,Human Nature,Looking out\nAcross the night time\nThe city w...,pop,"[[look], [across, night, time], [city, wink, s..."
6,Ivete Sangalo,Losing Control (Miss Cady feat. Ivete Sangalo),"Uh, yeah.\nGo, go, go.\nUh, yeah.\nUh, Uh, Uhh...",pop,"[[uh, yeah.., go, go, go.., uh, yeah.., uh, uh..."
7,Ivete Sangalo,Master Blaster (Jammin'),Everyone's feeling pretty\nIt's hotter than Ju...,pop,"[[everyones, feel, pretty], [hotter, july], [t..."
8,Ivete Sangalo,More Than Words,Saying 'I Love you'\nIs not the words I want t...,pop,"[[say, love], [word, want, hear], [want, say],..."
9,Ivete Sangalo,Natural Collie,Been down in the valley\nSmoking natural colli...,spanish,"[[valley], [smoking, natural, collie], [get, i..."


### Save dataset

In [None]:
# index=False keeps the row index out of the file; otherwise it is written as
# an unnamed first column and shows up as yet another 'Unnamed: 0'-style
# column the next time the CSV is loaded.
final_df.to_csv('data/training_data_processed_stop_word_removed.csv', index=False)