In [94]:
# Imports
import re
import string

import nltk
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


In [19]:
# Load the song dataset (song, artist, lyrics, sentiment columns) from disk
df = pd.read_csv('Dataset.csv')
# Plain-text preview of the first five rows
print(df.head(5))

               song        artist  \
0         Let It Go     James Bay   
1      Love Someone  Lukas Graham   
2      Lose Control   Teddy Swims   
3  Beautiful Things  Benson Boone   
4     Make You Mine        PUBLIC   

                                              lyrics  sentiment  
0  From walking home and talking loads\nTo seeing...          0  
1  There are days\nI wake up and I pinch myself\n...          0  
2  Something's got a hold of me lately\nNo, I don...          0  
3  For a while there it was rough\nBut lately, I'...          0  
4  Girl, I will call you darling and everything w...          0  


In [141]:
# Main class for preprocessing
class Preprocessor:
    """Text-cleaning steps for a list of song lyrics.

    Every step takes a sequence of entries and returns a new list of the same
    length. Entries that a step discards are replaced by the empty string
    (never dropped), so positions stay aligned with the source rows.
    """

    # Calculate the number of entries that got changed after processing
    def calculate_differences(self, p_list1, p_list2):
        """Return how many positions hold different values in the two sequences.

        Entries are compared pairwise in iteration order; if the lengths
        differ, the surplus entries of the longer sequence are ignored.
        """
        return sum(1 for before, after in zip(p_list1, p_list2) if before != after)

    # Function that tags the part of speech
    def pos_tagger(self, nltk_tag):
        """Map a tag produced by ``nltk.pos_tag`` to the matching WordNet constant.

        Returns ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV`` for tags starting with 'J', 'V', 'N' or 'R', and
        ``None`` for every other tag.
        """
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:
            return None

    # Remove elements that are not character strings
    def remove_non_str_elements(self, p_list):
        """Replace every entry that is not a ``str`` (NaN, numbers, None, ..) with ''.

        Prints how many such entries were found and returns the new list.
        """
        non_str_count = sum(1 for sentence in p_list if not isinstance(sentence, str))
        print(f'### Remove elements that are no strings of characters: {non_str_count} found')

        return [sentence if isinstance(sentence, str) else '' for sentence in p_list]

    # Decide whether a single entry is written in the wanted language
    def _is_language(self, sentence, lg):
        """Return True when ``detect`` reports language code ``lg`` for ``sentence``.

        Empty strings are never passed to ``detect``. Text for which ``detect``
        raises ``LangDetectException`` is treated as not being in ``lg``
        instead of aborting the whole run.
        """
        if sentence == '':
            return False
        try:
            return detect(sentence) == lg
        except LangDetectException:
            return False

    # Remove elements from a different languages than 'lg'
    def remove_different_lg_elem(self, p_list, lg = 'en'):
        """Blank out ('' replacement) every entry whose detected language is not ``lg``.

        Args:
            p_list: sequence of strings (run ``remove_non_str_elements`` first).
            lg: language code compared against the result of ``detect``.

        Prints the number of entries that were blanked (entries that were
        already '' are not counted) and returns the new list.
        """
        processed_list = [sentence if self._is_language(sentence, lg) else '' for sentence in p_list]

        # Number of elements from other languages
        detect_count = self.calculate_differences(p_list, processed_list)

        print(f'### Remove songs from different languages: {detect_count} found.')

        return processed_list

    # Remove punctuation from the elements
    def remove_punctuation(self, p_list):
        """Delete ASCII punctuation and U+2019 from every entry.

        Whitespace (including newlines) is left untouched.
        """
        # ASCII punctuation plus 'RIGHT SINGLE QUOTATION MARK' (typographic apostrophe)
        exclude = string.punctuation + '\u2019'
        # Translation table that maps every excluded character to "deleted"
        strip_table = str.maketrans('', '', exclude)

        processed_list = [sentence.translate(strip_table) for sentence in p_list]
        print(f'### Remove punctuation')

        return processed_list

    # Convert elements to lower case
    def lower_case(self, p_list):
        """Return the entries converted to lower case."""
        processed_list = [sentence.lower() for sentence in p_list]
        print(f'### Converting to lowercase')
        return processed_list

    # Function that returns a lemmatized list
    def lemmatize_list(self, p_list):
        """Lemmatize every entry token by token.

        Each entry is tokenized with ``nltk.word_tokenize`` and tagged with
        ``nltk.pos_tag``; tokens whose tag maps to a WordNet part of speech
        (see ``pos_tagger``) are lemmatized with it, the others are kept as
        they are. The tokens of an entry are re-joined with single spaces,
        so the original line breaks are not preserved.
        """
        print("### Lemmatize content")
        # Returned lemmatized list
        processed_list = []
        # Instantiate the lemmatizer class
        lemmatizer = WordNetLemmatizer()

        for sentence in p_list:
            # Use pos_tag to get complete part of speech: 'DT', 'VBZ', 'VBG', ..
            pos_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            lemmatized_sentence = []
            for word, nltk_tag in pos_tagged:
                # Simplify the tag for lemmatization: 'n', 'v', 'a', 'r' or None
                tag = self.pos_tagger(nltk_tag)
                if tag is None:
                    # if there is no available tag, append the token as is
                    lemmatized_sentence.append(word)
                else:
                    # else use the tag to lemmatize the token
                    lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
            processed_list.append(" ".join(lemmatized_sentence))

        return processed_list

    # Function that removes numbers from the strings
    def remove_numbers(self, p_list):
        """Delete every ASCII digit from the entries and print how many digits were found."""
        occurences = sum([len(re.findall(r'[0-9]', sentence)) for sentence in p_list])
        processed_list = [re.sub(r'[0-9]', '', sentence) for sentence in p_list]
        print(f'### Removing numbers: {occurences} found')
        return processed_list

    # Replace words with correct versions of them: we ll, do nt ..
    def replace_broken_words(self, p_list):
        """Replace contraction fragments (left after tokenizing and removing
        punctuation) with full words, e.g. 'we ll' -> 'we will'.

        Entries are split on whitespace and re-joined with single spaces.
        A fragment that maps to '' (the 'na' of 'gon na' / 'wan na') is
        dropped entirely so that it does not leave a double space behind.
        """
        print('### Replacing broken words')
        # Words that will be replaced
        replace_dict = {'ll' : 'will', 'nt' : 'not', 'm' : 'am', 've' : 'have', 'ca': 'can', 's' : 'is',
                        'wan' : 'wanna', 'na' : '', 'gon' : 'gonna', 're' : 'are', 'wo' : 'will'}
        # Final replaced lyrics
        processed_list = []
        for sentence in p_list:
            replaced_words = (replace_dict.get(word, word) for word in sentence.split())
            # Skip empty replacements: joining them would produce consecutive spaces
            processed_list.append(' '.join(word for word in replaced_words if word))
        return processed_list

    # Replacing extra characters to increase collision
    def replace_duplicate_letters(self, p_list):
        """Collapse runs of four or more identical lowercase ASCII letters to two.

        Example: "aaaaaahhhhhhhaaaaaa" becomes "aahhaa". Runs of up to three
        letters and uppercase letters are left unchanged.
        """
        # Replacing extra characters
        print('### Replacing extra characters')
        # A letter followed by at least three repetitions of itself -> the letter twice
        processed_list = [re.sub(r'([a-z])\1{3,}', r'\1\1', sentence) for sentence in p_list]

        return processed_list

    # Main function for preprocessing
    def main(self, p_list, remove_non_strings=True, remove_non_english=True, remove_punctuation=True, lower_case=True, lemmatize=True,
             remove_numbers=True, broken_words=True, replace_extra=True):
        """Run the enabled preprocessing steps in a fixed order and return the result.

        Order: non-string removal, language filter, punctuation removal,
        lower-casing, lemmatization, digit removal, broken-word replacement,
        duplicate-letter collapsing. Each boolean flag switches the step of
        the same name on or off.

        Args:
            p_list: sequence of lyrics. When ``remove_non_strings`` is False
                every entry must already be a ``str``, because the later
                steps call string methods on the entries.

        Returns:
            A new list with one processed string per input entry.
        """
        print('## Preprocessing..')

        # Start from a copy of the input so that every step has something to
        # work on, even when the first step(s) are switched off
        preprocessed_list = list(p_list)

        # Remove non string elements
        if remove_non_strings:
            preprocessed_list = self.remove_non_str_elements(preprocessed_list)

        # Remove elements that are in other languages than english
        if remove_non_english:
            preprocessed_list = self.remove_different_lg_elem(preprocessed_list)

        # Remove punctuation from elements
        if remove_punctuation:
            preprocessed_list = self.remove_punctuation(preprocessed_list)

        # Convert characters to lowercase
        if lower_case:
            preprocessed_list = self.lower_case(preprocessed_list)

        # Lemmatize elements
        if lemmatize:
            preprocessed_list = self.lemmatize_list(preprocessed_list)

        # Remove numbers
        if remove_numbers:
            preprocessed_list = self.remove_numbers(preprocessed_list)

        # Replace broken words
        if broken_words:
            preprocessed_list = self.replace_broken_words(preprocessed_list)

        # Collapse long runs of the same letter
        if replace_extra:
            preprocessed_list = self.replace_duplicate_letters(preprocessed_list)

        print()

        return preprocessed_list


In [24]:
# Caching expensive data: language detection and lemmatization are run once
# here so that later cells can reuse the results

preprocessor = Preprocessor()

# Untouched copy of the lyrics column
raw_lyrics = df['lyrics'].copy()

# English-only lyrics, not lemmatized: non-string entries and entries in
# other languages are replaced by ''
english_lyrics = preprocessor.remove_different_lg_elem(
    preprocessor.remove_non_str_elements(df['lyrics'])
)

# Lemmatized version of the English-only lyrics
lemmatized_lyrics = preprocessor.lemmatize_list(english_lyrics)

### Remove elements that are no strings of characters: 1 found
### Remove songs from different languages: 440 found.


In [142]:
# Input of the preprocessing run: the cached, already lemmatized lyrics
USED_LYRICS = lemmatized_lyrics

# Main function
def main(p_list):
    """Run the Preprocessor pipeline on ``p_list`` without the language filter
    and without lemmatization, print the first ten results and return the
    processed list.
    """
    print('## Main function..')

    # Fresh preprocessing object for this run
    cleaner = Preprocessor()

    # Language filtering and lemmatization are switched off for this run
    cleaned = cleaner.main(p_list, lemmatize=False, remove_non_english=False)

    # Show a sample of the processed data
    print(cleaned[:10])

    return cleaned

processed_lyrics = main(USED_LYRICS)

## Main function..
## Preprocessing..
### Remove elements that are no strings of characters: 0 found
### Remove punctuation
### Converting to lowercase
### Removing numbers: 9880 found
### Replacing broken words
### Replacing extra characters

['from walk home and talk load to see show in even clothes with you from nervous touch and get drunk to stay up and wake up with you now we are sleep near the edge holding something we do not need oh this delusion in our head is gonna  bring us to our knee so come on let it go just let it be why do not you be you and i will be me everything that is broke leave it to the breeze why do not you be you and i will be me and i will be me from throw clothes across the floor to teeth and claw and slam door at you if this be all we are live for why be we do it do it do it anymore i use to recognize myself it is funny how reflection change when we are become something else i think it is time to walk away so come on let it go just let it be why do not you b

In [143]:
# Processed dataset example
preprocessed_dataset = df.copy()
preprocessed_dataset['lyrics'] = processed_lyrics

# Remove rows whose lyrics were discarded during preprocessing (different
# language, non-string value, ..). Discarded lyrics are stored as '' rather
# than NaN, so dropna() would keep them; filter on the text itself instead.
has_lyrics = preprocessed_dataset['lyrics'].fillna('').astype(str).str.strip() != ''
preprocessed_dataset = preprocessed_dataset[has_lyrics]

# Export dataset
preprocessed_dataset.to_csv('Preprocessed_Dataset.csv')
