In [9]:
import gensim
import pandas as pd
import numpy as np

In [2]:
# Load the raw song-lyrics CSV. The preview printed below shows its four
# columns: artist, song, link and text.
df = pd.read_csv('songdata.csv')

# Plain-text preview of the first rows.
print(df.head())


  artist                   song                                        link  \
0   ABBA  Ahe's My Kind Of Girl  /a/abba/ahes+my+kind+of+girl_20598417.html   
1   ABBA       Andante, Andante       /a/abba/andante+andante_20002708.html   
2   ABBA         As Good As New        /a/abba/as+good+as+new_20003033.html   
3   ABBA                   Bang                  /a/abba/bang_20598415.html   
4   ABBA       Bang-A-Boomerang      /a/abba/bang+a+boomerang_20002668.html   

                                                text  
0  Look at her face, it's a wonderful face  \nAnd...  
1  Take it easy with me, please  \nTouch me gentl...  
2  I'll never know why I had to go  \nWhy I had t...  
3  Making somebody happy is a question of give an...  
4  Making somebody happy is a question of give an...  


In [31]:
class DataLoader:
    '''Load song lyrics from a CSV file and tokenise them with gensim.

    The CSV is expected to provide the columns ``artist``, ``song``,
    ``link`` and ``text`` (these are the columns the methods below read).

    Typical use::

        loader = DataLoader('songdata.csv')
        lyrics_by_song, word_lists = loader.get_data()
    '''

    # Suffix removed from the last path component of a link to form the ID.
    _LINK_SUFFIX = '.html'

    def __init__(self, input_file='songdata.csv'):
        '''Read ``input_file`` into ``self.df`` and print how many rows were loaded.

        Args:
            input_file: Path (or anything ``pd.read_csv`` accepts) of the CSV.
        '''
        self.df = pd.read_csv(input_file)
        self.num_songs = self.df.shape[0]
        print('{} songs loaded!'.format(self.num_songs))

    def _id(self):
        '''Create an ID for each song from the ``link`` column.

        The ID is the last ``/``-separated component of the link with a
        trailing ``.html`` removed, e.g.
        ``/a/abba/bang_20598415.html`` -> ``bang_20598415``.

        Returns:
            List of ID strings, one per row, also stored in ``self.ids``.
        '''
        self.links = list(self.df['link'].str.split('/'))
        suffix = self._LINK_SUFFIX
        # Do NOT use str.strip('.html') here: strip() treats its argument as a
        # set of characters and would also eat leading/trailing 'h', 't', 'm',
        # 'l' and '.' (turning 'hello_1.html' into 'ello_1').
        self.ids = [
            parts[-1][:-len(suffix)] if parts[-1].endswith(suffix) else parts[-1]
            for parts in self.links
        ]
        return self.ids

    def _preprocess(self):
        '''Tokenise every lyric and index it by song.

        Each entry of the ``text`` column is passed to
        ``gensim.utils.simple_preprocess``:
        https://radimrehurek.com/gensim/utils.html#gensim.utils.simple_preprocess

        Returns:
            Dictionary mapping ``(artist, song_name, ID)`` to the lyrics as a
            list of tokens. Rows that share the same key overwrite each other,
            so the dictionary can hold fewer entries than ``self.num_songs``.
        '''
        self.dict = {}
        self.artists = list(self.df['artist'])
        self.song_names = list(self.df['song'])
        self.ids = self._id()
        self.lyrics = self.df['text']

        for idx, lyric in enumerate(self.lyrics):
            # Lightweight progress report every 5000 songs.
            if idx % 5000 == 0:
                print('Processing lyrics: {}'.format(idx))

            key = (self.artists[idx], self.song_names[idx], self.ids[idx])
            self.dict[key] = gensim.utils.simple_preprocess(lyric)

        print('Pre-processing complete!')
        return self.dict

    def _aggregate(self, data):
        '''Collect the token lists of ``data`` into a list of lists.

        Args:
            data: Dictionary whose values are lists of tokens.

        Returns:
            The values of ``data`` as a list, in the dictionary's iteration
            order. The inner lists are the same objects as in ``data``.
        '''
        return list(data.values())

    def get_data(self):
        '''Run the preprocessing and return both views of the lyrics.

        Returns:
            Tuple ``(lyrics_by_song, word_list_2d)`` where ``lyrics_by_song``
            maps ``(artist, song_name, ID)`` to a list of tokens and
            ``word_list_2d`` is the list of those token lists.
        '''
        self.dict = self._preprocess()
        self.word_list_2d = self._aggregate(self.dict)
        return (self.dict, self.word_list_2d)
    

In [32]:
# Read the default 'songdata.csv' and tokenise every lyric.
# `data` is a 2-tuple: (dict keyed by (artist, song, ID) -> tokens,
# list of per-song token lists).
loader = DataLoader()
data = loader.get_data()

57650 songs loaded!
Processing lyrics: 0
Processing lyrics: 5000
Processing lyrics: 10000
Processing lyrics: 15000
Processing lyrics: 20000
Processing lyrics: 25000
Processing lyrics: 30000
Processing lyrics: 35000
Processing lyrics: 40000
Processing lyrics: 45000
Processing lyrics: 50000
Processing lyrics: 55000
Pre-processing complete!


In [33]:
# Take a look at a few entries in the data

# The dictionary: first key and the token list stored under it.
# next(iter(...)) reads the first item without copying every key/value
# into a temporary list.

print(next(iter(data[0].keys())))
print(next(iter(data[0].values())))


('ABBA', "Ahe's My Kind Of Girl", 'ahes+my+kind+of+girl_20598417')
['look', 'at', 'her', 'face', 'it', 'wonderful', 'face', 'and', 'it', 'means', 'something', 'special', 'to', 'me', 'look', 'at', 'the', 'way', 'that', 'she', 'smiles', 'when', 'she', 'sees', 'me', 'how', 'lucky', 'can', 'one', 'fellow', 'be', 'she', 'just', 'my', 'kind', 'of', 'girl', 'she', 'makes', 'me', 'feel', 'fine', 'who', 'could', 'ever', 'believe', 'that', 'she', 'could', 'be', 'mine', 'she', 'just', 'my', 'kind', 'of', 'girl', 'without', 'her', 'blue', 'and', 'if', 'she', 'ever', 'leaves', 'me', 'what', 'could', 'do', 'what', 'could', 'do', 'and', 'when', 'we', 'go', 'for', 'walk', 'in', 'the', 'park', 'and', 'she', 'holds', 'me', 'and', 'squeezes', 'my', 'hand', 'we', 'll', 'go', 'on', 'walking', 'for', 'hours', 'and', 'talking', 'about', 'all', 'the', 'things', 'that', 'we', 'plan', 'she', 'just', 'my', 'kind', 'of', 'girl', 'she', 'makes', 'me', 'feel', 'fine', 'who', 'could', 'ever', 'believe', 'that', 'she

In [34]:
# First two entries of the 2D list (one list of tokens per song)

print(data[1][:2])

# Shown as the cell's result: number of rows read from the CSV
loader.num_songs

[['look', 'at', 'her', 'face', 'it', 'wonderful', 'face', 'and', 'it', 'means', 'something', 'special', 'to', 'me', 'look', 'at', 'the', 'way', 'that', 'she', 'smiles', 'when', 'she', 'sees', 'me', 'how', 'lucky', 'can', 'one', 'fellow', 'be', 'she', 'just', 'my', 'kind', 'of', 'girl', 'she', 'makes', 'me', 'feel', 'fine', 'who', 'could', 'ever', 'believe', 'that', 'she', 'could', 'be', 'mine', 'she', 'just', 'my', 'kind', 'of', 'girl', 'without', 'her', 'blue', 'and', 'if', 'she', 'ever', 'leaves', 'me', 'what', 'could', 'do', 'what', 'could', 'do', 'and', 'when', 'we', 'go', 'for', 'walk', 'in', 'the', 'park', 'and', 'she', 'holds', 'me', 'and', 'squeezes', 'my', 'hand', 'we', 'll', 'go', 'on', 'walking', 'for', 'hours', 'and', 'talking', 'about', 'all', 'the', 'things', 'that', 'we', 'plan', 'she', 'just', 'my', 'kind', 'of', 'girl', 'she', 'makes', 'me', 'feel', 'fine', 'who', 'could', 'ever', 'believe', 'that', 'she', 'could', 'be', 'mine', 'she', 'just', 'my', 'kind', 'of', 'girl

57650