In [1]:
import numpy as np
import pandas as pd
import string
import random
import re
from collections import Counter, defaultdict
import nltk
from nltk.tokenize import word_tokenize 
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.lm import MLE
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.lm.models import LanguageModel
from sklearn.model_selection import train_test_split
import json
%matplotlib inline  

There are 300 songs lyrics in this file.

In [2]:
file = 'Lyrics_Eminem.json'
with open(file) as train_file:
    dict_eminem = json.load(train_file)

In [3]:
eminem_songs = dict_eminem.get('songs')

In [4]:
eminem_lyrics = []
for i in range(0,len(eminem_songs)):
    dicts = eminem_songs[i]
    song_lyric = dicts.get('lyrics')
    eminem_lyrics.append(song_lyric) 

In [5]:
eminem = pd.DataFrame(eminem_lyrics) 
eminem = eminem.dropna()

In [6]:
eminem = eminem.replace(to_replace = '\n', value = ' ', regex = True) #replacing all new line characters
eminem[0] = eminem[0].str.replace('[^\w\s]','') #replacing all things like '[Intro]'
eminem[0] = eminem[0].str.lower() # changing everything to lowercase

In [7]:
eminem_lyrics = eminem[0].to_list() # creating tokens from all the songs, feel like it's redundant as I do it again later
tokens1 = []
for i in range(len(eminem_lyrics)):
    tokens = nltk.word_tokenize(eminem_lyrics[i])
    tokens1.append(tokens)

In [8]:
tokens = []
for sublist in tokens1:
    for token in sublist:
        tokens.append(token)

In [9]:
len(tokens)

234205

In [10]:
train, test = train_test_split(eminem, test_size = 0.1, random_state = 123) #splitting dataset into train and test

In [11]:
tokens_tr = list(train[0].apply(nltk.word_tokenize)) # tokenizing both sets
tokens_te = list(test[0].apply(nltk.word_tokenize))

In [12]:
tokens_test = [] # creating test tokens
for sublist in tokens_te:
    for token in sublist:
        tokens_test.append(token)

In [13]:
tokens_train = [] # creating train tokens
for sublist in tokens_tr:
    for token in sublist:
        tokens_train.append(token)

In [14]:
print(len(tokens_test), len(tokens_train))

22333 211872


## Kneser-Ney smoothing

I want to thank github and Dasha Dobrego for these chunks
https://github.com/nltk/nltk/pull/2363/commits/ce74e449dc9526e19596b1c4a9c510bbb35812cc

In [15]:
class InterpolatedLanguageModel(LanguageModel):
    """Logic common to all interpolated language models.
    The idea to abstract this comes from Chen & Goodman 1995.
    Do not instantiate this class directly!
    """
    def __init__(self, smoothing_cls, order, **kwargs):
        assert issubclass(smoothing_cls, Smoothing)
        params = kwargs.pop("params", {})
        super().__init__(order, **kwargs)
        self.estimator = smoothing_cls(self.vocab, self.counts, **params)

    def unmasked_score(self, word, context=None):
        if not context:
                return self.estimator.unigram_score(word)
        if not self.counts[context]:
#This conversation was marked as resolved by stevenbird  Show conversation
             # It can also happen that we have no data for this context.
             # In that case we defer to the lower-order ngram.
             # This is the same as setting alpha to 0 and gamma to 1.
             return self.unmasked_score(word, context[1:])
        alpha, gamma = self.estimator.alpha_gamma(word, context)
        return alpha + gamma * self.unmasked_score(word, context[1:])

In [16]:
def _count_non_zero_vals(dictionary):
    return sum(1.0 for c in dictionary.values() if c > 0)

In [17]:
from nltk.lm.models import Smoothing
class KneserNey(Smoothing):
    def __init__(self, vocabulary, counter, discount=0.1, **kwargs):
        super(KneserNey, self).__init__(vocabulary, counter, *kwargs)
        super().__init__(vocabulary, counter, **kwargs)
        self.discount = discount

    def unigram_score(self, word):
        return 1.0 / len(self.vocab)

    def alpha_gamma(self, word, context):
        prefix_counts = self.counts[context]
        prefix_total_ngrams = prefix_counts.N()
        alpha = max(prefix_counts[word] - self.discount, 0.0) / prefix_total_ngrams
        gamma = self.discount * _count_non_zero_vals(prefix_counts) / prefix_total_ngrams
        return alpha, gamma

In [18]:
class KneserNeyInterpolated(InterpolatedLanguageModel):
    def __init__(self, order, discount=0.1, **kwargs):
        super().__init__(KneserNey, order, params={"discount": discount}, **kwargs)

## Fivegram model

In [43]:
paddedLine5 = [list(pad_both_ends(tokens_train, n=5))]
train5, vocab5 = padded_everygram_pipeline(5, paddedLine5)

In [44]:
lyrics_fivemod = KneserNeyInterpolated(5, discount = 0.99) 
lyrics_fivemod.fit(train5, vocab5)

In [45]:
test_fivegrams = ngrams(tokens_test, 5)

In [46]:
lyrics_fivemod.perplexity(test_fivegrams)

798.1132655750536

## Fourgram model

In [19]:
paddedLine4 = [list(pad_both_ends(tokens_train, n=4))]
train4, vocab4 = padded_everygram_pipeline(4, paddedLine4)

In [20]:
lyrics_fourmod = KneserNeyInterpolated(4, discount = 0.99) 
lyrics_fourmod.fit(train4, vocab4)

In [21]:
test_fourgrams = ngrams(tokens_test, 4)

In [22]:
lyrics_fourmod.perplexity(test_fourgrams)

801.465368787004

## Trigram model

In [23]:
paddedLine = [list(pad_both_ends(tokens_train, n=3))]
train3, vocab3 = padded_everygram_pipeline(3, paddedLine)

In [24]:
lyrics_trimod = KneserNeyInterpolated(3, discount = 0.99) 
lyrics_trimod.fit(train3, vocab3)

In [25]:
print(lyrics_trimod.counts)

<NgramCounter with 3 ngram orders and 635637 ngrams>


In [26]:
test_trigrams = ngrams(tokens_test, 3)

In [27]:
lyrics_trimod.perplexity(test_trigrams)

813.716714003829

## Bigram model

In [28]:
paddedLine2 = [list(pad_both_ends(tokens_train, n=2))]
train2, vocab2 = padded_everygram_pipeline(2, paddedLine2)

In [29]:
lyrics_bimod = KneserNeyInterpolated(2, discount = 0.99) 
lyrics_bimod.fit(train2, vocab2)

In [30]:
test_bigrams = ngrams(tokens_test, 2)

In [31]:
print(lyrics_bimod.counts)

<NgramCounter with 2 ngram orders and 423751 ngrams>


In [32]:
lyrics_bimod.perplexity(test_bigrams)

891.4658919531347

So far the 5gram models has shown the best results, but they are still pretty bad

## Adding more vocabulary

Seems like there's not enough data, so I'll add more songs of other rappers. Added 800 songs in total. 

Addendum: added another 1k songs after the first 800 didnt do much

Addendum 2.0: added around 1.5k more because the previous 1.8k weren't enough

In [47]:
file = 'Lyrics_KendrickLamar.json'
with open(file) as train_file:
    dict_kendrick = json.load(train_file)
    
kendrick_songs = dict_kendrick.get('songs')

kendrick_lyrics = []
for i in range(0,len(kendrick_songs)):
    dicts = kendrick_songs[i]
    song_lyric = dicts.get('lyrics')
    kendrick_lyrics.append(song_lyric)

kendrick = pd.DataFrame(kendrick_lyrics) 

In [48]:
file = 'Lyrics_KanyeWest.json'
with open(file) as train_file:
    dict_kanye = json.load(train_file)

kanye_songs = dict_kanye.get('songs')

kanye_lyrics = []
for i in range(0,len(kanye_songs)):
    dicts = kanye_songs[i]
    song_lyric = dicts.get('lyrics')
    kanye_lyrics.append(song_lyric) 

kanye = pd.DataFrame(kanye_lyrics) 

In [49]:
file = 'Lyrics_2Pac.json'
with open(file) as train_file:
    dict_2pac = json.load(train_file)
    
dict_2pac_songs = dict_2pac.get('songs')

pac_lyrics = []
for i in range(0,len(dict_2pac_songs)):
    dicts = dict_2pac_songs[i]
    song_lyric = dicts.get('lyrics')
    pac_lyrics.append(song_lyric)
    
pac = pd.DataFrame(pac_lyrics) 

In [50]:
file = 'Lyrics_JAYZ.json'
with open(file) as train_file:
    dict_jayz = json.load(train_file)
    
jay_songs = dict_jayz.get('songs')

jay_lyrics = []
for i in range(0,len(jay_songs)):
    dicts = jay_songs[i]
    song_lyric = dicts.get('lyrics')
    jay_lyrics.append(song_lyric)
    
jay = pd.DataFrame(jay_lyrics) 

In [51]:
file = 'Lyrics_LilWayne.json'
with open(file) as train_file:
    dict_wayne = json.load(train_file)
    
wayne_songs = dict_wayne.get('songs')

wayne_lyrics = []
for i in range(0,len(wayne_songs)):
    dicts = wayne_songs[i]
    song_lyric = dicts.get('lyrics')
    wayne_lyrics.append(song_lyric)
    
wayne = pd.DataFrame(wayne_lyrics) 

In [52]:
file = 'Lyrics_SnoopDogg.json'
with open(file) as train_file:
    dict_snoop = json.load(train_file)
    
snoop_songs = dict_snoop.get('songs')

snoop_lyrics = []
for i in range(0,len(snoop_songs)):
    dicts = snoop_songs[i]
    song_lyric = dicts.get('lyrics')
    snoop_lyrics.append(song_lyric)
    
snoop = pd.DataFrame(snoop_lyrics) 

In [53]:
file = 'Lyrics_50Cent.json'
with open(file) as train_file:
    dict_fifty = json.load(train_file)
    
fifty_songs = dict_fifty.get('songs')

fifty_lyrics = []
for i in range(0,len(fifty_songs)):
    dicts = fifty_songs[i]
    song_lyric = dicts.get('lyrics')
    fifty_lyrics.append(song_lyric)
    
fifty = pd.DataFrame(fifty_lyrics) 

In [54]:
file = 'Lyrics_Dr.Dre.json'
with open(file) as train_file:
    dict_dre = json.load(train_file)
    
dre_songs = dict_dre.get('songs')

dre_lyrics = []
for i in range(0,len(dre_songs)):
    dicts = dre_songs[i]
    song_lyric = dicts.get('lyrics')
    dre_lyrics.append(song_lyric)
    
dre = pd.DataFrame(dre_lyrics) 

In [55]:
file = 'Lyrics_TheNotoriousB.I.G..json'
with open(file) as train_file:
    dict_biggy = json.load(train_file)
    
biggy_songs = dict_biggy.get('songs')

biggy_lyrics = []
for i in range(0,len(biggy_songs)):
    dicts = biggy_songs[i]
    song_lyric = dicts.get('lyrics')
    biggy_lyrics.append(song_lyric)
    
biggy = pd.DataFrame(biggy_lyrics) 

In [56]:
file = 'Lyrics_WizKhalifa.json'
with open(file) as train_file:
    dict_wiz = json.load(train_file)
    
wiz_songs = dict_wiz.get('songs')

wiz_lyrics = []
for i in range(0,len(wiz_songs)):
    dicts = wiz_songs[i]
    song_lyric = dicts.get('lyrics')
    wiz_lyrics.append(song_lyric)
    
wiz = pd.DataFrame(wiz_lyrics) 

In [57]:
file = 'Lyrics_AAPRocky.json'
with open(file) as train_file:
    dict_rocky = json.load(train_file)
    
rocky_songs = dict_rocky.get('songs')

rocky_lyrics = []
for i in range(0,len(rocky_songs)):
    dicts = rocky_songs[i]
    song_lyric = dicts.get('lyrics')
    rocky_lyrics.append(song_lyric)
    
rocky = pd.DataFrame(rocky_lyrics) 

In [58]:
file = 'Lyrics_EazyE.json'
with open(file) as train_file:
    dict_eazy = json.load(train_file)
    
eazy_songs = dict_eazy.get('songs')

eazy_lyrics = []
for i in range(0,len(eazy_songs)):
    dicts = eazy_songs[i]
    song_lyric = dicts.get('lyrics')
    eazy_lyrics.append(song_lyric)
    
eazy = pd.DataFrame(eazy_lyrics) 

In [59]:
file = 'Lyrics_TechN9ne.json'
with open(file) as train_file:
    dict_tech = json.load(train_file)
    
tech_songs = dict_tech.get('songs')

tech_lyrics = []
for i in range(0,len(tech_songs)):
    dicts = tech_songs[i]
    song_lyric = dicts.get('lyrics')
    tech_lyrics.append(song_lyric)
    
tech = pd.DataFrame(tech_lyrics) 

In [60]:
file = 'Lyrics_OutKast.json'
with open(file) as train_file:
    dict_outkast = json.load(train_file)
    
outkast_songs = dict_outkast.get('songs')

outkast_lyrics = []
for i in range(0,len(outkast_songs)):
    dicts = outkast_songs[i]
    song_lyric = dicts.get('lyrics')
    outkast_lyrics.append(song_lyric)
    
outkast = pd.DataFrame(outkast_lyrics) 

In [61]:
file = 'Lyrics_ChancetheRapper.json'
with open(file) as train_file:
    dict_chance = json.load(train_file)
    
chance_songs = dict_chance.get('songs')

chance_lyrics = []
for i in range(0,len(chance_songs)):
    dicts = chance_songs[i]
    song_lyric = dicts.get('lyrics')
    chance_lyrics.append(song_lyric)
    
chance = pd.DataFrame(chance_lyrics) 

In [62]:
file = 'Lyrics_IceCube.json'
with open(file) as train_file:
    dict_cube = json.load(train_file)
    
cube_songs = dict_cube.get('songs')

cube_lyrics = []
for i in range(0,len(cube_songs)):
    dicts = cube_songs[i]
    song_lyric = dicts.get('lyrics')
    cube_lyrics.append(song_lyric)
    
cube = pd.DataFrame(cube_lyrics) 

In [63]:
file = 'Lyrics_MachineGunKelly.json'
with open(file) as train_file:
    dict_kelly = json.load(train_file)
    
kelly_songs = dict_kelly.get('songs')

kelly_lyrics = []
for i in range(0,len(kelly_songs)):
    dicts = kelly_songs[i]
    song_lyric = dicts.get('lyrics')
    kelly_lyrics.append(song_lyric)
    
kelly = pd.DataFrame(kelly_lyrics) 

In [64]:
file = 'Lyrics_Logic.json'
with open(file) as train_file:
    dict_logic = json.load(train_file)
    
logic_songs = dict_logic.get('songs')

logic_lyrics = []
for i in range(0,len(logic_songs)):
    dicts = logic_songs[i]
    song_lyric = dicts.get('lyrics')
    logic_lyrics.append(song_lyric)
    
logic = pd.DataFrame(logic_lyrics) 

In [65]:
total_lyrics = pd.concat([kendrick, kanye, pac, jay, wayne, snoop, fifty, dre, biggy, wiz, logic, kelly, cube, chance, outkast, tech, eazy, rocky])
total_lyrics = total_lyrics.replace(to_replace ='[\(\[].*?[\)\]]', value = ' ', regex = True) #replacing all things like '[Intro]'
total_lyrics = total_lyrics[total_lyrics[0].notna()] #dropping na rows

In [66]:
total_lyrics

Unnamed: 0,0
0,\nNobody pray for me\nIt's been that day for ...
1,\n\n \nIf Pirus and Crips all got along\nThey...
2,"\n\n \nPour up , head shot \nSit down , st..."
3,"\nI got, I got, I got, I got—\nLoyalty, got r..."
4,"\n\n \nUh, me and my niggas tryna get it, ya ..."
...,...
200,\n Lyrics for this song h...
201,\n Lyrics for this song h...
202,\n Lyrics for this song h...
203,\n Lyrics for this song h...


In [67]:
total_lyrics = total_lyrics.replace(to_replace = '\n', value = ' ', regex = True)
total_lyrics[0] = total_lyrics[0].str.replace('[^\w\s]','')
total_lyrics[0] = total_lyrics[0].str.lower()
#removed punctuation and newline and made lowercase

In [68]:
total_lyrics

Unnamed: 0,0
0,nobody pray for me its been that day for me ...
1,if pirus and crips all got along theyd pr...
2,pour up head shot sit down stand up...
3,i got i got i got i got loyalty got royalty ...
4,uh me and my niggas tryna get it ya bish ...
...,...
200,lyrics for this song ha...
201,lyrics for this song ha...
202,lyrics for this song ha...
203,lyrics for this song ha...


In [69]:
final_lyrics = total_lyrics[0].to_list()
# creating tokens from all the new data
tokens2 = []
for i in range(len(final_lyrics)):
    tokens = nltk.word_tokenize(final_lyrics[i])
    tokens2.append(tokens)

In [70]:
len(tokens2)

3418

In [71]:
new_tokens = []
for sublist in tokens2:
    for token in sublist:
        new_tokens.append(token)

In [72]:
len(new_tokens)

1878309

In [73]:
all_tokens = new_tokens + tokens_train
len(all_tokens)
# adding tokens from eminem to have the final number

2090181

# Models

I don't think that I need to check bigram models, because so far they haven't shown good results. I'll add a sixgram model instead, to see how the perplexity changes.

## Sixgrams

In [74]:
padded_Line6 = [list(pad_both_ends(all_tokens, n=6))]
train26, vocab26 = padded_everygram_pipeline(6, padded_Line6)

In [75]:
lyrics_sixmod1 = KneserNeyInterpolated(6, discount = 0.99) 
lyrics_sixmod1.fit(train26, vocab26)

In [76]:
test_sixgrams = ngrams(tokens_test, 6)

In [77]:
lyrics_sixmod1.perplexity(test_sixgrams)

563.0705670276024

## Fivegrams

In [78]:
padded_Line5 = [list(pad_both_ends(all_tokens, n=5))]
train25, vocab25 = padded_everygram_pipeline(5, padded_Line5)

In [79]:
lyrics_fivemod1 = KneserNeyInterpolated(5, discount = 0.99) 
lyrics_fivemod1.fit(train25, vocab25)

In [80]:
test_fivegrams = ngrams(tokens_test, 5)

In [None]:
lyrics_fivemod1.logscore('Frederic', 'loves to eat good food'.split())

In [81]:
lyrics_fivemod1.perplexity(test_fivegrams)

563.6897657510704

## Fourgrams

In [82]:
padded_Line4 = [list(pad_both_ends(all_tokens, n=4))]
train24, vocab24 = padded_everygram_pipeline(4, padded_Line4)

In [83]:
lyrics_fourmod1 = KneserNeyInterpolated(4, discount = 0.99) 
lyrics_fourmod1.fit(train24, vocab24)

In [84]:
test_fourgrams = ngrams(tokens_test, 4)

In [85]:
lyrics_fourmod1.perplexity(test_fourgrams)

563.4352503769684

## Trigrams

In [86]:
padded_Line3 = [list(pad_both_ends(all_tokens, n=3))]
train23, vocab23 = padded_everygram_pipeline(3, padded_Line3)

In [87]:
lyrics_trimod1 = KneserNeyInterpolated(3, discount = 0.99) 
lyrics_trimod1.fit(train23, vocab23)

In [88]:
test_trigrams = ngrams(tokens_test, 3)

In [89]:
lyrics_trimod1.perplexity(test_trigrams)

568.2308393118316

So, another 3k songs only made perplexity 300 smaller, so assuming that the math stays the same, I'll need another 5k to make it 0, and I think this is a very optimistic assumption on my part.

I will just go on working with the models that I have now. For text generating, I'll choose 4- and 6-gram models, because they have the best results among the ones presented. 

## Generating texts

In [90]:
detokenize = TreebankWordDetokenizer().detokenize

def generate_sent(model, num_words, random_seed):
    content = []
    period = '.' # to finish the sentences and make working with them later on easier
    words = 0
    while words < num_words:
        for token in model.generate(num_words, random_seed=random_seed):
            if token == '<s>':
                continue
            if token == '</s>':
                content.append(period)
                break
            content.append(token)
        words += 1
    return detokenize(content)

In [91]:
def generate_text(model, num_sents): #, random_seed #generate text with the sentences from above
    text = []
    sentences = 0
    while sentences < num_sents:
        for i in range(random.randint(0, 1000)):
            sentence = generate_sent(model, num_words = 20, random_seed = i)
        sentences += 1
        text.append(sentence)
    return ' '.join(text)

In [92]:
generate_text(lyrics_fourmod1, 20)

KeyboardInterrupt: 

I tried to generate text using the functions from above, but it was taking too long to generate even 20 sentences (think about a day long, and then at 10-hour mark I accidentally interrupted kernel, so the decision was taken out of my hands, I guess), so I'll use someone else's program to generate song. 

My biggest thanks to Pratap Vardhan at kaggle.com for this program. 
https://www.kaggle.com/pratapvardhan/kanye-lyrics-eda-song-generator-topic-modelling

In [103]:
total_total = pd.concat([total_lyrics, eminem])

In [104]:
total_total

Unnamed: 0,0
0,nobody pray for me its been that day for me ...
1,if pirus and crips all got along theyd pr...
2,pour up head shot sit down stand up...
3,i got i got i got i got loyalty got royalty ...
4,uh me and my niggas tryna get it ya bish ...
...,...
295,hi my name is what my name is who my name is c...
296,are we supposed to shut up or talkill cut your...
297,kick a hypest lyric every time i run up on a t...
298,come on put me in my place come on put me in m...


In [93]:
# Machine generated lyrics using Markov
import re
import random
from collections import defaultdict


class MarkovRachaita:
    def __init__(self, corpus='', order=2, length=8):
        self.order = order
        self.length = length
        self.words = re.findall("[a-z']+", corpus.lower())
        self.states = defaultdict(list)

        for i in range(len(self.words) - self.order):
            self.states[tuple(self.words[i:i + self.order])].append(self.words[i + order])

    def gen_sentence(self, length=8, startswith=None):
        terms = None
        if startswith:
            start_seed = [x for x in self.states.keys() if startswith in x]
            if start_seed:
                terms = list(start_seed[0])
        if terms is None:
            start_seed = random.randint(0, len(self.words) - self.order)
            terms = self.words[start_seed:start_seed + self.order]

        for _ in range(length):
            terms.append(random.choice(self.states[tuple(terms[-self.order:])]))

        return ' '.join(terms)

    def gen_song(self, lines=10, length=8, length_range=None, startswith=None):
        song = []
        if startswith:
            song.append(self.gen_sentence(length=length, startswith=startswith))
            lines -= 1
        for _ in range(lines):
            sent_len = random.randint(*length_range) if length_range else length
            song.append(self.gen_sentence(length=sent_len))
        return '\n'.join(song)

In [113]:
rap = MarkovRachaita(corpus=' '.join(total_total[0]))
rap_hit = rap.gen_song(lines=20, length_range=[5, 15])

In [114]:
rap_hit = rap_hit.split('\n')

In [120]:
for line in rap_hit:
    print(line)

is amazing barely scratching the surface they come
soft porn before it fades into black and yellow
jodye tell these hoes alls we know we need a one girls
me do this you go one more
got greed khop check this bitches aint shit in the same thang thang
gunit i murk you my nigga come who crazy hot im snoop dogg im bought up and
some bad bitches thats my trippy kit yea weed pills and that cow
knowing junkies on the mic you see im a stain on
on like a sony while the demons let em know im roguish surrounded by
my job here isnt done cause they dont understand me but youre still a rider nigga
you are now blown all over our necks like were supposed to be
im not finna fold uh if you sound frantic i hear the voice the west side
day jumpin the gun to the greatness or die
in your life yet every time we had to do what i want to say we aint
spell running free even though you the middle
gang tight and the pentecostal why yall even
just felt like you just act hollywood like they
got an ashtray where did 

I don't think that this makes any sense, but beggars cant be choosers, and I didn't use any of my awesome Ngrams, so that's a shame. 

Suggestions for improvement:
1. try better models next time
2. make sure not to interrupt anything

Sources:
1. https://www.kaggle.com/alvations/n-gram-language-model-with-nltk
2. https://github.com/nltk/nltk/pull/2363/commits/ce74e449dc9526e19596b1c4a9c510bbb35812cc
3. https://www.kaggle.com/pratapvardhan/kanye-lyrics-eda-song-generator-topic-modelling