In [23]:
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random

In [24]:
path = "./lyrics/"

def read_all(path):
    txt = []
    for _, _, files in os.walk(path):
        for file in files:
            with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
                in_lyric = False
                for line in f:
                    line = line.strip()
                    if line == '----------':
                        if in_lyric:
                            in_lyric = False
                            txt.append("")
                    elif line:
                        if line.isupper() and line.isalpha():
                            in_lyric = True 
                        if in_lyric:
                            txt.append(line) 
    return txt
lyrics = read_all(path)
print("number of lines = ", len(lyrics))


number of lines =  30520


In [25]:
def clean_txt(txt):
    cleaned_txt = []
    for line in txt:
        line = line.lower()
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt+=words
    return cleaned_txt

cleaned = clean_txt(lyrics)
print("number of words = ", len(cleaned))

number of words =  198945


In [26]:
print(cleaned[:5])

['i', 'you', 'but', 'i', 'told']


In [27]:
def make_markov_model(cleaned_stories, n_gram=2):
    markov_model = {}
    for i in range(len(cleaned_stories)-n_gram-1):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_stories[i+j] + " "
            next_state += cleaned_stories[i+j+n_gram] + " "
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
        
    return markov_model

In [28]:
markov_model = make_markov_model(cleaned)

In [29]:
print("number of states = ", len(markov_model.keys()))

number of states =  75794


In [30]:
print("All possible transitions from 'your love' state: \n")
print(markov_model['your love'])

All possible transitions from 'your love' state: 

{'how i': 0.04878048780487805, 'when i': 0.024390243902439025, 'is fadin': 0.1951219512195122, 'is fadinknow': 0.024390243902439025, 'is fadinhook': 0.024390243902439025, 'shake and': 0.04878048780487805, 'is magical': 0.024390243902439025, 'in my': 0.04878048780487805, 'and you': 0.024390243902439025, 'always give': 0.024390243902439025, 'i need': 0.024390243902439025, 'i walked': 0.04878048780487805, 'at night': 0.024390243902439025, 'all along': 0.024390243902439025, 'i only': 0.024390243902439025, 'is good': 0.024390243902439025, 'i took': 0.024390243902439025, 'down didnt': 0.024390243902439025, 'down no': 0.024390243902439025, 'down u': 0.04878048780487805, 'down there': 0.12195121951219512, 'seat dont': 0.024390243902439025, 'the arms': 0.024390243902439025, 'tonight well': 0.024390243902439025, 'in the': 0.024390243902439025}


In [31]:
def generate_story(markov_model, limit=100, start='your love'):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story+=curr_state+" "
    while n<limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))
        
        curr_state = next_state[0]
        story+=curr_state+" "
        n+=1
    return story

In [32]:
for i in range(1,9):
    print("-> ", generate_story(markov_model, start="can you", limit=8))

->  can you relate my girls gone and she answered no to me whatever you want get out your 
->  can you stay lil mama i dont play lil mama what chu say lil mama can you stay 
->  can you stay lil mama i feel so bad so bad sometimes i wish you would take my 
->  can you can you can hear the blood is real let democracy take u higher everybody wants find 
->  can you just leave it all over with one peppercorn sing holly go whistle and ivy i harrowed 
->  can you can you catch a cab to manhattan with that broadway actin you hype that belly shitd 
->  can you bring your price down lil boosie with the stars in it new porsche condo living room 
->  can you just leave me standing there i would not she went to the devil verse insurance where 


In [33]:
for i in range(1,9):
    print("-> ", generate_story(markov_model, start="baby i", limit=8))

->  baby i would show you guide you to call mine girl i got a lot of figures im 
->  baby i love u in new york they wont say that ure better off being together hold my 
->  baby i had to run the outcome is usually a beatdown brutally fuck who you be or where 
->  baby i brainstorm call me flood head married to the rover to the project b and c lex 
->  baby i love u this thing we got the soul the steps you take nixga i was toting 
->  baby i wan na play get funky she the new kings of the world is yours we the 
->  baby i love u i love her somethin vicious and im just saying im in one blink i 
->  baby i would do a kingdoms henceforth a dedicated sort a member of the war where my enemy 
