In [1]:
import numpy as np
import pandas as pd

We're going to be using the Shakespeare data set. I think the old Shakespearean style of English would create interesting auto-generated sentences.

In [2]:
# Number of lines kept for modelling (the notebook limits the sample because
# of memory constraints).
SAMPLE_SIZE = 50000
# Fixed seed so that Restart & Run All always draws the same sample.
RANDOM_SEED = 42

df_raw = pd.read_csv('Shakespeare_data.csv')
# Never ask for more rows than the file holds; sample() raises otherwise.
df_text = df_raw.sample(n=min(SAMPLE_SIZE, len(df_raw)), random_state=RANDOM_SEED)
df_text.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
8974,8975,Henry VI Part 2,7.0,4.8.49,CLIFFORD,"To France, to France, and get what you have lost,"
82192,82193,Richard III,4.0,1.4.16,CLARENCE,That had befall'n us. As we paced along
2094,2095,Henry IV,28.0,3.2.259,Hostess,"He? alas, he is poor, he hath nothing."
77684,77685,Pericles,14.0,4.3.102,GOWER,Hath Thetis' birth-child on the heavens bestow'd:
26200,26201,Coriolanus,5.0,3.1.10,CORIOLANUS,Saw you Aufidius?


The Shakespeare data set has a lot of columns we don't need. We'll only need the PlayerLine column seen above, which contains the actual text. We can get rid of the rest.

In [3]:
# Keep only the spoken text. Rebinding the name (instead of dropping in place)
# and ignoring columns that are already gone keeps this cell safe to re-run.
unusedColumns = ['Dataline', 'Play', 'PlayerLinenumber', 'ActSceneLine', 'Player']
df_text = df_text.drop(columns=unusedColumns, errors='ignore')
df_text.head(10)

Unnamed: 0,PlayerLine
8974,"To France, to France, and get what you have lost,"
82192,That had befall'n us. As we paced along
2094,"He? alas, he is poor, he hath nothing."
77684,Hath Thetis' birth-child on the heavens bestow'd:
26200,Saw you Aufidius?
45700,Rests by you only to be qualified:
97966,"O, O, O,"
63359,"To have it baned? What, are you answer'd yet?"
105668,take it for your pains.
60448,"as Claudio's, to cross this in the smallest."


Rename the PlayerLine column to text just for clarity

In [4]:
# Give the remaining column a clearer name; index=str additionally maps every
# row label through str().
df_text.rename(columns={"PlayerLine": "text"}, index=str, inplace=True)
df_text.head(n=10)

Unnamed: 0,text
8974,"To France, to France, and get what you have lost,"
82192,That had befall'n us. As we paced along
2094,"He? alas, he is poor, he hath nothing."
77684,Hath Thetis' birth-child on the heavens bestow'd:
26200,Saw you Aufidius?
45700,Rests by you only to be qualified:
97966,"O, O, O,"
63359,"To have it baned? What, are you answer'd yet?"
105668,take it for your pains.
60448,"as Claudio's, to cross this in the smallest."


We need to obtain an array of first words of each line and an array of all words that are used

In [5]:
# dropna() guards against any missing values (NaN is a float and has no .split()).
lines = df_text['text'].dropna()

firstWords = []  # first token of every non-blank line (candidate sentence starters)
wordArray = []   # every token of every line, in reading order
for line in lines:
    tokens = line.split()
    # A blank or whitespace-only line has no first word; indexing [0] would raise.
    if not tokens:
        continue
    firstWords.append(tokens[0])
    wordArray.extend(tokens)

In [6]:
# Vocabulary: every distinct token exactly once, as a sorted list.
uniqueWords = sorted(set(wordArray))
uniqueWordCount = len(uniqueWords)

In [7]:
# Report the corpus size: total number of tokens vs. number of distinct tokens.
print("Number of words in all lines:", len(wordArray))
print("Number of unique words used:", uniqueWordCount)

Number of words in all lines: 364796
Number of unique words used: 39126


I limited the sample of the data set to 50000 lines due to memory constraints on my personal computer. That sample yielded roughly 365k words, about 39k of them unique, so not a bad data set to work with.

In [8]:
# Map every vocabulary word to its position in the sorted word list; these
# positions serve as row/column indices of the transition matrices.
words = {word: index for index, word in enumerate(uniqueWords)}

Index each word and throw it into a dictionary

In [9]:
# Show only the first few entries: the dictionary holds one entry per unique
# word, and printing all of it would flood the notebook output.
print(dict(list(words.items())[:10]))



In [10]:
# Square matrices with one extra row/column beyond the vocabulary; that last
# index is used for the end-of-sentence (null) state.
matrixShape = (uniqueWordCount + 1, uniqueWordCount + 1)
transitionCount = np.zeros(matrixShape)
transition2Count = np.zeros(matrixShape)

Initialize the transition count arrays to zero. Each array has one more row and column than there are unique words; the extra index represents the null state, which marks the end of a sentence.

In [11]:
# Index of the end-of-sentence (null) state: the last row/column.
endState = uniqueWordCount

for line in lines:
    tokenIds = [words[token] for token in line.split()]
    lastPos = len(tokenIds) - 1
    for pos, current in enumerate(tokenIds):
        # First order: the word directly after `current`, or the end state
        # when `current` is the last word of the line.
        following = tokenIds[pos + 1] if pos < lastPos else endState
        # Second order: the word two positions ahead, or the end state when
        # fewer than two words remain.
        skipped = tokenIds[pos + 2] if pos < lastPos - 1 else endState
        transitionCount[current, following] += 1
        transition2Count[current, skipped] += 1

# The end state only ever leads to itself.
transitionCount[endState, endState] = 1
transition2Count[endState, endState] = 1

In [12]:
# Peek at both count matrices (NumPy abbreviates large arrays with "...").
print(transitionCount, transition2Count)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]] [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [13]:
# NOTE: plain assignment does not copy. The "Norm" names refer to the very
# same arrays as the "Count" names, so after this cell the count arrays hold
# probabilities too.
transitionNorm = transitionCount
transition2Norm = transition2Count

# Divide every row by its own total, in place, so that each row sums to 1.
transitionNorm /= transitionNorm.sum(axis=1, keepdims=True)
transition2Norm /= transition2Norm.sum(axis=1, keepdims=True)

In [14]:
# Peek at both row-normalised matrices (NumPy abbreviates large arrays with "...").
print(transitionNorm, transition2Norm)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]] [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [15]:
# Add the end-of-sentence marker so that uniqueWords has one entry per
# transition-matrix column (the last column is the null state). The guard keeps
# the list from growing when this cell is executed more than once, which would
# break the length match required by np.random.choice(uniqueWords, p=...).
if not uniqueWords or uniqueWords[-1] is not None:
    uniqueWords.append(None)

In [16]:
def generateSentence(seed=None, targetLength=7, generatedSentence=None):
    """Generate a sentence from the first- and second-order word transitions.

    After the seed, the first word is drawn from ``transitionNorm`` alone.
    Every later word is drawn with weights proportional to
    ``T[last] * T2[second_last] + T[last] / 4``, where ``T`` is
    ``transitionNorm`` and ``T2`` is ``transition2Norm``. Sampling stops as
    soon as the end-of-sentence marker (``None``) is drawn.

    Parameters
    ----------
    seed : str, optional
        First word of the sentence; must be a key of ``words``. When omitted,
        a word is drawn from ``firstWords`` on every call.
    targetLength : int, optional
        Soft length target. While the sentence has fewer than
        ``targetLength - 1`` words the end-of-sentence probability is divided
        by 10; once it has more than ``targetLength + 1`` words that
        probability is doubled. If ``targetLength > 1``, ending directly after
        the seed is rejected whenever another word can follow it.
    generatedSentence : list, optional
        List the seed and sampled words are appended to. When omitted, a new
        list is created for each call.

    Returns
    -------
    list
        ``generatedSentence`` with the seed and all sampled words appended.
    """
    # Resolve defaults at call time: a default evaluated in the signature is
    # computed only once, and a default list would be shared between calls.
    if seed is None:
        seed = np.random.choice(firstWords, size=1)[0]
    if generatedSentence is None:
        generatedSentence = []
    generatedSentence.append(seed)

    firstStep = transitionNorm[words[generatedSentence[-1]]]
    nextWord = np.random.choice(uniqueWords, size=1, p=firstStep)[0]
    # Redraw an immediate end-of-sentence, but only when some real word can
    # follow the seed; otherwise the redraw loop would never terminate.
    if targetLength > 1 and firstStep[:-1].sum() > 0:
        while nextWord is None:
            nextWord = np.random.choice(uniqueWords, size=1, p=firstStep)[0]

    while nextWord is not None:
        generatedSentence.append(nextWord)
        lastRow = transitionNorm[words[generatedSentence[-1]]]
        secondLastRow = transition2Norm[words[generatedSentence[-2]]]
        # The arithmetic yields a fresh array, so the in-place edits below do
        # not touch the transition matrices.
        nextProbabilities = lastRow * secondLastRow + lastRow / 4
        # Keep a small chance of ending so the sentence can always terminate.
        nextProbabilities[-1] += 0.00001
        nextProbabilities /= nextProbabilities.sum()
        if len(generatedSentence) < targetLength - 1:
            # Too short: make ending unlikely, provided another word is possible.
            if nextProbabilities.sum() > nextProbabilities[-1]:
                nextProbabilities[-1] /= 10
            nextProbabilities /= nextProbabilities.sum()
        if len(generatedSentence) > targetLength + 1:
            # Too long: make ending more likely.
            nextProbabilities[-1] *= 2
            nextProbabilities /= nextProbabilities.sum()
        nextWord = np.random.choice(uniqueWords, size=1, p=nextProbabilities)[0]

    return generatedSentence

In [17]:
generatedSentence = generateSentence()
# Defensive filter: leave out any None end-of-sentence marker when joining.
print(' '.join([str(token) for token in generatedSentence if token is not None]))

I would not once being over-proud


It seems to work; obviously, Shakespearean English makes for weird sentences.

In [18]:
def emissionProbability(probability, state):
    """Re-weight next-word probabilities in place according to the hidden state.

    In state ``'S'`` the probability of every word shorter than 5 characters
    is doubled and that of every other word is halved; in any other state the
    weighting is reversed. The last entry (the end-of-sentence state) is never
    re-weighted. Afterwards the whole array is renormalised to sum to 1.

    Parameters
    ----------
    probability : numpy.ndarray
        1-D float array, modified in place. Entry ``i`` belongs to
        ``uniqueWords[i]``; the final entry belongs to the end state.
    state : str
        Current hidden state, ``'S'`` (short) or ``'L'`` (long).

    Returns
    -------
    None
        The result is written into ``probability``.
    """
    wordCount = len(probability) - 1  # the last slot is the end state
    if wordCount > 0:
        isShort = np.array(
            [len(uniqueWords[i]) < 5 for i in range(wordCount)], dtype=bool
        )
        # Compare by value: states drawn with np.random.choice are numpy.str_
        # objects, for which an identity test against 'S' is always False.
        favoured = isShort if state == 'S' else ~isShort
        probability[:wordCount] *= np.where(favoured, 2.0, 0.5)

    # Renormalise the whole distribution (not a single element); skip the
    # division when there is no probability mass at all.
    total = probability.sum()
    if total > 0:
        probability /= total
    
    
# The two hidden states ('S' = short, 'L' = long) and the matrix row of each.
hiddenStates = ['S', 'L']
hiddenStateDict = {'S': 0, 'L': 1}
# Row = current state, column = next state, both in hiddenStates order.
hiddenStateTransitionMatrix = [
    [.7, .3],
    [.8, .2],
]


def nextHiddenState(hiddenState):
    """Randomly draw the hidden state that follows ``hiddenState``.

    The draw uses the row of ``hiddenStateTransitionMatrix`` that belongs to
    the current state as the probabilities over ``hiddenStates``.
    """
    rowIndex = hiddenStateDict[hiddenState]
    drawn = np.random.choice(hiddenStates, size=1, p=hiddenStateTransitionMatrix[rowIndex])
    return drawn[0]

The generateSentence function uses a normal Markov model. To adapt this function to use a Hidden Markov Model we must add a hidden state (generateSentenceHidden). The hidden state here is based on the length of the word: in the L (long) state, words of 5 letters or more are favoured, while in the S (short) state, words of 4 letters or fewer are favoured. A hidden state based on the length of a word is not ideal because it's an arbitrary characteristic on which to build sentences, but it is good enough to generate a sentence that is reasonably coherent. Ideally, a hidden state would capture grammar- and syntax-based characteristics, so that the generated sentence not only makes sense but also has real meaning.

In [19]:
def generateSentenceHidden(seed=None, targetLength=7, generatedSentence=None, hiddenState='S'):
    """Generate (or continue) a sentence, re-weighting words by a hidden state.

    Works like the plain Markov generator, except that before every draw
    inside the main loop the candidate probabilities are passed through
    ``emissionProbability`` for the current hidden state, and the hidden state
    is advanced with ``nextHiddenState`` after every draw.

    Parameters
    ----------
    seed : str, optional
        First word, used only when ``generatedSentence`` is empty or omitted.
        When omitted, a word is drawn from ``firstWords``.
    targetLength : int, optional
        Soft length target. While the sentence has fewer than
        ``targetLength - 1`` words the end-of-sentence probability is divided
        by 10; once it has more than ``targetLength + 1`` words that
        probability is doubled. If ``targetLength > 1``, ending directly after
        the starting words is rejected whenever another word can follow.
    generatedSentence : list, optional
        Words to continue from; every word must be a key of ``words``. The
        list is extended in place and returned. When omitted, a new list is
        created for each call.
    hiddenState : str, optional
        Initial hidden state, ``'S'`` or ``'L'``.

    Returns
    -------
    list
        ``generatedSentence`` with all sampled words appended.
    """
    # A default list in the signature would be shared between calls, so a
    # second default call would silently continue the previous sentence.
    if generatedSentence is None:
        generatedSentence = []
    if not generatedSentence:
        if seed is None:
            seed = np.random.choice(firstWords, size=1)[0]
        generatedSentence.append(seed)

    firstStep = transitionNorm[words[generatedSentence[-1]]]
    nextWord = np.random.choice(uniqueWords, size=1, p=firstStep)[0]
    # Redraw an immediate end-of-sentence, but only when some real word can
    # follow; otherwise the redraw loop would never terminate.
    if targetLength > 1 and firstStep[:-1].sum() > 0:
        while nextWord is None:
            nextWord = np.random.choice(uniqueWords, size=1, p=firstStep)[0]

    while nextWord is not None:
        generatedSentence.append(nextWord)
        lastRow = transitionNorm[words[generatedSentence[-1]]]
        secondLastRow = transition2Norm[words[generatedSentence[-2]]]
        # The arithmetic yields a fresh array, so the in-place edits below do
        # not touch the transition matrices.
        nextProbabilities = lastRow * secondLastRow + lastRow / 4
        # Called for its in-place effect on nextProbabilities.
        emissionProbability(nextProbabilities, hiddenState)
        # Keep a small chance of ending so the sentence can always terminate.
        nextProbabilities[-1] += 0.00001
        nextProbabilities /= nextProbabilities.sum()
        if len(generatedSentence) < targetLength - 1:
            # Too short: make ending unlikely, provided another word is possible.
            if nextProbabilities.sum() > nextProbabilities[-1]:
                nextProbabilities[-1] /= 10
            nextProbabilities /= nextProbabilities.sum()
        if len(generatedSentence) > targetLength + 1:
            # Too long: make ending more likely.
            nextProbabilities[-1] *= 2
            nextProbabilities /= nextProbabilities.sum()
        nextWord = np.random.choice(uniqueWords, size=1, p=nextProbabilities)[0]

        hiddenState = nextHiddenState(hiddenState)
    return generatedSentence
generatedSentence = generateSentenceHidden()
# Defensive filter: leave out any None end-of-sentence marker when joining.
print(' '.join([str(token) for token in generatedSentence if token is not None]))

Millions of all welcome. Make the


In [20]:
sentence = "to be or not to be "
# Split the prompt once; its words are the start of the generated sentence and
# its last word is passed as the seed.
promptWords = sentence.split()
generatedSentence = generateSentenceHidden(seed=promptWords[-1], generatedSentence=promptWords)
print(' '.join([str(token) for token in generatedSentence if token is not None]))

to be or not to be a most pitiful rumour shake hands.


This was done with the help of and inspiration from Andre Kurait, credit: https://github.com/surabhikhachar/EECS_738_Project_2