In [5]:
from collections import defaultdict, Counter
import numpy as np
import random
import bisect
import re

In [6]:
class character_model:
    """First-order Markov chain over speakers.

    The model is read from a text file with one line per speaker::

        <speaker>\t<follower>;<count> <follower>;<count> ...

    For every speaker a sampler is built that draws the next speaker with
    probability proportional to the listed counts. On construction an ordering
    of ``numberLines`` speakers is generated and stored in ``self.ordering``.
    """

    def __init__(self, filename="characterInteractions.txt", numberLines=20):
        """Load the transition table from ``filename`` and generate an ordering.

        :param filename - path of the tab/space separated interaction file
        :param numberLines - how many speakers the generated ordering contains
        """
        self.characters = []
        self.content = []
        self.numberLines = numberLines
        self.create_character_model(filename)
        self.generate_character_order()

    def create_character_model(self, filename):
        """Parse ``filename`` and build one weighted sampler per speaker.

        Sets ``self.characters`` (speakers that have at least one follower with
        a positive count) and ``self.character_model`` (speaker -> sampler).

        Raises ValueError when a non-blank line does not contain exactly one
        tab, or a follower entry is not of the form ``name;integer``.
        """
        char_probs = {}
        with open(filename) as f:
            for line in f:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) carry no data.
                    continue
                character, followers = line.split("\t")
                charList, counts = [], []
                for follow in followers.strip().split(" "):
                    if not follow:
                        # Doubled spaces produce empty tokens; ignore them.
                        continue
                    char, occurance = follow.split(";")
                    charList.append(char)
                    counts.append(int(occurance))
                total = sum(counts)
                if total <= 0:
                    # Nothing can be sampled for this speaker.
                    continue
                # Keep full precision: rounding here could zero out rare
                # followers (or all of them) for speakers with large totals.
                char_probs[character] = (charList, [c / total for c in counts])

        # Save all the characters that can start or continue a chain.
        self.characters = list(char_probs.keys())

        # Finally, build a weighted sampler for each character.
        self.character_model = {
            char: self.weighted_sampler(names, probs)
            for char, (names, probs) in char_probs.items()
        }

    def weighted_sampler(self, charList, probList):
        """Return a zero-argument function that samples from ``charList``.

        :param charList - the candidate values
        :param probList - non-negative weights, one per candidate (they do not
                          need to sum to 1)

        Raises ValueError if the lists are empty or differ in length.
        """
        if not charList or len(charList) != len(probList):
            raise ValueError("charList and probList must be non-empty and of equal length")

        totals = []
        running = 0
        for p in probList:
            running += p
            totals.append(running)
        last = len(totals) - 1

        def sample():
            point = random.uniform(0, totals[-1])
            # random.uniform may return its upper bound; capping the search at
            # `last` keeps the resulting index inside charList in that case.
            return charList[bisect.bisect(totals, point, 0, last)]

        return sample

    # Select a character to follow the given character x.
    def next_character(self, x):
        """Sample the speaker that follows ``x``.

        If ``x`` has no transition entry of its own (it only ever appears as a
        follower), a speaker is chosen uniformly from ``self.characters``.
        """
        sampler = self.character_model.get(x)
        if sampler is None:
            return random.choice(self.characters)
        return sampler()

    def generate_character_order(self):
        """Generate ``self.numberLines`` speakers and store them in ``self.ordering``."""
        if self.numberLines <= 0:
            self.ordering = []
            return

        # Randomly choose the first character, then walk the chain.
        order = [random.choice(self.characters)]
        while len(order) < self.numberLines:
            order.append(self.next_character(order[-1]))
        self.ordering = order

    def get_ordering(self):
        """Return the list of speakers produced by generate_character_order."""
        return self.ordering

In [7]:
def generateScript(interactionCount, ngram, maxTokens=25, greedy=True, topk=False, k=1):
    """Print a generated script: a speaker header followed by one generated line.

    :param interactionCount - number of speaker turns to generate
    :param ngram - n-gram order handed to each speaker's sentence_model
    :param maxTokens, greedy, topk, k - forwarded unchanged to create_sentence
    """
    speakers = character_model(numberLines=interactionCount).get_ordering()

    # One sentence model per distinct speaker, built the first time they talk.
    models_by_speaker = {}
    for speaker in speakers:
        if speaker not in models_by_speaker:
            models_by_speaker[speaker] = sentence_model(speaker, ngram=ngram)
        speaker_model = models_by_speaker[speaker]

        print(speaker.upper() + " -- " )
        line = speaker_model.create_sentence(maxTokens=maxTokens, greedy=greedy, topk=topk, k=k)
        print("\t" + line)

In [32]:
generateScript(10, 2, 20, greedy=False, topk=False)

BARNACLE BOY -- 
	 wake instantly invisiboat contains wrinkles in term mayonnaise running got watch.
SPONGEBOB -- 
	 no matter hours muddy 12th useless 58 spruce mobile idea barbecues chip caterers shrinking chased faith stew blblblblblblblblblblblbl r… firefighting
KAREN -- 
	 my science hes navery catching aw plank keep monitor care baby weight ♪observant whereve piece babys krabbypattyformulaloadingcompleted constant kidnapping door
PLANKTON -- 
	 copy whoa dime center aim blame fallen antennae— slime noticed deepest started fine krabs frigid does borrow masterminds switcher than
ACTION -- 
	 the magazine plastic pointed turns footbag rocking philharmonic it apart noises boxs sticks troop worms caption flops thawed cymbal abandoned
SPONGEBOB -- 
	 living laughter prices identify irony alarmed onion freeway frowning drove ends]</nowiki> settle paychecks vat cephalop eyebrows mommies wouldve fantabulous consumed
ACTION -- 
	 flying dutchman activates throbbing deliver entrances blow s

In [18]:
generateScript(10, 4, 14, greedy=False, topk=False)

FRENCH NARRATOR -- 
	 ah notice the scavenger crab eating garbage unaware he is being stalked by his natural predator
ACTION -- 
	 mrspuffs inflated body causes a pileup in the middle of the road knocking over two other
SPONGEBOB -- 
	 id put my pile of dog food up against your stud any day.
SQUIDWARD -- 
	 they got pictures of me naked.
PATRICK -- 
	 yeah lets have another.
SPONGEBOB -- 
	 so whatd i miss.
PATRICK -- 
	 were bored.
SPONGEBOB -- 
	 hi mailfish.
SQUIDWARD -- 
	 oh opposite day.
SPONGEBOB -- 
	 go spongebob.


In [159]:
generateScript(10, 3, 15, greedy=False)

FRED -- 
	 ghosts dang bodys nervous somewhere wow this fin its wheelbarrow radicals stinks toy what dying ugh
SPONGEBOB -- 
	 fourth automobiling offices candles stony facilities allowing fires pole should diligent chatterbox captains mild nyah♪ surpassed
SANDY -- 
	 munchie carefully functions collar push whatever element ho failure slip twelfths capsized agility chemistry seats rascal
SPONGEBOB -- 
	 lawyer flu intimidated sunrise 35 throated powder ringing landing sasspants ole theyll pile utterly hike easter
PATRICK -- 
	 garys chunks due bustle congratulations across invented flooring bottoms rib clippers trap connected tummy towed process
SPONGEBOB&PATRICK -- 
	 pin plan sideburns hmm buns while colors fields be self want am more said fast krab
SPONGEBOB -- 
	 used automatic behavior cleats dokie bicycles blues damaged properly hop december replacement compactor colonel uglier slam
PATRICK -- 
	 meow asked peek can—ow gonna grab gone stopping aye swirl breathe garde adorable fit

In [81]:
generateScript(5, 12, 10)

CUSTOMER -- 
73
	 i was born with glass bones and paper skin.
ACTION -- 
69246
	 meanwhile the girls made it to the cabin they rented for their camping trip.
PATRICK -- 
5061
	 oh jeez spongebob who cares if everyone knows about our secret hideout.
SANDY -- 
3578
	 youre trespassing spongebob in my lab in my submarine and youre trespassing inside of squidward.
SPONGEBOB -- 
24324
	 without the krusty krab i i i oh squidward could you hold me.


In [35]:
generateScript(10,4,15)

FLYING DUTCHMAN -- 
	 ah but you were cheap and being cheap is a terrible thing.
ACTION -- 
	 spongebob panics and struggles to pull squidward off.
PATRICK -- 
	 what could it be.
SPONGEBOB -- 
	 all im saying is maybe theres a reason glove world.
PATRICK -- 
	 krabs doesnt need you anymore.
SQUIDWARD -- 
	 um whatcha up to.
SPONGEBOB -- 
	 and drop your money anywhere.
GARY -- 
	 psst. his his his his his his his his his his his his his his his
SPONGEBOB -- 
	 my license.
ACTION -- 
	 several kids are watching the animatronic animal stage show.


In [30]:
generateScript(10,4,15)

NAT -- 
	 no extra.
SPONGEBOB -- 
	 handsome. won’t… won’t… won’t… won’t… won’t… won’t… won’t… won’t… won’t… won’t… won’t… won’t… won’t… won’t… won’t…
ACTION -- 
	 the spiders claw makes the record sound terrible.
SPONGEBOB -- 
	 cactuses protected by barbed wire.
LARRY -- 
	 thanks i work out.
SPONGEBOB -- 
	 uhyou could be sick.
ACTION -- 
	 the iron ball and rolls plankton over to the krusty krab.
PATRICK -- 
	 right. oooh oooh oooh oooh oooh oooh oooh oooh oooh oooh oooh oooh oooh oooh oooh
SPONGEBOB -- 
	 the sink is clogged up.
PATRICK -- 
	 youre coin operated arent you.


In [6]:
generateScript(10,3,15)

MERMAID MAN -- 
	 a double krabby patty.
ACTION -- 
	 clams in the kitchen.
SPONGEBOB -- 
	 okay.
PATRICK -- 
	 leedle leedle lee.
SPONGEBOB -- 
	 pepper.
SQUIDWARD -- 
	 shift into reverse spongebob.
SPONGEBOB -- 
	 steak sauce.
PATRICK -- 
	 u r huh how do you think.
SPONGEBOB -- 
	 yer spending all me money.
SQUIDWARD -- 
	 drama queen.


In [92]:
generateScript(10,7,15)

SPONGEBOB&PATRICK -- 
	 patty cake patty cake sailor man.
ACTION -- 
	 the screen turns black. beard beard beard beard beard beard beard beard beard beard beard beard beard beard beard
CUSTOMER -- 
	 hey i just realized being outside is awful.
ACTION -- 
	 suddenly a sand tornado appears out of nowhere.
SQUIDWARD -- 
	 i have i have. insist insist insist insist insist insist insist insist insist insist insist insist insist insist insist
PATRICK -- 
	 ive gotta sneak this back into spongebobs house before he hello.
ACTION -- 
	 she makes a left turn from the wall and turns where the safety cones are.
SQUIDWARD -- 
	 well then where am i supposed to live.
SPONGEBOB -- 
	 ill go see if there are any jellyfish in that creepy scary evil looking cave over there.
ACTION -- 
	 the crowd is cheering they both fly into the ropes because of the force of mr krabs yell and bounce


In [77]:
generateScript(10,5,15)

CUSTOMERS -- 
	 monster krabby patty.
ACTION -- 
	 all talking at once.
SQUIDWARD -- 
	 lucky for you i was here today.
ACTION -- 
	 the drone holds out a laser and zaps the hand away opening the roof in the process.
SPONGEBOB -- 
	 never why. insist insist insist insist insist insist insist insist insist insist insist insist insist insist insist
PATRICK -- 
	 let it be known to all far and wide the mollusks are coming.
SPONGEBOB -- 
	 alright patrick you leave me no choice.
PATRICK -- 
	 we are in a giggle zone.
SPONGEBOB -- 
	 i am bad.
SANDY -- 
	 maybe you could help me fix my nut transporter.


In [73]:
generateScript(15, 7, 15)

MERMAID MAN -- 
	 besides man ray had to buy the neck.
ACTION -- 
	 sandy gives spongebob and patrick two jars full of her nutty butter and they leave.
SPONGEBOB -- 
	 itll make your hair grow.
SANDY -- 
	 more water for the sea critter.
SPONGEBOB -- 
	 oh i didnt have to be a fool to get sandys attention.
PATRICK -- 
	 oooh im the flying dutchmans best friend.
SPONGEBOB -- 
	 hey you want me to tell you the secret of how to win on this machine.
SQUIDWARD -- 
	 i wouldnt let that thing into my house even if it was potty trained.
PLANKTON -- 
	 sorry i really have nothing for you.
ACTION -- 
	 the customers prepare to climb back down and take shelter from the storm.
KAREN -- 
	 did you see the pretty laser honey.
PLANKTON -- 
	 why wont you wake up.
ACTION -- 
	 he climbs into bed and goes to sleep.
PATCHY -- 
	 get—what are you doing here.
ACTION -- 
	 sandy kicks the door open.


In [70]:
generateScript(15, 7, 15)

GARY -- 
	 there once was a man from peru who dreamed he was eating his shoe.
SPONGEBOB -- 
	 get me the navy. insist insist insist insist insist insist insist insist insist insist insist insist insist insist insist
ACTION -- 
	 patrick giggles and falls off of sandys helmet.
PATRICK -- 
	 wait howd he get out of jail.
SANDY -- 
	 i liked you better when you were a barnacle head.
PEARL -- 
	 wait but who took that picture who would even take a picture of that.
ACTION -- 
	 the giant jellyfish monster starts to calm down and it seems to be working at first.
SPONGEBOB -- 
	 not after all that weve been through.
PATRICK -- 
	 please stand back for your own safety.
SPONGEBOB -- 
	 ive finally got my drivers license.
SPONGEBOB -- 
	 im back mr krabs. insist insist insist insist insist insist insist insist insist insist insist insist insist insist insist
SQUIDWARD -- 
	 spongebob i will give you $5 if you let me enjoy the rest of my morning in peace.
SPONGEBOB -- 
	 i must be the biggest jer

In [66]:
generateScript(15, 8, 15)

CUSTOMERS -- 
	 i dont know why but its bad.
SPONGEBOB -- 
	 i guess we dont air breather.
ACTION -- 
	 mini spongebob and mini patrick follow them outside.
SQUIDWARD -- 
	 uh this is a bad time isnt it.
SPONGEBOB -- 
	 i dont want you getting into trouble during the night so youre gonna stay in the aquarium like a good little
ACTION -- 
	 they also run over gary who is dizzy and retreats into his shell.
SPONGEBOB -- 
	 i just couldnt help myself. insist insist insist insist insist insist insist insist insist insist insist insist insist insist insist
PATRICK -- 
	 uh i wasnt being eaten. beard beard beard beard beard beard beard beard beard beard beard beard beard beard beard
PATRICK -- 
	 its gonna eat squidward and mr krabs.
SQUIDWARD -- 
	 oh boy nothing like a game of charades.
SPONGEBOB -- 
	 i can see why snail shells arent clear.
SQUIDWARD -- 
	 i dont care where im going just take me away from here.
PLANKTON -- 
	 hey why are you idiots so short.
SPONGEBOB -- 
	 i never realize

In [65]:
generateScript(15, 5, 15)

PEARL -- 
	 daddy im not hungry anymore and ive stopped growing.
SPONGEBOB -- 
	 youre safe now little lady.
ACTION -- 
	 cut to nighttime.
SQUIDWARD -- 
	 roast salad à la squidward.
ACTION -- 
	 purple olaf pulls back his hammer and gets ready to launch squidward spongebob takes a krabby patty and
PLANKTON -- 
	 two tablespoonsunderstanding. insist insist insist insist insist insist insist insist insist insist insist insist insist insist insist
ACTION -- 
	 but before it does mr krabs flicks plankton off the track sending him flying and crashing into a
SPONGEBOB -- 
	 i cant stopbiting mynails.
SANDY -- 
	 and good afternoon to you mr krabs.
SPONGEBOB -- 
	 then he wont be sad anymore.
ACTION -- 
	 spongebob tears off his blue robe showing off his extremely large muscular body.
PLANKTON -- 
	 cant you see im working on my new molecular analyzer.
ACTION -- 
	 he crawls up the side and into the porthole window.
SPONGEBOB -- 
	 remember bubble buddy.
PATRICK -- 
	 im just borrowing your

In [57]:
# generateScript has no `minimumOccurCount` parameter; passing it raises TypeError.
generateScript(15, 5, 10)

FLYING DUTCHMAN -- 
	 alrighty boys partys over.
SPONGEBOB -- 
	 well nothing me and the sizzlemaster cant handle.
SQUIDWARD -- 
	 place your orders everybody.
ACTION -- 
	 spongebob feeds the baby snails with the bottles of milk.
PATRICK -- 
	 natural talent watch this… i found out where boogers come from hey check
SPONGEBOB -- 
	 a magical place with magical charms.
SANDY -- 
	 see i told ya.
SPONGEBOB -- 
	 no please. insist insist insist insist insist insist insist insist insist insist
SQUIDWARD -- 
	 dont wheres what squidward.
PATRICK -- 
	 thats just squidward sunbathing again.
ACTION -- 
	 mr krabs once again foils another one of planktons attempts to steal the
SPONGEBOB -- 
	 oh look the mourners have arrived.
SQUIDWARD -- 
	 the weather changed my plans.
SPONGEBOB -- 
	 mr krabs squidward is our friend and his paintings are so cheap you
SQUIDWARD -- 
	 gotta get that paper.


In [30]:
class sentence_model:
    """Add-alpha smoothed n-gram language model built from one character's lines.

    The character's text is read from ``./data/<name>.txt`` (the name with
    spaces, periods and ampersands removed), split into sentences wrapped in
    ``<s>`` / ``</s>`` markers, and counted into (n-1)-gram and n-gram tables.
    ``create_sentence`` then generates text token by token.
    """

    def __init__(self, character, ngram=2, alpha=.1):
        """Read and count the character's text.

        :param character - speaker name; also selects the input file
        :param ngram - order of the model (1 = unigram, 2 = bigram, ...)
        :param alpha - additive smoothing constant used by ``model``

        Raises ValueError when ``ngram`` is smaller than 1.
        """
        if ngram < 1:
            raise ValueError("ngram must be at least 1")
        self.ngram = ngram
        # [context order, full n-gram order]
        self.ngramCounts = [ngram - 1, ngram]
        self.filename = "./data/" + character.replace(" ", "").replace(".", "").replace("&", "") + ".txt"
        self.character = character
        self.alpha = alpha

        self.textContent = self.clean_input_document()
        # Sorted so that vocabulary indices (and therefore seeded sampling) do
        # not depend on per-process string hash randomisation.
        self.textDict = sorted({word for line in self.textContent for word in line.split()})
        self.textDictSize = len(self.textDict)
        self.wordCounts = self.count_words(self.textContent)
        self.word2index = {w: i for i, w in enumerate(self.wordCounts[self.ngramCounts[0]].keys())}
        #self.probabilities = self.get_probability_matrix()

    def clean_input_document(self):
        """Read ``self.filename`` and return a list of ``"<s> ... </s>"`` sentences.

        The text is lower-cased; ``? : !`` become sentence-ending periods, ``-``
        and newlines become spaces, and punctuation is removed. For the
        'action' character brackets and apostrophes are deleted by the
        translation table; for every other character ellipses and
        ``''[...]''`` spans are removed before apostrophes are dropped.
        """
        with open(self.filename) as f:
            contents = f.read().lower()

        is_action = self.character == 'action'
        delete = ",;_()\"\'[]" if is_action else ",;_()\""
        table = str.maketrans("?:!-\n", "...  ", delete)
        contents = contents.translate(table)

        if not is_action:
            contents = contents.strip().replace("...", "")

        # Titles must not end a sentence. The replacement keeps the trailing
        # space so "mrs. puff" becomes "mrs puff" rather than "mrspuff".
        contents = contents.replace("mr. ", "mr ").replace("mrs. ", "mrs ")

        if not is_action:
            contents = re.sub(r"''\[.*?\]''", "", contents).replace("'", "")

        # Skip fragments that are empty or whitespace only; they would otherwise
        # become word-less "<s>  </s>" sentences.
        return ["<s> " + t.strip() + " </s>" for t in contents.split(".") if t.strip()]

    def count_words(self, contents):
        """Count n-grams for every order listed in ``self.ngramCounts``.

        :param contents - list of sentences (strings of whitespace separated tokens)
        :return dict mapping order -> count table. Order 0 maps to a
                defaultdict whose default value is the total number of tokens;
                every other order maps to a Counter keyed by the space-joined
                n-gram.
        """
        wordCounts = {}
        for order in self.ngramCounts:
            if order == 0:
                # Want the default to be the size of the corpus.
                total = sum(len(line.split()) for line in contents)
                wordCounts[order] = defaultdict(lambda total=total: total)
                continue
            counts = Counter()
            for line in contents:
                words = line.split()
                for start in range(len(words) - order + 1):
                    counts[" ".join(words[start:start + order])] += 1
            wordCounts[order] = counts
        return wordCounts

    @staticmethod
    def _lookup(table, key):
        """Read ``table[key]`` without inserting ``key`` when it is missing."""
        if key in table:
            return table[key]
        factory = getattr(table, "default_factory", None)
        return factory() if factory is not None else 0

    def _context(self, sentence):
        """Return the last n-1 tokens of ``sentence`` joined by spaces ('' for unigrams)."""
        width = self.ngramCounts[0]
        if width <= 0:
            # A plain [-0:] slice would return the whole sentence.
            return ""
        return " ".join(sentence.split()[-width:])

    def model(self, x):
        """Add-alpha smoothed probability.

        :param x - pair ``[ngram, context]`` of space-joined token strings
        :return (count(ngram) + alpha) / (count(context) + V * alpha), where V
                is ``self.textDictSize``
        """
        numerator = self._lookup(self.wordCounts[self.ngramCounts[1]], x[0]) + self.alpha
        denominator = self._lookup(self.wordCounts[self.ngramCounts[0]], x[1]) \
            + self.textDictSize * self.alpha
        return numerator / denominator

    def model_old(self, x):
        """Unsmoothed count(ngram) / count(context).

        Raises ZeroDivisionError when the context count is 0.
        """
        return self._lookup(self.wordCounts[self.ngramCounts[1]], x[0]) / \
            self._lookup(self.wordCounts[self.ngramCounts[0]], x[1])

    def get_probability_matrix(self):
        """Return an array with one ``lm`` row per context key in the context table."""
        contexts = self.wordCounts[self.ngramCounts[0]].keys()
        return np.array([self.lm(context) for context in contexts])

    def get_starting_word(self):
        """Pick, uniformly at random, a counted context that begins with ``<s>``.

        Falls back to the bare ``<s>`` marker when there is no such context
        (which is always the case for a unigram model).
        """
        table = self.wordCounts[self.ngramCounts[0]]
        potential = [phrase for phrase in table.keys() if phrase[:3] == "<s>"]
        if not potential:
            return "<s>"
        return random.choice(potential)

    def _generate(self, lm, index2word, choose):
        """Shared generation loop.

        Starting from ``get_starting_word()``, repeatedly ask ``lm`` for the
        distribution over next tokens, let ``choose`` pick an index into it,
        and append ``index2word[index]`` until ``</s>`` appears in the sentence
        or ``self.maxTokens`` tokens have been added.
        """
        generated_sentence = self.get_starting_word()
        generated_tokens = 0
        while '</s>' not in generated_sentence and generated_tokens < self.maxTokens:
            distribution = np.asarray(lm(self._context(generated_sentence)), dtype=float)
            distribution = distribution / distribution.sum()
            generated_sentence += ' ' + index2word[int(choose(distribution))]
            generated_tokens += 1
        return generated_sentence

    def _choose_top_k(self, distribution):
        """Sample an index from the ``self.k`` most probable entries."""
        if distribution.max() == distribution.min():
            # Flat distribution (e.g. unseen context): nothing to rank, so
            # choose uniformly from the whole vocabulary.
            return np.random.choice(len(distribution))
        keep = max(1, min(int(self.k), len(distribution)))
        top = np.argsort(distribution)[-keep:]
        # Renormalise the probabilities of the kept words, not their indices.
        weights = distribution[top] / distribution[top].sum()
        return top[np.random.choice(len(top), p=weights)]

    def _choose_greedy(self, distribution):
        """Return the index of the most probable entry, breaking ties at random."""
        best = np.flatnonzero(distribution == distribution.max())
        return np.random.choice(best)

    def generate_sample_top_k(self, lm, index2word):
        """ Adapted from http://veredshwartz.blogspot.com/2019/08/text-generation.html
        Generates a string, sampling a word from the top k probable words in the
        distribution at each time step; k is read from ``self.k``.
        :param lm - the language model: context string -> vector of probabilities
        :param index2word - a mapping from the index of a word in the vocabulary to the word itself """
        return self._generate(lm, index2word, self._choose_top_k)

    def generate_sample(self, lm, index2word):
        """ Adapted from http://veredshwartz.blogspot.com/2019/08/text-generation.html
        Generates a string, sampling a word from the full distribution at each time step.
        :param lm - the language model: context string -> vector of probabilities
        :param index2word - a mapping from the index of a word in the vocabulary to the word itself """
        return self._generate(
            lm, index2word,
            lambda distribution: np.random.choice(len(distribution), p=distribution))

    def generate_sample_greedy(self, lm, index2word):
        """Generates a string, taking the most probable word at each time step.
        :param lm - the language model: context string -> vector of probabilities
        :param index2word - a mapping from the index of a word in the vocabulary to the word itself """
        return self._generate(lm, index2word, self._choose_greedy)

    def lm(self, wordA):
        """Return P(word | wordA) for every word in ``self.textDict``, in that order.

        :param wordA - the context as a space-joined string ('' for unigrams)
        """
        # strip() keeps the n-gram key well-formed when the context is empty.
        return [self.model([(wordA + " " + wordB).strip(), wordA]) for wordB in self.textDict]

    def create_sentence(self, greedy=True, topk=False, maxTokens=25, k=2):
        """Generate one sentence.

        :param greedy - take the most probable word at every step
        :param topk - when not greedy, sample from the k most probable words
        :param maxTokens - maximum number of tokens appended to the start phrase
        :param k - number of words kept when ``topk`` is set
        :return the sentence with ``" </s>"`` replaced by ``"."`` and ``"<s>"`` removed
        """
        self.k = k
        self.maxTokens = maxTokens
        #lm = lambda s: self.probabilities[self.word2index.get(s, -1), :]

        if greedy:
            sentence = self.generate_sample_greedy(self.lm, self.textDict)
        elif topk:
            sentence = self.generate_sample_top_k(self.lm, self.textDict)
        else:
            sentence = self.generate_sample(self.lm, self.textDict)

        return sentence.replace(" </s>", ".").replace("<s>", "")