In [4]:
from collections import defaultdict, Counter
import numpy as np
import random
import bisect

In [93]:
class character_model:
    """First-order Markov model over the speakers of a script.

    The script file is read line by line. Each line is lower-cased and split on
    "|": a line with exactly two fields is recorded as the pseudo-speaker
    "action", a line with exactly three fields contributes its stripped middle
    field as the speaker name, and every other line is ignored.

    Constructing an instance reads the file, builds one weighted sampler per
    speaker (who tends to speak next) and immediately generates a random
    speaker ordering, available through ``get_ordering()``.

    :param filename - path of the script file to read
    :param numberLines - how many speaker turns to generate
    :param minimumOccurCount - speakers occurring fewer times than this are dropped
    """

    def __init__(self, filename, numberLines=20, minimumOccurCount=1):
        self.characters = []
        self.content = []
        self.numberLines = numberLines
        self.minimumOccurCount = minimumOccurCount
        self.clean_input_document(filename)
        self.generate_character_order()

    def clean_input_document(self, filename):
        """Read the speaker sequence from ``filename`` and build the transition model.

        Stores the filtered sequence in ``self.characters`` and then calls
        ``create_character_model()``.
        """
        characters = []

        with open(filename) as f:
            for line in f:
                fields = line.lower().split("|")
                if len(fields) == 2:
                    characters.append("action")
                elif len(fields) == 3:
                    characters.append(fields[1].strip())

        #REMOVE CHARACTERS WHO DONT MEET MINIMUM
        # A single Counter lookup per element replaces the former list-membership scan.
        occurrences = Counter(characters)
        self.characters = [char for char in characters
                           if occurrences[char] >= self.minimumOccurCount]
        self.create_character_model()

    def create_character_model(self):
        """Build ``self.character_model``: speaker -> zero-argument sampler of the next speaker.

        Consecutive turns by the same speaker are not counted as a transition,
        so a speaker never follows themselves through a sampler. A speaker that
        is never followed by a different speaker gets no entry at all.
        """
        #FIRST, FOR EACH CHARACTER, COUNT WHO FOLLOWS THEM IN THE SCRIPT
        follower_counts = defaultdict(Counter)
        for previous, current in zip(self.characters, self.characters[1:]):
            if previous != current:
                follower_counts[previous][current] += 1

        #THEN, TURN THE COUNTS INTO TWO LISTS: CHARACTERS AND THEIR CHANCE OF BEING SELECTED
        #EX -> {'KK':3, 'TT':2} => ['KK', 'TT'], [3/5, 2/5]
        #FINALLY, GENERATE A WEIGHTED SAMPLER FROM THOSE LISTS FOR EACH CHARACTER
        character_model = {}
        for char, counter in follower_counts.items():
            total = sum(counter.values())
            charList = list(counter)
            probList = [round(occur / total, 4) for occur in counter.values()]
            character_model[char] = self.weighted_sampler(charList, probList)
        self.character_model = character_model

    def weighted_sampler(self, charList, probList):
        """Return a zero-argument function that draws from ``charList`` weighted by ``probList``.

        The weights do not have to sum to one; the draw is scaled by their total.
        """
        totals = []
        running = 0
        for p in probList:
            running += p
            totals.append(running)
        last = len(charList) - 1

        def sample():
            position = bisect.bisect(totals, random.uniform(0, totals[-1]))
            # random.uniform may return its upper end-point, in which case bisect
            # yields len(totals); clamp so the draw stays inside the list.
            return charList[min(position, last)]

        return sample

    #SELECT A CHARACTER TO FOLLOW THE GIVEN CHARACTER X
    def next_character(self, x):
        """Sample the speaker that follows ``x``; raises KeyError if ``x`` has no recorded follower."""
        return self.character_model[x]()

    def generate_character_order(self):
        """Generate ``self.ordering``, a list of ``numberLines`` speakers.

        The first speaker is chosen uniformly from the script's turns; each
        subsequent one is sampled from the previous speaker's follower model.
        The ordering is empty when ``numberLines`` is not positive or when the
        script produced no speakers.
        """
        if self.numberLines <= 0 or not self.characters:
            self.ordering = []
            return

        #RANDOMLY CHOOSE FIRST CHARACTER -- WILL EDIT LATER TO BE WITH <t>
        order = [random.choice(self.characters)]

        while len(order) < self.numberLines:
            current = order[-1]
            if current in self.character_model:
                order.append(self.next_character(current))
                continue
            # Dead end: this speaker is never followed by anyone else in the
            # script (e.g. they only speak last), so restart the chain from a
            # random turn, preferring a different speaker when one exists.
            others = [char for char in self.characters if char != current]
            order.append(random.choice(others or self.characters))
        self.ordering = order

    def get_ordering(self):
        """Return the generated list of speakers."""
        return self.ordering

In [94]:
def generateScript(filename, interactionCount, ngram, maxTokens=25, topk=False, minimumOccurCount=1):
    """Print a randomly generated script built from the file at ``filename``.

    A ``character_model`` decides the order of the speakers; for every turn a
    per-speaker ``sentence_model`` (built once per speaker and then reused)
    produces the line that is printed under the speaker's upper-cased name.

    :param filename - path of the script file both models read
    :param interactionCount - number of speaker turns to print
    :param ngram - n-gram order passed to each sentence model
    :param maxTokens - cap on the number of generated tokens per line
    :param topk - forwarded to ``create_sentence`` to select top-k sampling
    :param minimumOccurCount - forwarded to ``character_model``
    """
    speaker_order = character_model(
        filename, numberLines=interactionCount, minimumOccurCount=minimumOccurCount
    ).get_ordering()

    models_by_speaker = {}
    for speaker in speaker_order:
        model = models_by_speaker.get(speaker)
        if model is None:
            # Build lazily: only speakers that actually get a turn need a model.
            model = sentence_model(filename, speaker, ngram=ngram)
            models_by_speaker[speaker] = model
        print(speaker.upper() + " -- " )
        print( "\t" + model.create_sentence(maxTokens=maxTokens, topk=topk))


In [252]:
# Demo run: 15 speaker turns, 4-gram sentence models, at most 10 generated
# tokens per line, dropping speakers that occur fewer than 2 times.
generateScript(
    "./data/season1_22.txt",
    interactionCount=15,
    ngram=4,
    maxTokens=10,
    minimumOccurCount=2,
)

PLANKTON -- 
	 i win i game bow around cant torture basss i lets yes
SPONGEBOB -- 
	 hey. jelly with bet song em hi friend tall help rotten
MR. KRABS -- 
	 hang on private speak eye he wont be go spot hang ah
PLANKTON -- 
	 our special today now just when friendship yoo getting finally popcorn yee
SPONGEBOB -- 
	 last one won accept krabs far mean thought see like have back
PLANKTON -- 
	 no not should realize enough just patty what hoo customer basss oh
KAREN -- 
	 sounds like see when take fun take leave do this see revenge
PLANKTON -- 
	 wh what win when much was krusty thermonuclear much saving loser special
SPONGEBOB -- 
	 so i zone out.
MR. KRABS -- 
	 look at will reach at recipe hes mutiny suit out go him
NARRATOR -- 
	 everyone is its their is perhaps squarepants at one enjoying can to
PLANKTON -- 
	 i tricked enough was spongebuddy appetite help inside induce bubble some play
KAREN -- 
	 sounds like cant then going at sounds at krabby time is are
PLANKTON -- 
	 aah. well fina

In [249]:
class sentence_model:
    """Additively smoothed n-gram language model over one character's dialogue.

    The model counts (n-1)-gram contexts and n-grams in the character's
    sentences, turns them into a context x vocabulary probability matrix and
    samples sentences word by word from it.

    :param filename - path of the script file to read
    :param character - speaker whose lines are used; compared against the
                       lower-cased file contents, so pass it lower-cased
    :param ngram - n-gram order; generation needs at least 2, because the
                   starting phrase is drawn from the (n-1)-gram contexts
    :param alpha - additive smoothing constant
    """

    def __init__(self, filename, character, ngram=2, alpha=.1):
        self.ngram = ngram
        # Orders of the two count tables: [n-1 (contexts), n (full n-grams)].
        self.ngramCounts = [i for i in range(ngram - 1, ngram + 1)]
        self.filename = filename
        self.character = character
        self.alpha = alpha

        self.textContent = self.clean_input_document()
        # Sorted so the column order of the probability matrix is the same on
        # every run (set iteration order of strings varies between processes).
        self.textDict = sorted(set(word for line in self.textContent for word in line.split(" ") if word))
        self.textDictSize = len(self.textDict)
        self.wordCounts = self.count_words(self.textContent)
        self.word2index = {w: i for i, w in enumerate(self.wordCounts[self.ngramCounts[0]].keys())}
        self.probabilities = self.get_probability_matrix()

    @staticmethod
    def _strip_inline_actions(line):
        """Remove every ``''[ ... ]''`` stage direction embedded in a dialogue line.

        An opening marker without a closing one drops the rest of the line.
        """
        while True:
            start = line.find("''[")
            if start == -1:
                return line
            end = line.find("]''", start)
            if end == -1:
                return line[:start]
            line = line[:start] + line[end + 3:]

    @staticmethod
    def _lookup(table, key):
        """Read ``table[key]`` without inserting ``key`` when ``table`` is a defaultdict."""
        if key in table:
            return table[key]
        factory = getattr(table, "default_factory", None)
        return factory() if factory is not None else 0

    def clean_input_document(self):
        """Return the character's sentences, each wrapped as ``"<s> ... </s>"``.

        The file is lower-cased, "?", ":" and "!" become ".", "-" and newlines
        become spaces, and the characters ``,;_()"`` are deleted. Records are
        then split on ``"}} {{l|"``, stage directions are removed, and the text
        of every record whose speaker equals ``self.character`` is split into
        sentences on ".".
        """
        with open(self.filename) as f:
            contents = f.read().lower()
        translate = "?:!-\n"
        replace = "...  "
        delete = ",;_()\""
        table = str.maketrans(translate, replace, delete)

        #SPLIT SO EACH ELEMENT IS EITHER A CHARACTER OR ACTION
        contents = contents.translate(table).strip().split("}} {{l|")
        contents[0] = contents[0].replace("{{l|","")
        # Cut the trailing "}}" of the final record; tolerate a file without it.
        closing = contents[-1].find('}')
        if closing != -1:
            contents[-1] = contents[-1][:closing]

        #REMOVE ACTIONS FROM CHARACTER LINES
        for i, line in enumerate(contents):
            if line[0:3] == "''[":
                # A record that is purely a stage direction becomes an "action" record.
                contents[i] = "action|" + line[3:-3]
                continue
            contents[i] = self._strip_inline_actions(line)

        #KEEP ONLY IMPORTANT CHARACTERS TEXT
        char_lines = []
        for line in contents:
            fields = line.split("|")
            # Skip other speakers and malformed records that carry no text field.
            if fields[0] != self.character or len(fields) < 2:
                continue

            #ADD SENTENCE TAGS
            fragments = fields[1].strip().replace("'", "").replace('[',"").replace(']', "").split(".")
            for fragment in fragments:
                fragment = fragment.strip()
                if fragment:  # whitespace-only fragments would yield empty sentences
                    char_lines.append("<s> " + fragment + " </s>")
        return char_lines

    def count_words(self, contents):
        """Iterate through the contents and gather the counts of words.

        Returns a dict mapping each order in ``self.ngramCounts`` to a
        defaultdict of phrase -> count. Order 0 is special: its table has no
        keys and defaults to the total number of tokens in the corpus.
        """
        tokenized = [[w.strip() for w in line.split(" ") if w] for line in contents]

        wordCounts = {}
        for order in self.ngramCounts:
            if order == 0: # want the default to be the size of the corpus
                total = sum(1 for words in tokenized for word in words if word)
                wordCounts[order] = defaultdict(lambda total=total: total)
                continue

            counts = defaultdict(int)
            for words in tokenized:
                # `end` is the index of the last word of each n-gram window.
                for end in range(order - 1, len(words)):
                    if not words[end]:
                        continue
                    counts[" ".join(words[end - order + 1:end + 1]).strip()] += 1
            wordCounts[order] = counts
        return wordCounts

    def model(self, x):
        """Smoothed probability of the n-gram ``x[0]`` given its context ``x[1]``.

        Computes (count(ngram) + alpha) / (count(context) + |V| * alpha).
        Lookups do not insert missing keys, so querying unseen phrases leaves
        the count tables unchanged.
        """
        ngram_count = self._lookup(self.wordCounts[self.ngramCounts[1]], x[0])
        context_count = self._lookup(self.wordCounts[self.ngramCounts[0]], x[1])
        return (ngram_count + self.alpha) / \
                    ( context_count + self.textDictSize*self.alpha )

    def get_probability_matrix(self):
        """Return an array with one row per context and one column per vocabulary word.

        Row order follows ``self.word2index``; column order follows ``self.textDict``.
        """
        contexts = list(self.wordCounts[self.ngramCounts[0]].keys())
        probabilities = np.empty((len(contexts), self.textDictSize), dtype=float)
        for row, context in enumerate(contexts):
            for col, word in enumerate(self.textDict):
                probabilities[row, col] = self.model([context + " " + word, context])
        return probabilities

    def get_starting_word(self):
        """Pick a random context that begins a sentence (starts with ``<s>``).

        Raises IndexError when no such context exists, e.g. when the character
        has no sentence of at least n-1 tokens.
        """
        potential = [phrase for phrase in self.wordCounts[self.ngramCounts[0]].keys()
                     if phrase[:3] == "<s>"]
        return random.choice(potential)

    def generate_sample_top_k(self, lm, index2word):
        """ Taken from - http://veredshwartz.blogspot.com/2019/08/text-generation.html
        Generates a string, sample a word from the top k probable words in the distribution at each time step.
        :param lm - the language model
        :param index2word - a mapping from the index of a word in the vocabulary to the word itself
        k (how many words to keep in the distribution) is read from self.k and
        the token limit from self.maxTokens; both are set by create_sentence. """

        generated_sentence = self.get_starting_word()
        curr_token = None
        generated_tokens = 0
        k = max(1, int(self.k))  # a slice of [-0:] would keep the whole vocabulary

        while curr_token != '</s>' and generated_tokens < self.maxTokens:
            #NEED TO CHOOSE A ROW ELEMENT -- LAST n-1 WORDS OF SENTENCE
            gen_row = " ".join(generated_sentence.split()[-self.ngramCounts[0]:])

            curr_distribution = np.asarray(lm(gen_row), dtype=float)  # vector of probabilities
            top_k = np.argsort(curr_distribution)[-k:]  # indices of the k most probable words

            # Renormalize the PROBABILITIES of the kept words (not their indices)
            # so they form a distribution again.
            top_probs = curr_distribution[top_k]
            weight_sum = top_probs.sum()
            if weight_sum > 0:
                top_probs = top_probs / weight_sum
            else:
                top_probs = np.full(len(top_k), 1.0 / len(top_k))

            selected = np.random.choice(len(top_k), p=top_probs)
            curr_token = index2word[int(top_k[selected])]
            generated_sentence += ' ' + curr_token
            generated_tokens += 1

        return generated_sentence


    def generate_sample(self, lm, index2word):
        """ Taken from http://veredshwartz.blogspot.com/2019/08/text-generation.html
        Generates a string, sample a word from the distribution at each time step.
        :param lm - the language model
        :param index2word - a mapping from the index of a word in the vocabulary to the word itself
        The token limit is read from self.maxTokens, set by create_sentence. """

        generated_sentence = self.get_starting_word()
        generated_tokens = 0
        curr_token = None

        while curr_token != '</s>' and generated_tokens < self.maxTokens:
            #NEED TO CHOOSE A ROW ELEMENT -- LAST n-1 WORDS OF SENTENCE
            gen_row = " ".join(generated_sentence.split()[-self.ngramCounts[0]:])

            curr_distribution = np.asarray(lm(gen_row), dtype=float)  # vector of probabilities
            # Out-of-place division: lm may hand back a view of the stored
            # matrix, which must not be rewritten by sampling.
            curr_distribution = curr_distribution / np.sum(curr_distribution)

            selected_index = np.random.choice(len(index2word), p=curr_distribution)
            curr_token = index2word[int(selected_index)]
            generated_sentence += ' ' + curr_token
            generated_tokens += 1

        return generated_sentence

    def create_sentence(self, topk=False, maxTokens=25, k=2):
        """Generate one sentence and return it with the sentence tags removed.

        :param topk - sample only among the k most probable words when True
        :param maxTokens - maximum number of words generated after the starting phrase
        :param k - number of candidates kept per step when topk is True
        """
        self.k = k
        self.maxTokens = maxTokens

        # A context never seen in training has zero counts, so its smoothed
        # distribution is uniform over the vocabulary.
        if self.textDictSize:
            unseen = np.full(self.textDictSize, 1.0 / self.textDictSize)
        else:
            unseen = np.array([])

        def lm(context):
            row = self.word2index.get(context)
            if row is None:
                return unseen
            return self.probabilities[row, :]

        if topk:
            sentence =  self.generate_sample_top_k(lm, self.textDict)
        else:
            sentence =  self.generate_sample(lm, self.textDict)

        return sentence.replace(" </s>", ".").replace("<s>", "")