# Info
This notebook works pretty differently from txt_from_web.ipynb. The other notebook is meant to get a bunch of text files for you to analyze. In theory, you would go through it once, get what you want, and return to it as needed for more text.

By contrast, this notebook is more interactive, and you're intended to fill certain blocks with your own code to make use of the main object: marky the Markov Model Manager! The functions you can make use of are described in the code blocks, but I'll also have a block with examples of the functions being used, and how you might make use of a Markov Model Manager object.

The main idea is that marky will take the folder you set in section 0 as `root`, create any missing folders, and allow you to handle file management and markov model building all in one place! You can:

* import text files and turn them into markov models
* import markov models from json files
* create new models and then export them as json files
* combine models and export the result as a json file
* create formatted sentence outputs for any model so you can test models and fiddle with settings

# 0. Universal Stuff - Run First

In [393]:
import re
import os
import markovify
import spacy
# Load the spaCy "en_core_web_sm" pipeline once, up front; POSifiedText.word_split
# (below) passes every sentence through this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")

# defines class that extends markovify model by incorporating part-of-speech tagging
# note that using POSifiedText(string) over markovify.Text(string) will take a lot longer
class POSifiedText(markovify.Text):
    '''
    markovify.Text variant whose chain states carry part-of-speech tags.

    Every token is stored as "<text>::<POS>", so the same spelling used as two
    different parts of speech becomes two different states.
    '''

    def word_split(self, sentence):
        '''
        Tokenize `sentence` with the module-level spaCy pipeline `nlp` and return
        a list of "<text>::<POS>" strings, one per token.
        '''
        return ["::".join((word.orth_, word.pos_)) for word in nlp(sentence)]

    def word_join(self, words):
        '''
        Turn a list of "<text>::<POS>" states back into a space-separated string.

        Only the final "::<POS>" suffix is removed. Splitting on the first "::"
        instead would truncate tokens whose own text contains "::" (a token
        that is exactly "::" would come back as an empty string).
        '''
        return " ".join(word.rsplit("::", 1)[0] for word in words)

# Root data folder handed to markovManager. Always end the path with "/" (however
# long it is): the manager appends "txts/" and "jsons/" to it to locate its sub-folders.
root = "data/"

# 1. Markov Model Manager

In [431]:
class markovManager(dict):
    '''
    Keep text corpora, exported models and live markovify models for one data folder.

    Folders managed under root_folder:
        txts/   text files, loaded into self.txts   (file name without extension -> content)
        jsons/  exported models, loaded into self.jsons (file name without extension -> JSON string)
    self.models maps a model name to a markovify.Text / POSifiedText instance.

    NOTE: the class subclasses dict but keeps all of its data in the attributes
    above, and update() below replaces dict.update with a folder re-scan.
    '''

    def __init__(self, root_folder):
        '''
        Create any missing folders under `root_folder`, then load every text file,
        every JSON file, and build one model per JSON file.

        `root_folder` may be given with or without a trailing "/".
        '''
        super().__init__()
        self.root_folder = root_folder
        # os.path.join inserts the separator only when it is missing, so both
        # "data/" and "data" resolve to the same sub-folders.
        self.txts_folder = os.path.join(self.root_folder, "txts/")
        self.jsons_folder = os.path.join(self.root_folder, "jsons/")
        self.directory_setup()
        self.txts = self.folder_to_dict(self.txts_folder)
        self.jsons = self.folder_to_dict(self.jsons_folder)
        self.models = {key: self.json_interpreter(content) for key, content in self.jsons.items()}

    def directory_setup(self):
        '''
        Create the root, txts and jsons folders if they do not exist yet,
        printing the path of each folder that gets created.
        '''
        for folder in [self.root_folder, self.txts_folder, self.jsons_folder]:
            # An empty root means "current directory": nothing to create for it.
            if not folder or os.path.isdir(folder):
                continue
            path_out = os.path.join(os.getcwd(), folder).replace("\\", "/")
            print("creating {}".format(path_out))
            # makedirs also creates missing parents, so nested roots work.
            os.makedirs(folder, exist_ok=True)

    def file_reader(self, path):
        '''
        Return the full content of the text file at `path`.
        Bytes that cannot be decoded are silently dropped (errors="ignore").
        '''
        with open(path, 'r', errors="ignore") as f:
            content = f.read()
        return content

    def json_interpreter(self, json):
        '''
        Rebuild a model from the JSON string `json`.

        A "::" anywhere in the string is taken as the sign of a POS-tagged model
        (POSifiedText stores states as "<text>::<POS>"); anything else is loaded
        as a plain markovify.Text.
        '''
        if "::" in json:
            model = POSifiedText.from_json(json)
        else:
            model = markovify.Text.from_json(json)
        return model

    def _iter_files(self, folder):
        '''
        Yield (name_without_extension, extension, path) for each regular,
        non-hidden file directly inside `folder`, in sorted name order.
        '''
        for filename in sorted(os.listdir(folder)):
            path = os.path.join(folder, filename)
            # Skip sub-folders and dot-files (.DS_Store, .ipynb_checkpoints, ...):
            # they are not corpora/models and cannot be read as such.
            if filename.startswith(".") or not os.path.isfile(path):
                continue
            # splitext copes with names that contain several dots or none at all.
            filename_noext, filename_ext = os.path.splitext(filename)
            yield filename_noext, filename_ext, path

    def folder_to_dict(self, folder):
        '''
        Read every regular, non-hidden file in `folder` and return a dict mapping
        the file name without its extension to the file content.
        '''
        files = dict()
        for filename_noext, _, path in self._iter_files(folder):
            files[filename_noext] = self.file_reader(path)
        return files

    def update(self):
        '''
        Re-scan the txts and jsons folders and load files that appeared since the
        last scan. Names already present in the corresponding dict are left alone.

        .txt files are added to the dict of the folder they sit in; .json files
        are added the same way and additionally turned into a model in self.models.
        '''
        for (folder, dictionary) in [(self.txts_folder, self.txts), (self.jsons_folder, self.jsons)]:
            for filename_noext, filename_ext, path in self._iter_files(folder):
                if filename_noext in dictionary:
                    continue
                extension = filename_ext.lower()
                if extension == ".txt":
                    dictionary[filename_noext] = self.file_reader(path)
                elif extension == ".json":
                    dictionary[filename_noext] = self.file_reader(path)
                    self.models[filename_noext] = self.json_interpreter(dictionary[filename_noext])

    def add_model(self, name, string, pos=False, state_size=2):
        '''
        Build a model from the text `string` and store it as self.models[name].

        pos:        any truthy value builds a POSifiedText (slower), otherwise a
                    plain markovify.Text is built.
        state_size: passed through to the model constructor.
        '''
        model_class = POSifiedText if pos else markovify.Text
        self.models[name] = model_class(string, state_size=state_size)

    def combine_models(self, name, model_names, weights=False):
        '''
        Combine the models named in `model_names` with markovify.combine and
        store the result as self.models[name].

        weights: one weight per model; when omitted (False/None/empty) every
                 model gets weight 1.
        Raises KeyError if a name is not in self.models.
        '''
        model_names = list(model_names)
        if not weights:
            weights = [1] * len(model_names)
        models = [self.models[model_name] for model_name in model_names]
        self.models[name] = markovify.combine(models, weights)

    def export_model(self, model_name):
        '''
        Serialize self.models[model_name] with to_json(), remember the JSON in
        self.jsons and write it to "<jsons folder>/<model_name>.json".
        '''
        model = self.models[model_name]
        model_json = model.to_json()
        self.jsons[model_name] = model_json
        with open(os.path.join(self.jsons_folder, model_name + ".json"), 'w') as f:
            f.write(model_json)

    @staticmethod
    def _tidy_sentence(sentence):
        '''
        Receives a string and returns a string (generally, a single generated sentence).
        Performs a sequence of removals/substitutions to normalize formatting and fix some common issues.
        Note that these changes were largely determined by the files I was using, so you may need to adjust them.
        '''
        # Order matters: e.g. " ." must run before ":..." is re-spaced to ": ...".
        # (" ..." needs no rule of its own: the " ." replacement already turns it into "...".)
        for old, new in ((" ,", ","), (" .", "."), (":...", ": ..."), (" ?", "?"),
                         (" !", "!"), (" ;", ";"), (" :", ":")):
            sentence = sentence.replace(old, new)
        # Collapse every run of spaces to a single space.
        sentence = re.sub(r" {2,}", " ", sentence)
        for old, new in (("\n", ""), (" )", ")"), ("( ", "("), (";;", ";"), ("::", ":")):
            sentence = sentence.replace(old, new)
        # Re-attach split contractions, e.g. "I 'm" -> "I'm" and "do n't" -> "don't".
        sentence = re.sub(r"([A-Za-z]*) ('[a-z])", r"\1\2", sentence)
        sentence = re.sub(r"([A-Za-z]*) ([a-z]'[a-z])", r"\1\2", sentence)
        # Add the missing space after "?"/"!" only where a capital letter follows
        # directly, e.g. "Hi!It's" or "Yes?This"; other "?"/"!" are left untouched.
        sentence = re.sub(r"([?!])(?=[A-Z])", r"\1 ", sentence)
        return sentence.rstrip(" ")

    def output(self, model_name):
        '''
        Generate one sentence from self.models[model_name] and return it after
        formatting clean-up.

        If the model yields no sentence (None or an empty result), a notice is
        printed and that empty value is returned unchanged.
        Raises KeyError if `model_name` is not in self.models.
        '''
        model = self.models[model_name]
        sentence = model.make_sentence()
        if sentence:
            sentence = self._tidy_sentence(sentence)
        if not sentence:
            print("Sentence could not be generated, sorry. :( Try again, it might just be the model!")
        return sentence

In [432]:
# Build the manager on `root`: __init__ creates any missing folders and loads
# whatever is already in the txts/ and jsons/ sub-folders.
marky = markovManager(root)

In [435]:
# Train a plain (non-POS) model on the raw transcript text, then blend it at
# equal weight with the "mystery" model (which must already be in marky.models)
# and register the blend as "shabnak1".
marky.add_model(
    name="gigi_transcripts_nopos",
    string=marky.txts["gigi_transcripts"],
)
marky.combine_models(
    name="shabnak1",
    model_names=["mystery", "gigi_transcripts_nopos"],
    weights=[1, 1],
)

In [436]:
# Generate one sentence from the combined model; output() also applies the
# spacing/punctuation clean-up before returning it.
marky.output("shabnak1")

'Pulling back the loaded drinks to Burton and then tossed the open bottle in on an armed man.'