## Percy Jackson

The next cell loads in pjallbooks.txt file.

In [4]:
import time
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt

from collections import defaultdict
from collections import Counter

from cogworks_data.language import get_data_path

from nltk import word_tokenize


In [5]:
def unzip(pairs):
    return tuple(zip(*pairs))

In [6]:
def normalize(counter):
    total = sum(counter.values())
    return [(char, cnt/total) for char, cnt in counter.most_common()]


In [7]:
def train_lm(text, n):
    raw_lm = defaultdict(Counter) # history -> {char -> count}
    history = "~" * (n - 1)  # length n - 1 history
    
    # count number of times characters appear following different histories
    #
    # for char in text ...
    #    1. Increment language model's count, given current history and character
    #    2. Update history

    for char in text:
        raw_lm[history][char] += 1
        # slide history window to the right by one character
        history = history[1:] + char

    
    # create the finalized language model – a dictionary with: history -> [(char, freq), ...]
    lm = {history : normalize(counter) for history, counter in raw_lm.items()} 
    
    return lm


In [8]:
def generate_letter(lm, history):
    if not history in lm:
        return "~"
    letters, probs = unzip(lm[history])
    i = np.random.choice(letters, p=probs)
    return i

In [9]:
def generate_text(lm, n, nletters=100):
    # <COGINST>
    history = "~" * (n - 1)
    text = []
    for i in range(nletters):
        c = generate_letter(lm, history)
        text.append(c)
        history = history[1:] + c
    return "".join(text)  

In [10]:
path_to_pj = "/Users/mohan/Desktop/cogworks/bwsi/ryan-sus/capstone/MadLib/pjallbooks.txt"

with open(path_to_pj, "rb") as f:
    pj = f.read().decode()  
    pj = pj.lower()  
    pj.split()

percy_jackson_lm = train_lm(pj, 15)

## Professional Language Model

This can be also found in the gpt test notebook.

In [None]:
import os
import sys
import torch
import random
import argparse
import numpy as np
from GPT2.model import (GPT2LMHeadModel)
from GPT2.utils import load_weight
from GPT2.config import GPT2Config
from GPT2.sample import sample_sequence
from GPT2.encoder import get_encoder

In [None]:
filepath = '/Users/mohan/Desktop/cogworks/bwsi/ryan-sus/capstone/MadLib/gpt-2-Pytorch/gpt2-pytorch_model.bin'
state_dict = torch.load(filepath, map_location='cpu' if not torch.cuda.is_available() else None)

In [None]:
def loadModel(state_dict):
    seed = random.randint(0, 2147483647)
    np.random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load Model
    enc = get_encoder()
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    model = load_weight(model, state_dict)
    model.to(device)
    model.eval()
    
    return model, config, enc

In [None]:
def text_generator(model, config, enc, text,
    quiet = False,
    nsamples = 1,
    unconditional = True,
    batch_size = -1,
    length = -1,
    temperature = 0.7,
    top_k = 40):
     
    
    text_list = []
    
    if batch_size == -1:
        batch_size = 1
    assert nsamples % batch_size == 0
    

    if length == -1:
        length = config.n_ctx // 2
    elif length > config.n_ctx:
        raise ValueError("Can't get samples longer than window size: %s" % config.n_ctx)
        
        
    context_tokens = enc.encode(text)
    
    
    generated = 0
    for _ in range(nsamples // batch_size):
        out = sample_sequence(
            model=model, length=length,
            context=context_tokens  if not unconditional else None,
            start_token=enc.encoder['<|endoftext|>'] if unconditional else None,
            batch_size=batch_size,
            temperature=temperature, top_k=top_k, device="cpu"
        )
        out = out[:, len(context_tokens):].tolist()
        for i in range(batch_size):
            generated += 1
            text = enc.decode(out[i])
            text_list.append(text)
            
    return text_list[0]

In [None]:
model, config, enc = loadModel(state_dict)

In [2]:
def getTextFromProfessionalLM(text):
    return text_generator(model, config, enc, text,
        quiet = False,
        nsamples = 1,
        unconditional = False,
        batch_size = -1,
        length = 50,
        temperature = 0.7,
        top_k = 40)

## Getting User Input Functions

In [12]:
def get_nouns():
    list_words = []
    for i in range(1,6):
        list_words.append(input("Please enter noun " + str(i) + ": ")) 
    return list_words

def get_all():
    list_words = []
    list_words.append(input("Please enter noun: "))
    list_words.append(input("Please enter verb: " )) 
    list_words.append(input("Please enter adjective: ")) 
    
    return list_words

In [11]:
import nltk
from nltk import word_tokenize

nltk.download('averaged_perceptron_tagger')

def tokenize(string_of_words):
    
    words = word_tokenize(string_of_words)
    pos = []
    
    pos_of_words = nltk.pos_tag(string_of_words)
    for tuple_pair in pos_of_words:
        pos.append(tuple_pair[1])
    
    return words, pos
    

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mohan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Game Programming

In [16]:
# mode 1 - just nouns 
# adjective, nouns, verbs 
# crazy mode - cover more of the text with adjs, nouns, verbs 

In [18]:
c = Counter(parts_of_speech)

adj_count = c["NN"]
verb_count = c["VB"] + c["VBR"]
noun_count = c["JJ"]

In [19]:
def switch_nouns(code, w_max, wds, list_words):
    counter = 0 
    for j in range(len(parts_of_speech)): 
        if parts_of_speech[j] == code and counter < w_max :
            wds[j] = list_words[counter]
            counter += 1
    return wds

In [20]:
#@params: codes: list of noun, verb, adj codes, w_maxs: list of maxes in n, v, a, list -> n, v, a
def switch_all(codes, w_maxs, wds, list_words): 
    #switch the nouns by calling the method 
    counter = 0 
    for i in range(len(codes)): 
        for j in range(len(parts_of_speech)): 
            if parts_of_speech[j] == codes[i] and counter < w_maxs[i] :
                wds[j] = list_words[i]
                counter+=1
    return wds
        

In [26]:
def ngram_mode(): #returns an array of tuples with words and speech matchings 
    
    lmpj1 = train_lm(pj, 15)
    new_text = generate_text(lmpj1, 15,1000)
    
    tok_text = word_tokenize(new_text)
    gram = nltk.pos_tag(tok_text)
    
    return gram 

In [27]:
mode_list = ngram_mode()

In [28]:
print(mode_list)

[('percy', 'NN'), ('jackson', 'NN'), ('i', 'NN'), ('stared', 'VBD'), ('at', 'IN'), ('the', 'DT'), ('knife', 'NN'), ('in', 'IN'), ('annabeth', 'JJ'), ('’', 'NNP'), ('s', 'NN'), ('hip', 'NN'), (',', ','), ('and', 'CC'), ('she', 'PRP'), ('clasped', 'VBD'), ('my', 'PRP$'), ('other', 'JJ'), ('hand', 'NN'), ('like', 'IN'), ('she', 'PRP'), ('was', 'VBD'), ('close', 'JJ'), ('to', 'TO'), ('crying', 'VBG'), ('.', '.'), ('‘', 'NN'), ('hey', 'NN'), (',', ','), ('i', 'JJ'), ('’', 'VBP'), ('m', 'VBN'), ('usually', 'RB'), ('about', 'IN'), ('to', 'TO'), ('die', 'VB'), (',', ','), ('’', 'FW'), ('i', 'NN'), ('promised', 'VBN'), ('.', '.'), ('‘', 'JJ'), ('percy', 'NN'), (',', ','), ('even', 'RB'), ('you', 'PRP'), ('can', 'MD'), ('’', 'VB'), ('t', 'JJ'), ('–', 'NNP'), ('’', 'NNP'), ('‘', 'NNP'), ('silence', 'NN'), (',', ','), ('aelia.', 'JJ'), ('’', 'NNP'), ('cocalus', 'NN'), ('twisted', 'VBD'), ('his', 'PRP$'), ('beard', 'NN'), ('.', '.'), ('‘', 'JJ'), ('percy', 'NN'), (',', ','), ('beckendorf', 'VBP'), 

In [31]:
def get_text(adj_c, verb_c, noun_c,exp, words, mode = None): 
    final_text = "" 
    #stores the updated text 
    noun_max = (exp/100) * noun_c 
    adj_max = (exp/100) * adj_c
    verb_max = (exp/100) * verb_c
    # it is 
    #only use verb_count
    if mode == 1:  
        list_words = get_nouns()
        words = switch_nouns("NN", noun_max, words, list_words)
        final_text = ' '.join(words)
    
    elif mode == 2: 
        list_words = get_all()
        words = switch_all(["NN","VB", "JJ"], [noun_max, verb_max, adj_max], words, list_words)
        final_text = ' '.join(words)
    
    #run on our N-gram model
    elif mode == 3:  
        
        mode_list = ngram_mode()
        for i in mode_list: 
            words.append(i[0])
        
        list_words = get_all()
        words = switch_all(["NN","VB", "JJ"], [noun_max, verb_max, adj_max], words, list_words)
        final_text = ' '.join(words) 
        
    return final_text 

In [32]:
get_text(adj_count , verb_count, noun_count , 100, words, mode = 3) 
#exposure is the percentage amount you want the words to cover in your word list 

Please enter noun: apple
Please enter verb: passed
Please enter adjective: good


"It was the good apple of the apple of her apple . This was n't the day she was actually born , but she knew that nothing would be the good from this day forward . Although this was a bit good to her , it was also extremely freeing . percy jackson. ’ she summoned her sea horse and the mako shark whisked off and started playing something on an alien planet . we decided to head north towards denver , thinking that maybe , just maybe , we would find grover and annabeth saved this camp . i ’ m not sure whose face was redder : annabeth ’ s or mine . ‘ thank you , hermes , ’ annabeth said . ‘ what happened to the last kid i trained . you ’ re zeus ’ s daughter . he ’ s not going to blast her , but he just bowed awkwardly and crashed through the night alive , but everybody was patting nico on the back , which i could read the tiniest print on any book on the shelves . the armour was polished . battle maps and blueprints decorated the steps between the benches . they grinned down at us , rocke