In [1]:
import time
import numpy as np
%matplotlib notebook
import matplotlib.pyplot as plt


In [2]:
def unzip(pairs):
    return tuple(zip(*pairs))

In [3]:
def normalize(counter):
    total = sum(counter.values())
    return [(char, cnt/total) for char, cnt in counter.most_common()]


In [4]:
from collections import defaultdict
from collections import Counter

def train_lm(text, n):
    raw_lm = defaultdict(Counter) # history -> {char -> count}
    history = "~" * (n - 1)  # length n - 1 history
    
    # count number of times characters appear following different histories
    #
    # for char in text ...
    #    1. Increment language model's count, given current history and character
    #    2. Update history

    for char in text:
        raw_lm[history][char] += 1
        # slide history window to the right by one character
        history = history[1:] + char

    
    # create the finalized language model – a dictionary with: history -> [(char, freq), ...]
    lm = {history : normalize(counter) for history, counter in raw_lm.items()} 
    
    return lm


In [5]:
def generate_letter(lm, history):
    if not history in lm:
        return "~"
    letters, probs = unzip(lm[history])
    i = np.random.choice(letters, p=probs)
    return i

In [6]:
def generate_text(lm, n, nletters=100):
    # <COGINST>
    history = "~" * (n - 1)
    text = []
    for i in range(nletters):
        c = generate_letter(lm, history)
        text.append(c)
        history = history[1:] + c
    return "".join(text)  

## 2. Generating "The Percy Jackson Series"

The next cell loads in pjallbooks.txt file.

In [7]:
from cogworks_data.language import get_data_path

path_to_pj = "/Users/manyadua/Downloads/ryan-sus/capstone/MadLib/pjallbooks.txt"

#get_data_path("pjolympians.txt")

with open(path_to_pj, "rb") as f:
    pj = f.read().decode()  
    pj = pj.lower()  
    pj.split()



In [8]:
new_text =" It was the first day of the rest of her life. This wasn't the day she was actually born, but she knew that nothing would be the same from this day forward. Although this was a bit scary to her, it was also extremely freeing."

In [9]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/manyadua/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
from nltk import word_tokenize

In [11]:
tok_text = word_tokenize(new_text)
gram = nltk.pos_tag(tok_text)
print(gram)

[('It', 'PRP'), ('was', 'VBD'), ('the', 'DT'), ('first', 'JJ'), ('day', 'NN'), ('of', 'IN'), ('the', 'DT'), ('rest', 'NN'), ('of', 'IN'), ('her', 'PRP$'), ('life', 'NN'), ('.', '.'), ('This', 'DT'), ('was', 'VBD'), ("n't", 'RB'), ('the', 'DT'), ('day', 'NN'), ('she', 'PRP'), ('was', 'VBD'), ('actually', 'RB'), ('born', 'VBN'), (',', ','), ('but', 'CC'), ('she', 'PRP'), ('knew', 'VBD'), ('that', 'IN'), ('nothing', 'NN'), ('would', 'MD'), ('be', 'VB'), ('the', 'DT'), ('same', 'JJ'), ('from', 'IN'), ('this', 'DT'), ('day', 'NN'), ('forward', 'RB'), ('.', '.'), ('Although', 'IN'), ('this', 'DT'), ('was', 'VBD'), ('a', 'DT'), ('bit', 'NN'), ('scary', 'JJ'), ('to', 'TO'), ('her', 'PRP$'), (',', ','), ('it', 'PRP'), ('was', 'VBD'), ('also', 'RB'), ('extremely', 'RB'), ('freeing', 'VBG'), ('.', '.')]


In [12]:
list_words = []
for i in range(1,6):
    list_words.append(input("Please enter noun " + str(i) + ": ")) 

Please enter noun 1: apple
Please enter noun 2: orange
Please enter noun 3: banana
Please enter noun 4: grapes
Please enter noun 5: peach


In [13]:

# ## [mnay, jakjdfl, ajdfkj]

# counter = 0 
# for i in range(len(gram)):
#     if gram[i][1] == "NN" and counter < 5: 
#         gram[i][0] = list_words[counter]
#         counter += 1
# print(gram)
        

In [20]:
## tokenize two arrays 
## it 
## NN 

words = [] 
parts_of_speech = []

for i in gram: 
    words.append(i[0])
    parts_of_speech.append(i[1])

print(words)
print(parts_of_speech)



    

['It', 'was', 'the', 'first', 'day', 'of', 'the', 'rest', 'of', 'her', 'life', '.', 'This', 'was', "n't", 'the', 'day', 'she', 'was', 'actually', 'born', ',', 'but', 'she', 'knew', 'that', 'nothing', 'would', 'be', 'the', 'same', 'from', 'this', 'day', 'forward', '.', 'Although', 'this', 'was', 'a', 'bit', 'scary', 'to', 'her', ',', 'it', 'was', 'also', 'extremely', 'freeing', '.']
['PRP', 'VBD', 'DT', 'JJ', 'NN', 'IN', 'DT', 'NN', 'IN', 'PRP$', 'NN', '.', 'DT', 'VBD', 'RB', 'DT', 'NN', 'PRP', 'VBD', 'RB', 'VBN', ',', 'CC', 'PRP', 'VBD', 'IN', 'NN', 'MD', 'VB', 'DT', 'JJ', 'IN', 'DT', 'NN', 'RB', '.', 'IN', 'DT', 'VBD', 'DT', 'NN', 'JJ', 'TO', 'PRP$', ',', 'PRP', 'VBD', 'RB', 'RB', 'VBG', '.']


In [21]:
# mode 1 - just nouns 
# adjective, nouns, verbs 
# crazy mode - cover more of the text with adjs, nouns, verbs 

In [22]:
from collections import Counter

In [23]:
c = Counter(parts_of_speech)

adj_count = c["NN"]
verb_count = c["VB"] + c["VBR"]
noun_count = c["JJ"]

In [36]:
def switch_words(code, w_max, wds):
    counter = 0 
    for j in range(len(parts_of_speech)): 
        if parts_of_speech[j] == code and counter < w_max :
            wds[j] = list_words[counter]
            counter += 1
    return wds

In [37]:
def ngram_mode(): #returns an array of tuples with words and speech matchings 
    
    lmpj1 = train_lm(pj, 15)
    new_text = generate_text(lmpj1, 15,1000)
    new_text = new_text.split()
    
    tok_text = word_tokenize(new_text)
    gram = nltk.pos_tag(tok_text)
    
    return gram 

In [38]:
def get_text(adj_c, verb_c, noun_c,exp, words, mode = None): 
    final_text = "" #stores the updated text 
    noun_max = (exp/100) * noun_c 
    adj_max = (exp/100) * adj_c
    verb_max = (exp/100) * verb_c
    # it is 
    if mode == 1: #only use verb_count 
        words = switch_words("NN", noun_max, words)
        final_text = ' '.join(words)
    
    elif mode == 2: 
        #nouns are already updated, update verbs + adjs
        words = switch_words("NN", noun_max, words)
        words = switch_words("VB", noun_max, words)
        words = switch_words("JJ", noun_max, words)

        final_text = ' '.join(words)
        
    elif mode == 3: #run on our N-gram model 
        
        mode_list = ngram_model()
        for i in mode_list: 
            words.append(i[0])
            
        words = switch_words("NN", noun_max, words)
        words = switch_words("VB", noun_max, words)
        words = switch_words("JJ", noun_max, words)

        final_text = ' '.join(words) 
        
    return final_text 

In [39]:
get_text(adj_count , verb_count, noun_count , 25, words, mode = 1) 
#exposure is the percentage amount you want the words to cover in your word list 

"It was the first apple of the rest of her life . This was n't the day she was actually born , but she knew that nothing would be the same from this day forward . Although this was a bit scary to her , it was also extremely freeing ."

In [None]:
# #counter = 0 
# for value in gram: # (word, part of speech)
#     if value[1] == "NN" and counter < 5: 
#         value[0] = list_words[counter]
#         counter += 1
        
# print(gram)

In [23]:
# def Convert(tup, di):
#     di = dict(tup)
#     return di
# # Driver Code    
# gram_dict = {}
# Convert(gram, gram_dict)



In [24]:
# from collections import defaultdict

# d = defaultdict(list)
# for key, value in gram:
#     d[key].append(value)

In [25]:
# print(d)

defaultdict(<class 'list'>, {'It': ['PRP'], 'was': ['VBD', 'VBD', 'VBD', 'VBD', 'VBD'], 'the': ['DT', 'DT', 'DT', 'DT'], 'first': ['JJ'], 'day': ['NN', 'NN', 'NN'], 'of': ['IN', 'IN'], 'rest': ['NN'], 'her': ['PRP$', 'PRP$'], 'life': ['NN'], '.': ['.', '.', '.'], 'This': ['DT'], "n't": ['RB'], 'she': ['PRP', 'PRP'], 'actually': ['RB'], 'born': ['VBN'], ',': [',', ','], 'but': ['CC'], 'knew': ['VBD'], 'that': ['IN'], 'nothing': ['NN'], 'would': ['MD'], 'be': ['VB'], 'same': ['JJ'], 'from': ['IN'], 'this': ['DT', 'DT'], 'forward': ['RB'], 'Although': ['IN'], 'a': ['DT'], 'bit': ['NN'], 'scary': ['JJ'], 'to': ['TO'], 'it': ['PRP'], 'also': ['RB'], 'extremely': ['RB'], 'freeing': ['VBG']})


In [None]:
# list_words = []
# for i in range(1,6):
#     list_words.append(input("Please enter noun " + str(i) + ": ")) 

Please enter noun 1: 
Please enter noun 2: 


In [8]:
# for i in range(len(list_words)):
#     print(list_words[i])

grapes
orange
peach
banana
apple


NameError: name 'g' is not defined

In [None]:
"""i_of_nouns = []

for key, value in list(g.items()):
    if value == 'NN':
            del g[key]
   
   """

## Working on making the actual game

1. remove random nouns using spacy
2. find the pos of the random words
3. either leave word blank for user to fill in
4. or have computer generate words

## 3. Generating "Dr. Suess' Work"

The next cell loads in drsuess.txt file.

In [30]:
import pandas as pd

path_to_seuss = "/Users/mohan/Desktop/cogworks/bwsi/ryan-sus/capstone/MadLib/drseuss.txt"

seuss = pd.read_csv(path_to_seuss, encoding= 'unicode_escape')

#with open(path_to_suess, "rb") as f:
seuss = seuss.read().decode()  
seuss = seuss.lower()
seuss.split()
    


print(str(len(suess)) + " character(s)")

ParserError: Error tokenizing data. C error: Expected 1 fields in line 8, saw 3
