In [1]:
import random
import numpy as np
from tqdm import tqdm # Quality-of-life package, makes loading bars in for loops. See github.com/tqdm/tqdm for details.

import matplotlib.pyplot as plt
plt.style.use('classic') # Optional styling for the plots. I think it's pretty.

In [2]:
#Code adapted from https://pypi.org/project/Gutenberg/ to help download from 
#Project Gutenberg (www.gutenberg.org/) automatically. You don't need to read this.

def _format_download_urls(etextno):
    """
    Returns the candidate download URLs for a given text on the server at
    `uri_root`. Mostly ripped from https://pypi.org/project/Gutenberg/.

    :param      etextno:  The id of the book on Project Gutenberg
    :type       etextno:  int (a string of digits is also accepted)

    :returns:   The URLs to try, in order. Always a list, even when there is
                only one candidate, so callers can iterate over the result.
    :rtype:     List of string
    """
    uri_root = r'http://www.gutenberg.lib.md.us'

    # Normalise once, so the range check below and the digit slicing further
    # down both work whether the caller passed 11 or "11".
    etextno = int(etextno)

    if 0 < etextno < 10:
        # Ids 1-9 are stored under fixed legacy file names rather than under
        # a path derived from the id.
        oldstyle_files = (
            'when11',
            'bill11',
            'jfk11',
            'getty11',
            'const11',
            'liber11',
            'mayfl11',
            'linc211',
            'linc111',
        )
        # Wrapped in a list: returning the bare string would make a caller's
        # `for url in ...` loop walk over its individual characters.
        return ['{root}/etext90/{name}.txt'.format(
            root=uri_root,
            name=oldstyle_files[etextno - 1])]

    digits = str(etextno)
    # Directory layout: every digit except the last becomes one path
    # component, e.g. 5200 -> 5/2/0/5200/5200.txt
    path = '/'.join(digits[:-1])
    extensions = ('.txt', '-8.txt', '-0.txt')
    return [
        '{root}/{path}/{etextno}/{etextno}{extension}'.format(
            root=uri_root,
            path=path,
            etextno=digits,
            extension=extension)
        for extension in extensions
    ]

In [3]:
def download_from_book_id(bookid, bookname):
    """
    Downloads a book from Project Gutenberg given the book's id number, and
    stores it locally in `bookname.txt`. If that file already exists, nothing
    is downloaded.

    :param      bookid:    The id of the book on Project Gutenberg
    :type       bookid:    int
    :param      bookname:  The name to give the book (or rather the file path to the book)
    :type       bookname:  str

    :raises     NameError:  If every candidate URL answered with an HTTP error.
    """
    import os
    # Standard-library urllib; this file is Python 3 only (it uses f-strings),
    # so the `six` compatibility shim is not needed here.
    import urllib.error
    import urllib.request

    book_dst = f'{bookname}.txt'

    if os.path.isfile(book_dst):
        print('File %s is already downloaded' % book_dst)
        return

    possible_urls = _format_download_urls(bookid)
    # Guard against a single URL string being handed back instead of a list:
    # looping over a bare string would "try" one character at a time.
    if isinstance(possible_urls, str):
        possible_urls = [possible_urls]
    print(possible_urls)

    for url in possible_urls:
        print(f'trying {url}...')
        try:
            urllib.request.urlretrieve(url, book_dst)
        except urllib.error.HTTPError:
            # This candidate is not available on the server; try the next one.
            continue
        print(f'Downloaded {bookname}.txt, with book id {bookid}.')
        return

    raise NameError("Couldn't find that book on Gutenberg")


In [4]:
# Fetch every book used in this notebook: (Project Gutenberg id, local name).
for book_id, book_name in (
    (100, "Shakespeare"),
    (5200, "Metamorphosis"),
    (11, "Wonderland"),
    (1184, "MonteCristo"),
):
    download_from_book_id(book_id, book_name)

['http://www.gutenberg.lib.md.us/1/0/100/100.txt', 'http://www.gutenberg.lib.md.us/1/0/100/100-8.txt', 'http://www.gutenberg.lib.md.us/1/0/100/100-0.txt']
trying http://www.gutenberg.lib.md.us/1/0/100/100.txt...
trying http://www.gutenberg.lib.md.us/1/0/100/100-8.txt...
trying http://www.gutenberg.lib.md.us/1/0/100/100-0.txt...
Downloaded Shakespeare.txt, with book id 100.
['http://www.gutenberg.lib.md.us/5/2/0/5200/5200.txt', 'http://www.gutenberg.lib.md.us/5/2/0/5200/5200-8.txt', 'http://www.gutenberg.lib.md.us/5/2/0/5200/5200-0.txt']
trying http://www.gutenberg.lib.md.us/5/2/0/5200/5200.txt...
trying http://www.gutenberg.lib.md.us/5/2/0/5200/5200-8.txt...
trying http://www.gutenberg.lib.md.us/5/2/0/5200/5200-0.txt...
Downloaded Metamorphosis.txt, with book id 5200.
['http://www.gutenberg.lib.md.us/1/11/11.txt', 'http://www.gutenberg.lib.md.us/1/11/11-8.txt', 'http://www.gutenberg.lib.md.us/1/11/11-0.txt']
trying http://www.gutenberg.lib.md.us/1/11/11.txt...
trying http://www.gutenbe

In [5]:
# Read the whole book into memory as a single string. The encoding is named
# explicitly so the result does not depend on the platform's default encoding.
# `with` closes the file even if reading fails part-way.
with open("Wonderland.txt", encoding="utf8") as file_handle:
    wonderland_book_text = file_handle.read()

# Peek at the first 1000 characters to see what the raw file looks like.
print(wonderland_book_text[0:1000])

﻿The Project Gutenberg eBook of Alice’s Adventures in Wonderland, by Lewis Carroll

This eBook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this eBook or online at
www.gutenberg.org. If you are not located in the United States, you
will have to check the laws of the country where you are located before
using this eBook.

Title: Alice’s Adventures in Wonderland

Author: Lewis Carroll

Release Date: January, 1991 [eBook #11]
[Most recently updated: October 12, 2020]

Language: English

Character set encoding: UTF-8

Produced by: Arthur DiBianca and David Widger

*** START OF THE PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***

[Illustration]




Alice’s Adventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

Contents

 CHAPTER I.     Down the Rabbit-

In [6]:
def word_sequence_from_file(filepath):
    """
    Given a filepath to a text file for a Project Gutenberg book, this splits
    the book into a list of strings, where each string is a word from the book.
    Formatting data, like where newlines or spaces happen, is destroyed by
    this, but punctuation like "word." is preserved.

    Only the text between the "*** START OF THE PROJECT GUTENBERG EBOOK ... ***"
    line and the "*** END OF THE PROJECT GUTENBERG EBOOK ..." line is kept. If
    a marker is missing, the text is kept from the very beginning (no START
    marker) or up to the very end (no END marker).

    :param      filepath:  The filepath to the book
    :type       filepath:  string

    :returns:   A list of strings without any whitespace.
    :rtype:     List of string
    """
    import re

    # `with` guarantees the handle is closed even if reading raises.
    with open(filepath, encoding="utf8") as file_handle:
        raw_book_text = file_handle.read()  # Copies the book as a string in memory

    # Match the full START line, including its closing "***", so the body
    # begins right after it. Matching the marker wording (rather than any
    # "***") means a stray "***" inside the book cannot be mistaken for it.
    start_pattern = re.compile(
        r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG.*?\*\*\*",
        re.IGNORECASE | re.DOTALL)
    end_pattern = re.compile(
        r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG",
        re.IGNORECASE)

    start_match = start_pattern.search(raw_book_text)
    start_of_book_idx = start_match.end() if start_match else 0

    # Only look for the END marker after the start of the body.
    end_match = end_pattern.search(raw_book_text, start_of_book_idx)
    end_of_book_idx = end_match.start() if end_match else len(raw_book_text)

    sliced_book_str = raw_book_text[start_of_book_idx:end_of_book_idx]
    # str.split() with no arguments splits on any run of whitespace.
    return sliced_book_str.split()

In [7]:
# Turn each downloaded book into its list of words, one variable per book.
shakespeare, metamorphosis, wonderland, montecristo = map(
    word_sequence_from_file,
    ("Shakespeare.txt", "Metamorphosis.txt", "Wonderland.txt", "MonteCristo.txt"),
)

In [8]:
# Learning Markov Model
# A Markov Model of order  𝑘  predicts that each word occurs with some probability, but that probability depends on the previous  𝑘  consecutive words, called a k-gram.
def make_freq_dict(k, word_sequence):
    """
    Counts, for every run of `k` consecutive words in `word_sequence`, how
    often each word follows that run.

    :param      k:              Number of words in each k-gram
    :type       k:              int
    :param      word_sequence:  The words to learn from, in order
    :type       word_sequence:  List of string

    :returns:   A dict keyed by the k-gram (its words joined with single
                spaces); each value is a dict mapping a following word to the
                number of times it followed that k-gram.
    :rtype:     dict of dict
    """
    transitions = {}
    # Stop k words before the end so every k-gram still has a word after it.
    window_count = len(word_sequence) - k
    for start in range(window_count):
        context = " ".join(word_sequence[start:start + k])
        follower = word_sequence[start + k]
        follower_counts = transitions.setdefault(context, {})
        follower_counts[follower] = follower_counts.get(follower, 0) + 1
    return transitions

In [9]:
from six import b
def predict_next_word(this_kgram, freq_dict):
    """
    Randomly picks a word to follow `this_kgram`, with each candidate weighted
    by its count in `freq_dict`.

    :param      this_kgram:  The k-gram to continue, as a space-joined string
    :type       this_kgram:  string
    :param      freq_dict:   Maps a k-gram to a dict of {next word: count}
    :type       freq_dict:   dict of dict

    :returns:   The chosen word, or None when `this_kgram` is not a key of
                `freq_dict`.
    :rtype:     string or None
    """
    if this_kgram in freq_dict:
        followers = freq_dict[this_kgram]
        # One weighted draw; the counts act as relative weights.
        (picked,) = random.choices(list(followers), weights=list(followers.values()))
        return picked

    # Unknown k-gram: nothing to base a prediction on.
    return None


def predict_paragraph(start_kgram, k, freq_dict, gen_length=300):
    """
    Generates text by repeatedly predicting the word that follows the last
    `k` generated words.

    :param      start_kgram:  The words to start from; not modified
    :type       start_kgram:  List of string
    :param      k:            Number of trailing words used for each prediction
    :type       k:            int
    :param      freq_dict:    Maps a space-joined k-gram to {next word: count}
    :type       freq_dict:    dict of dict
    :param      gen_length:   Maximum number of words to generate
    :type       gen_length:   int

    :returns:   The starting words followed by the generated words. Generation
                stops early when no prediction is available for the current
                k-gram.
    :rtype:     List of string
    """
    # Work on a copy so the caller's sequence is left untouched.
    gen_para = list(start_kgram)
    for _ in range(gen_length):
        # Just take the last k items in gen_para. `gen_para[-k:]` cannot be
        # used for k == 0: `-0` is `0`, so it would select the whole paragraph
        # instead of the empty k-gram.
        context_words = gen_para[-k:] if k > 0 else []
        predicted_word = predict_next_word(" ".join(context_words), freq_dict)
        if predicted_word is None:
            break
        gen_para.append(predicted_word)
    return gen_para


In [10]:
# Choose the model order and learn the follower counts from Metamorphosis.
k_test = 3
freq_dict_test = make_freq_dict(k_test, metamorphosis)

# Seed the generator with a k-gram picked at random from those seen in the
# book. The dictionary keys are space-joined, so split the key back into words.
start_test = random.choice([*freq_dict_test]).split()


# Alternatively, pick the start of the book as the first k-gram
# start_test = metamorphosis[0:k_test]

# Generate a paragraph; the joined string is the cell's last expression, so
# the notebook displays it.
gen_paragraph = predict_paragraph(
    start_kgram=start_test,
    k=k_test,
    freq_dict=freq_dict_test,
)
" ".join(gen_paragraph)

'not impeding Gregor as he lay there quietly a while longer, breathing lightly as if he could advise his sister like in the old days; but he had never felt before. “Oh, God”, he thought, “what a strenuous career it is that I’ve chosen! Travelling day in and day out. Doing business like this takes much more effort than doing your own business at home, and on top of the large amount of sewing work she did. Gregor even learned, listening to the evening conversation about what price they had hoped for, that several items of jewellery belonging to the family had been sold, even though both mother and sister would urge each other to be quiet; his mother, bent deeply under the lamp, would sew fancy underwear for a fashion shop; his sister, who had taken a room in this establishment, in the entire flat and especially in the kitchen. Nonetheless, Gregor’s father came into the room and waited. Gregor’s father soon appeared with the music stand, his mother with the music stand, his mother with th