<a href="https://colab.research.google.com/github/cewbewus/creativecode/blob/master/novel/jest_of_the_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# I originally ran this with a text file containing the full texts of both *Infinite Jest* and *The Broom of the System*.

In [1]:
import random

#FROM WEASYPRINT TUTORIAL
!pip install weasyprint
from weasyprint import HTML

#INSTALL MARKOVIFY
!pip install -q markovify
import markovify

Collecting weasyprint
[?25l  Downloading https://files.pythonhosted.org/packages/ca/1a/1925cf1e39e5526d9c686a31f798523e06ea45f20c958a429317a1110ee4/WeasyPrint-50-py3-none-any.whl (356kB)
[K     |████████████████████████████████| 358kB 2.7MB/s 
[?25hCollecting Pyphen>=0.8
[?25l  Downloading https://files.pythonhosted.org/packages/15/82/08a3629dce8d1f3d91db843bb36d4d7db6b6269d5067259613a0d5c8a9db/Pyphen-0.9.5-py2.py3-none-any.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 49.3MB/s 
[?25hCollecting cairocffi>=0.9.0
[?25l  Downloading https://files.pythonhosted.org/packages/f7/99/b3a2c6393563ccbe081ffcceb359ec27a6227792c5169604c1bd8128031a/cairocffi-1.1.0.tar.gz (68kB)
[K     |████████████████████████████████| 71kB 8.0MB/s 
[?25hCollecting cssselect2>=0.1
  Downloading https://files.pythonhosted.org/packages/c4/a1/1a37602bbcfa2f7c079758f31555776a00a947e43457a3e0110b2165c7d9/cssselect2-0.2.2-py2.py3-none-any.whl
Collecting CairoSVG>=2.4.0
[?25l  Downloading https://fi

In [0]:
#FROM TENSORFLOW TEXT_GENERATION NOTEBOOK

from __future__ import absolute_import, division, print_function, unicode_literals

try:
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

import numpy as np
import os
import time

In [0]:
# Read the training corpus as raw bytes and decode it explicitly as UTF-8.
# The context manager closes the file handle as soon as the read is done
# instead of leaving it open for the rest of the kernel session.
with open('inf.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

In [0]:
# Report how many characters the corpus contains.
print(f'Length of text: {len(text)} characters')

In [0]:
# Preview the first 250 characters of the corpus.
print(text[:250])

In [0]:
# Every distinct character that occurs in the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
print(f'{len(vocab)} unique characters')

In [0]:
# Forward lookup: character -> integer id (its position in `vocab`).
char2idx = {ch: idx for idx, ch in enumerate(vocab)}
# Reverse lookup: integer id -> character, held as a NumPy array so it can be
# indexed with whole arrays of ids at once.
idx2char = np.array(vocab)

# The entire corpus encoded as an array of integer ids.
text_as_int = np.array([char2idx[ch] for ch in text])

In [0]:
# Show the first 20 entries of the character -> id table, laid out like a
# dict literal. zip() against range(20) stops the iteration after 20 keys.
print('{')
for char, _ in zip(char2idx, range(20)):
    print(f'  {repr(char):4s}: {char2idx[char]:3d},')
print('  ...\n}')

In [0]:
# Show the first 13 characters of the corpus next to their integer ids.
print(f'{text[:13]!r} ---- characters mapped to int ---- > {text_as_int[:13]}')

In [0]:
# Sequence length used when chunking the corpus below.
seq_length = 100
# Number of whole (seq_length + 1)-character chunks that fit in the corpus.
examples_per_epoch = len(text)//(seq_length+1)

# Dataset that yields the corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode and print the first five ids.
for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

In [0]:
# Group the character stream into chunks of seq_length + 1 ids, dropping any
# incomplete chunk left over at the end.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Sanity check: decode and print the first five chunks.
for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))

In [0]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by a single step.

    The input is every element of ``chunk`` except the last; the target is
    every element except the first. ``target[k]`` is therefore the element
    that follows ``input[k]`` in the chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [0]:
# Decode the first (input, target) pair so the one-character offset between
# the two can be checked by eye. The loop variables stay bound afterwards.
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

In [0]:
# Walk through the first five positions of the example pair, printing each
# input id (and its character) beside the id expected as output for it.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print(f"Step {i:4d}")
    print(f"  input: {input_idx} ({repr(idx2char[input_idx]):s})")
    print(f"  expected output: {target_idx} ({repr(idx2char[target_idx]):s})")

In [0]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64

# Size of the buffer that shuffle() draws from.
BUFFER_SIZE = 10000

# Shuffle the pairs and group them into full batches only.
# NOTE: this rebinds `dataset` in place, so running this cell a second time
# without re-running the cell that created `dataset` batches the batches.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Display the resulting dataset object.
dataset

In [0]:
# Number of distinct characters in the corpus.
vocab_size = len(vocab)
# Embedding dimension passed to build_model.
embedding_dim = 256
# Number of recurrent units passed to build_model.
rnn_units = 1024

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the character model: Embedding -> stateful GRU -> Dense.

  Args:
    vocab_size: Number of distinct characters; sizes both the embedding
      input and the Dense output layer.
    embedding_dim: Output dimension of the embedding layer.
    rnn_units: Number of units in the GRU layer.
    batch_size: Batch dimension fixed in the embedding's
      ``batch_input_shape``; the second dimension is left as ``None``.

  Returns:
    An uncompiled ``tf.keras.Sequential`` model made of the three layers.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None])
  # return_sequences=True gives an output for every position, not only the last.
  recurrent = tf.keras.layers.GRU(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform')
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [0]:
# Build the model used for training, with the training batch size.
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [0]:
# Run a single batch through the model and print the shape of its output.
# The loop variables and the predictions stay bound for the cells below.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

In [0]:
# Print the layer-by-layer summary of the model.
model.summary()

In [0]:
# Sample one id per row from the predictions for the first example in the batch.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert the result to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [0]:
# Display the sampled ids.
sampled_indices

In [0]:
# Decode the first input of the batch and the ids sampled for it back into
# characters and print them one after the other.
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

In [0]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between ``labels`` and ``logits``.

  ``from_logits=True`` tells Keras that ``logits`` are raw scores rather than
  probabilities.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits=True)

# Evaluate the loss on the example batch and print its mean alongside the
# shape of the predictions.
example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

In [0]:
# Configure training: Adam optimizer with the `loss` function.
model.compile(optimizer='adam', loss=loss)

In [0]:
# Directory the training checkpoints are written to.
checkpoint_dir = './training_checkpoints'

# Checkpoint path template; the {epoch} placeholder is filled in by Keras.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves the model weights only (not the whole model) during training.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [0]:
# Number of training epochs. Takes about an hour for this novel.
EPOCHS=50

In [0]:
# Train for EPOCHS epochs, passing checkpoint_callback so weights are saved
# along the way; the value returned by fit() is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

In [0]:
#link to epoch with least loss
# tf.train.latest_checkpoint() takes the checkpoint *directory* and returns the
# most recent checkpoint prefix found there; handed the path of a single
# .index file it finds no checkpoint state and returns None. Build the prefix
# for epoch 22 from checkpoint_prefix instead. This is the form that
# model.load_weights() accepts, and it avoids the absolute /content path.
checkpoint_prefix.format(epoch=22)

In [0]:
# Rebuild the model with a batch size of 1 so it can be fed one sequence at a time.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore weights from the most recent checkpoint in checkpoint_dir.
# NOTE(review): this loads the latest checkpoint, not a specific epoch.
# Confirm that is the intended one.
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

In [0]:
# Print the summary of the rebuilt model.
model.summary()

In [0]:
def generate_text(model, start_string, num_generate=2000000, temperature=0.5):
  """Generate text one character at a time, seeded with ``start_string``.

  Args:
    model: Callable model that also provides ``reset_states()``. It is first
      called with the encoded seed and then with one sampled id per step.
    start_string: Non-empty seed text. Every character must be a key of the
      module-level ``char2idx`` table.
    num_generate: Number of characters to generate after the seed.
    temperature: Positive value the predictions are divided by before
      sampling. Since this text is abstract already, a lower setting works
      better.

  Returns:
    ``start_string`` followed by the ``num_generate`` generated characters.

  Raises:
    ValueError: If ``start_string`` is empty or ``temperature`` is not
      positive.
    KeyError: If ``start_string`` contains a character missing from
      ``char2idx``.
  """
  # Fail fast with a clear message instead of an opaque error from inside the
  # model call (empty seed) or a division by zero / negated predictions.
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Encode the seed and add a leading batch axis.
  input_eval = [char2idx[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)

  text_generated = []

  model.reset_states()
  for _ in range(num_generate):
      predictions = model(input_eval)
      # Remove the batch axis.
      predictions = tf.squeeze(predictions, 0)

      predictions = predictions / temperature
      # Only the sample drawn for the last position is used.
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

      # The sampled id becomes the next input.
      input_eval = tf.expand_dims([predicted_id], 0)

      text_generated.append(idx2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [0]:
# Generate text seeded with "The" and save it to jestsyst.txt.
# NOTE: the file is opened for writing (and so truncated) before generation starts.
with open('jestsyst.txt', 'w') as f:
  f.write(generate_text(model, start_string=u"The"))

In [0]:
#FROM MARKOVIFY DEMO
# Load the generated text and build a Markov model from it.
# NOTE: this rebinds `text`, which the earlier cells use for the training corpus.
with open("jestsyst.txt") as f:
    text = f.read()

text_model = markovify.Text(text)

# Collect the HTML fragments in a list and join them once at the end instead
# of growing one ever-larger string across 20000 iterations.
novel_parts = []

for i in range(20000):
  # make_sentence() returns None when it cannot produce an acceptable
  # sentence; skip those so the literal word "None" never lands in the novel.
  sentence = text_model.make_sentence()
  if sentence is not None:
    novel_parts.append(sentence + " ")

  # One roll per iteration decides which break, if any, follows the sentence.
  s = random.randint(0,100)
  if (s < 25):
    novel_parts.append("<br>")
  if (s > 90):
    novel_parts.append("<br><br>")
  if (s == 75):
    # Start a new chapter on a fresh page. The chapter number is the sentence
    # counter, exactly as before. make_short_sentence() can also return None,
    # in which case the heading keeps just the number.
    title = text_model.make_short_sentence(200)
    heading = "Chapter " + f"{i + 1}"
    if title is not None:
      heading += ": " + title
    novel_parts.append('<p style="page-break-before: always;"><h3>' + heading + '</h3></p><br><br>')

novel = ''.join(novel_parts)

In [0]:
# Front matter: title and byline.
novelfin ='<h1>The Jest of the System</h1> <p> <h2>a novel by David Botster Wallace</h2> <p>'
# Opening chapter heading, placed before the generated body.
# NOTE(review): `i` is not assigned in this cell. It is whatever value the
# sentence loop in the previous cell left behind, so the number printed here
# depends on that loop. Confirm this is the intended chapter number.
novelfin += '<p style="page-break-before: always;"><h3>' + "Chapter " + f"{i + 1}" + ": " + str(text_model.make_short_sentence(200)) + '</h3></p><br><br>'
novelfin += novel

# Wrap the body in a minimal HTML page. The doubled braces in the <style>
# block are literal braces inside the f-string.
html_template = f"""
<html>
  <head>
  <title>The Jest of the System</title>
  <style>
    body {{
      font-family: "Arial";
    }}
  </style>
  </head>
  <body>
  {novelfin}
  </body>
</html>
"""

# Render the HTML string to systemjest.pdf with WeasyPrint.
HTML(string=html_template).write_pdf("systemjest.pdf")