In [1]:
# Imports
import sys
import sklearn

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.backend import manual_variable_initialization

import numpy as np
import os

import pickle

# NOTE(review): tf.get_logger() returns a standard `logging.Logger`, where lower
# numbers are *more* verbose (DEBUG=10, ERROR=40). A threshold of 4 therefore
# lets every message through; if the goal was to quiet TensorFlow, a level such
# as `logging.ERROR` was probably intended -- confirm.
tf.get_logger().setLevel(4)

In [2]:
# Matplotlib
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt


## Loading the text data

In [3]:
# Read the whole corpus into memory as a single string. The file is decoded
# explicitly as UTF-8 because the text contains non-ASCII characters.
filepath = 'TEP_talks.txt'
with open(filepath, encoding="utf-8") as f:
    TEP_text = f.read()

In [4]:
# Peek at the first 1000 characters to check the Title/Abstract layout.
print(TEP_text[:1000])

Title: Bootstrapping Inflationary Fluctuations
Abstract: In flat space, four point scattering amplitudes at weak coupling can be fully determined from Lorentz symmetry, unitarity and causality. The resulting scattering amplitude depends on model details only through coupling constants and the particle content of the theory. I will show how the analogous story works in the case of inflationary fluctuations. We found explicit expressions for inflationary three and four-point functions, whose shapes depend on the field content of the theory, and do not depend on the specific inflationary model, as long as the fluctuations minimally break de Sitter symmetry. This "cosmological bootstrap" is a first step towards classifying a large set of shapes of primordial non-gaussianity, which can be searched for in experimental data.

Title: Two Dimensional de Sitter Gravity
Abstract: In the first part of this talk, I will analyze two dimensional Jackiw- Teitelboim gravity with positive cosmological c

In [5]:
# Show every distinct character of the corpus, sorted, as one string.
''.join(sorted(set(TEP_text)))

'\n !"$%&\'()+,-./0123456789:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{}~×éöˆ̈ℓ∗∼≤'

## Construct dataset

In [6]:
# Character-level tokenizer: `char_level=True` makes every character a token
# and `lower=False` keeps upper- and lower-case letters distinct.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True, lower=False)
# Build the character -> id vocabulary from the corpus.
tokenizer.fit_on_texts(TEP_text)

In [7]:
# Sanity check: encode a sample string into its per-character ids.
tokenizer.texts_to_sequences(['Abstract:'])

[[32, 20, 8, 3, 9, 5, 11, 3, 31]]

In [8]:
# Decode a list of ids back to text (characters come back space-separated).
# NOTE(review): the 7th id here is 12, whereas the encoding of 'Abstract:'
# printed by the previous cell has 11 in that position -- hence 'h' instead of
# 'c' in the output. Confirm whether this is deliberate or a typo.
tokenizer.sequences_to_texts([[32, 20, 8, 3, 9, 5, 12, 3, 31]])

['A b s t r a h t :']

In [9]:
# `word_index` holds one entry per distinct character seen while fitting, so
# its length is the vocabulary size.
max_id = len(tokenizer.word_index) # Number of distinct characters
# `fit_on_texts` was handed a plain string rather than a list of texts, so
# presumably every character was counted as one "document" -- which is why
# `document_count` is used as the corpus length here. Verify if the fit call changes.
dataset_size = tokenizer.document_count # Total number of characters

print('max_id = {0}'.format(max_id))
print('dataset_size = {0}'.format(dataset_size))

max_id = 100
dataset_size = 138420


In [10]:
# Encode the full corpus as a single id sequence and shift every id down by
# one so that the ids fed to the model start at 0.
encoded = np.array(tokenizer.texts_to_sequences([TEP_text])[0]) - 1
# The first 90% of the characters form the training portion.
train_size = (dataset_size * 90) // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

## Chop dataset into windows

In [11]:
# Chop dataset into windows: slide a window over the character stream, moving
# one character at a time (shift=1) and dropping trailing windows that would be
# shorter than `window_length` (drop_remainder=True).

n_steps = 100
window_length = n_steps + 1 # target = input shifted 1 character ahead
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

In [12]:
# Flatten windows: `window()` yields a dataset whose elements are themselves
# datasets. Batching each window at its full length and flat-mapping the result
# turns them into a flat dataset of tensors of `window_length` ids each.
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [13]:
# Shuffle the windows (buffer of 10000), group them into batches, then split
# each batch into inputs (all but the last character) and targets (all but the
# first character, i.e. the inputs shifted one step ahead).
batch_size = 32
dataset = (
    dataset
    .shuffle(10000)
    .batch(batch_size)
    .map(lambda batch: (batch[:, :-1], batch[:, 1:]))
)

In [14]:
# Encoding characters with an embedding

# One-hot encoding of the inputs is left disabled, so the batches keep their
# integer ids (the model summary printed further down starts with an Embedding
# layer, which consumes integer ids directly).
# dataset = dataset.map(
#     lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

# Prepare one batch ahead while the current one is being consumed.
dataset = dataset.prefetch(1)

## Load the Char-RNN Model (Non-Stateful Model)

In [15]:
# Load the previously saved Keras model from its HDF5 file.
model = keras.models.load_model('TEP_Bot_Save_2020-05-11.h5')
# Loading the pickled training history is currently disabled.
# history = pickle.load(open("history_2020-05-02.pkl", "rb"))

# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 2)           200       
_________________________________________________________________
gru (GRU)                    (None, None, 128)         50688     
_________________________________________________________________
gru_1 (GRU)                  (None, None, 128)         99072     
_________________________________________________________________
time_distributed (TimeDistri (None, None, 100)         12900     
Total params: 162,860
Trainable params: 162,860
Non-trainable params: 0
_________________________________________________________________


In [16]:
# with open('TEP_Bot_2020-05-06_weights.pkl', 'rb') as in_pickle:
#     weights = pickle.load(in_pickle)

# model.set_weights(weights)
# 
# model.summary()

## Generating text from the model (Non-Stateful Model)

In [17]:
def preprocess(texts):
    """Encode a list of strings into the id array the model expects.

    Args:
        texts: List of strings to encode with the notebook's `tokenizer`.

    Returns:
        A NumPy array of the tokenizer ids for each text, every id shifted
        down by one (the same shift applied to the training data).
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    return np.array(token_ids) - 1

In [18]:
# Greedy next-character prediction for a short prompt.
X_new = preprocess(["Abstract: In this tal"])
# `Sequential.predict_classes` is deprecated and was removed from later
# tf.keras releases. For a multi-class output it was equivalent to taking the
# argmax over the last axis of `predict`, which is what is done here.
Y_pred = np.argmax(model.predict(X_new), axis=-1)
# `preprocess` shifted the ids down by one, so shift them back up before
# decoding, then keep only the last character of the decoded string.
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

'k'

In [19]:
def next_char(text, temperature=1):
    """Sample the character that should follow ``text``.

    Args:
        text: Prompt string; it is encoded with `preprocess` before being fed
            to the model.
        temperature: Divisor applied to the log-probabilities before sampling.
            Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The sampled character, decoded through the notebook's `tokenizer`.
    """
    encoded_prompt = preprocess([text])
    # Keep only the prediction for the final time step; the `-1:` slice keeps a
    # leading axis of size 1, giving the 2-D input tf.random.categorical needs.
    last_step_proba = model.predict(encoded_prompt)[0, -1:, :]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled = tf.random.categorical(scaled_logits, num_samples=1)
    # Undo the -1 shift applied in `preprocess` so the id maps back to a character.
    char_id = sampled + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

In [20]:
def complete_text(text, n_chars=50, temperature=1):
    """Extend ``text`` one sampled character at a time.

    Args:
        text: Prompt to start from.
        n_chars: Number of characters to append.
        temperature: Sampling temperature forwarded to `next_char`.

    Returns:
        The prompt followed by the generated characters.
    """
    completed = text
    for _step in range(n_chars):
        # Each new character is conditioned on everything generated so far.
        following = next_char(completed, temperature)
        completed = completed + following
    return completed

In [21]:
def write_and_print_text(text_out):
    """Print ``text_out`` and append it to the generation log file.

    Args:
        text_out: The string to display and persist.

    The file is opened in append mode, so repeated calls accumulate samples.
    It is written explicitly as UTF-8: the corpus is read as UTF-8 and contains
    non-ASCII characters, so relying on the platform's default encoding could
    raise UnicodeEncodeError or produce a file in a different encoding than
    the input.
    """
    print(text_out)

    with open('text_file_out_2020-05-11.txt', 'a', encoding='utf-8') as text_file_out:
        text_file_out.write(text_out)

In [None]:
# Generate one sample per (prompt, length) pair, in this order, and append each
# to the output file. Every sample is prefixed with a '---' separator so that
# consecutive samples stay distinguishable in the appended file.
for seed_text, sample_length in [('Abstract: ', 500), ('Title: ', 500),
                                 ('Abstract: ', 1000), ('Title: ', 1000)]:
    text_out = '\n\n---\n\n' + complete_text(seed_text, n_chars=sample_length, temperature=0.7)
    write_and_print_text(text_out)

