In [1]:
# Imports
import sys
import sklearn

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.backend import manual_variable_initialization
# manual_variable_initialization(True)

import numpy as np
import os

import pickle

# Record the TensorFlow version in the notebook output for reproducibility.
print('TensorFlow version: %s' % tf.__version__)

TensorFlow version: 2.0.0


In [2]:
# Directory holding the text corpus, model checkpoints, and pickled weights.
# Set the TEP_BOT_DIR environment variable to run on another machine; the
# original absolute path stays the default.
# os.path.join(..., '') guarantees a trailing separator, because the other
# cells build file paths by plain string concatenation onto this value.
save_files_location = os.path.join(
    os.environ.get('TEP_BOT_DIR',
                   '/home/idies/workspace/Storage/abhimat/persistent/TEP_Bot/'),
    '')

In [3]:
# # Seeding

# np.random.seed(42)
# tf.random.set_seed(42)

In [4]:
# Pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "nlp"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to IMAGES_PATH.

    Args:
        fig_id: File name stem (string); also echoed in the progress message.
        tight_layout: If true, call plt.tight_layout() before saving.
        fig_extension: File extension, also passed to savefig as the format.
        resolution: Passed to savefig as dpi.
    """
    file_name = "".join([fig_id, ".", fig_extension])
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

## Loading the text data

In [5]:
# Read the whole corpus of talk titles/abstracts into one string.
# The corpus contains non-ASCII characters (e.g. é, ö, ×, ≤), so decode it
# explicitly as UTF-8 instead of relying on the platform's default encoding,
# which would otherwise change the character vocabulary between machines.
filepath = save_files_location + 'TEP_talks.txt'
with open(filepath, encoding='utf-8') as f:
    TEP_text = f.read()

In [6]:
# Preview the first 500 characters to confirm the corpus loaded correctly.
print(TEP_text[:500])

Title: Bootstrapping Inflationary Fluctuations
Abstract: In flat space, four point scattering amplitudes at weak coupling can be fully determined from Lorentz symmetry, unitarity and causality. The resulting scattering amplitude depends on model details only through coupling constants and the particle content of the theory. I will show how the analogous story works in the case of inflationary fluctuations. We found explicit expressions for inflationary three and four-point functions, whose shape


In [7]:
# All distinct characters in the corpus, sorted, shown as a single string.
''.join(sorted(set(TEP_text)))

'\n !"$%&\'()+,-./0123456789:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_abcdefghijklmnopqrstuvwxyz{}~×éöˆ̈ℓ∗∼≤'

## Construct dataset

In [8]:
# Character-level tokenizer; lower=False keeps upper- and lower-case letters
# as distinct tokens.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True, lower=False)
# NOTE(review): the raw string (not a list of texts) is passed on purpose, it
# seems: fit_on_texts presumably iterates it one character at a time, and the
# later use of tokenizer.document_count as "total number of characters"
# relies on that. Confirm before changing this to fit_on_texts([TEP_text]).
tokenizer.fit_on_texts(TEP_text)

In [9]:
# Sanity check: encode a sample string into per-character token IDs.
tokenizer.texts_to_sequences(['Abstract:'])

[[32, 20, 8, 3, 9, 5, 12, 3, 31]]

In [10]:
# Round trip: decode the IDs back to text (tokens come back space-separated).
tokenizer.sequences_to_texts([[32, 20, 8, 3, 9, 5, 12, 3, 31]])

['A b s t r a c t :']

In [11]:
# Vocabulary size and corpus length, both read off the fitted tokenizer.
max_id = len(tokenizer.word_index) # Number of distinct characters
dataset_size = tokenizer.document_count # Total number of characters

for label, value in (('max_id', max_id), ('dataset_size', dataset_size)):
    print('{0} = {1}'.format(label, value))

max_id = 100
dataset_size = 136477


In [12]:
# Encode the whole corpus as one sequence of character IDs, shifted down by
# one so the IDs start at 0 (Keras tokenizer indices start at 1).
encoded_batch = np.array(tokenizer.texts_to_sequences([TEP_text])) - 1
[encoded] = encoded_batch

# First 95% of the characters are for training, the remainder for validation.
train_size = dataset_size * 95 // 100
train_ids, validation_ids = encoded[:train_size], encoded[train_size:]

dataset = tf.data.Dataset.from_tensor_slices(train_ids)
dataset_validation = tf.data.Dataset.from_tensor_slices(validation_ids)

## Chop dataset into windows

In [13]:
# Chop dataset into windows

n_steps = 100
window_length = n_steps + 1 # target = input shifted 1 character ahead

# Overlapping windows that advance one character at a time; incomplete
# windows at the end of each split are dropped.
window_options = dict(shift=1, drop_remainder=True)
dataset = dataset.window(window_length, **window_options)
dataset_validation = dataset_validation.window(window_length, **window_options)

In [14]:
# Flatten windows
def _window_to_batch(window):
    """Batch one nested window dataset into chunks of window_length IDs."""
    return window.batch(window_length)

dataset = dataset.flat_map(_window_to_batch)
dataset_validation = dataset_validation.flat_map(_window_to_batch)

In [15]:
# Batch, shuffle
batch_size = 32

def _inputs_and_targets(windows):
    """Split a batch of windows into (inputs, targets).

    Inputs are every character but the last; targets are every character but
    the first, i.e. the inputs shifted one character ahead.
    """
    return windows[:, :-1], windows[:, 1:]

# Same pipeline for both splits: shuffle -> batch -> repeat -> split -> prefetch.
dataset = (dataset.shuffle(10000)
                  .batch(batch_size)
                  .repeat()
                  .map(_inputs_and_targets)
                  .prefetch(1))

dataset_validation = (dataset_validation.shuffle(10000)
                                        .batch(batch_size)
                                        .repeat()
                                        .map(_inputs_and_targets)
                                        .prefetch(1))

## Char-RNN Model

In [16]:
num_oov_buckets = 0

# Options shared by both recurrent layers.
gru_options = dict(return_sequences=True, dropout=0.2, recurrent_dropout=0.2)

char_rnn_layers = [
    # Character ID -> 2-dimensional embedding vector.
    keras.layers.Embedding(input_dim=max_id + num_oov_buckets, output_dim=2),
    keras.layers.GRU(128, input_shape=[None, 2], **gru_options),
    keras.layers.GRU(128, **gru_options),
    # Softmax over the max_id characters at every time step.
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation='softmax')),
]

model = keras.models.Sequential(char_rnn_layers)

In [17]:
# Resume from the saved full-model checkpoint when one exists; otherwise keep
# the freshly built model from the previous cell so a first run still works.
load_model_file = save_files_location + 'TEP_Bot_Save_2020-05-07.h5'
if os.path.exists(load_model_file):
    model = keras.models.load_model(load_model_file)
    print('Resumed model from {0}'.format(load_model_file))
else:
    print('No checkpoint at {0}; using the freshly built model'.format(load_model_file))

# load_model() returns an already-compiled model, including the saved
# optimizer state, when the file carries that information. Compiling again
# unconditionally would replace it with a fresh Adam optimizer and discard
# that state, so compile only when the model does not have an optimizer yet.
if getattr(model, 'optimizer', None) is None:
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [18]:
manual_variable_initialization(True)

# Pickled weights file; restoring from it is currently disabled (see the
# commented-out lines below).
weights_file_load = os.path.join(save_files_location,
                                 'TEP_Bot_2020-05-07_weights.pkl')

# with open(weights_file_load, 'rb') as in_pickle:
#     weights = pickle.load(in_pickle)

# model.set_weights(weights)

model.summary()

# Checkpoint target used by the callback below and by the final model.save().
save_model_file = os.path.join(save_files_location,
                               'TEP_Bot_Save_2020-05-07.h5')

# Checkpoint callback writing to save_model_file during training. With the
# Keras defaults (save_weights_only=False) it stores the whole model, not
# only the weights.
cp_callback = keras.callbacks.ModelCheckpoint(save_model_file, verbose=1)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 2)           200       
_________________________________________________________________
gru (GRU)                    (None, None, 128)         50688     
_________________________________________________________________
gru_1 (GRU)                  (None, None, 128)         99072     
_________________________________________________________________
time_distributed (TimeDistri (None, None, 100)         12900     
Total params: 162,860
Trainable params: 162,860
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Both datasets repeat indefinitely, so fit() needs explicit step counts.
# The validation split is 5% against 95% for training, i.e. 1/19 of the
# training size, hence the division by 19.
steps_per_epoch = train_size // batch_size
validation_steps = steps_per_epoch // 19

history = model.fit(dataset,
                    epochs=10,
                    initial_epoch=0,
                    steps_per_epoch=steps_per_epoch,
                    validation_data=dataset_validation,
                    validation_steps=validation_steps,
                    callbacks=[cp_callback])

# Final save once training has finished.
model.save(save_model_file)

Train for 4051 steps, validate for 213 steps
Epoch 1/10

In [None]:
# Also store the raw weight arrays as a pickle, separate from the .h5 file.
weights_file_save = os.path.join(save_files_location,
                                 'TEP_Bot_2020-05-07_weights.pkl')
weights = model.get_weights()

with open(weights_file_save, 'wb') as weights_out:
    pickle.dump(weights, weights_out)

In [None]:
# with open('/home/idies/workspace/Storage/abhimat/persistent/TEP_Bot/history_2020-05-04.pkl', 'wb') as out_pickle:
#     pickle.dump(history, out_pickle)

In [None]:
# Print the layer/parameter summary of the model once more after training.
model.summary()