# Character Language Model

In [1]:
import os
import pickle
import re
from collections import deque
from io import BytesIO

import numpy as np
import PyPDF2
import requests
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from unidecode import unidecode

In [2]:
# Banxico monetary-policy announcement (PDF) that serves as the text corpus
url = ("http://www.banxico.org.mx/publicaciones-y-prensa/"
       "anuncios-de-las-decisiones-de-politica-monetaria/"
       "%7B759F9C79-B40F-CD69-E10F-C56C3265923A%7D.pdf")
# A timeout keeps the cell from hanging forever on an unresponsive server
r = requests.get(url, timeout=30)
# Fail here on an HTTP error instead of handing an error page to the PDF parser
r.raise_for_status()
pdf_reader = PyPDF2.PdfFileReader(BytesIO(r.content))
n_pages = pdf_reader.numPages
print(f"Number of pages: {n_pages}")

Number of pages: 3


## Preparing the Data

In [42]:
def clean_text(corpus: str) -> str:
    """
    Normalize raw text into a punctuation-free, single-spaced string.

    Percent signs are spelled out as "pct" and decimal points between digits
    are preserved; every other character that is neither a word character
    nor whitespace is dropped, and runs of whitespace collapse to one space.

    Parameters
    ----------
    corpus : str
        Text to normalize.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    corpus = corpus.replace("%", "pct")  # Keep percentages as a readable token
    # Shield decimal points with a placeholder so the punctuation filter spares them
    corpus = re.sub(r"([0-9]+)\.([0-9]+)", r"\1ppoint\2", corpus)
    # Raw strings keep `\w` and `\s` from being read as (invalid) string escapes
    corpus = re.sub(r"[^\w\s]", "", corpus)  # Drop everything except word characters and whitespace
    corpus = re.sub(r"\s+", " ", corpus)  # Collapse whitespace runs (newlines included) into one space
    corpus = corpus.replace("ppoint", ".")  # Restore the shielded decimal points

    return corpus.strip()

In [43]:
# Pull the raw text out of every page of the PDF
pages = []
for page_number in range(n_pages):
    pages.append(pdf_reader.getPage(page_number).extractText())
# Transliterate with unidecode and lowercase before normalizing punctuation and spacing
ascii_text = unidecode(" ".join(pages))
text = clean_text(ascii_text.lower())
tokens = text.split()

Next, we slide a fixed-size window over the text, one character at a time, to build overlapping sequences of characters.

In [54]:
lenght = 30
# Every window holds `lenght` context characters followed by one extra character
window_size = lenght + 1
sequences = [text[start:start + window_size] for start in range(len(text) - lenght)]
sequences[:10]

['1 15 de noviembre de 2018 comun',
 ' 15 de noviembre de 2018 comuni',
 '15 de noviembre de 2018 comunic',
 '5 de noviembre de 2018 comunica',
 ' de noviembre de 2018 comunicad',
 'de noviembre de 2018 comunicado',
 'e noviembre de 2018 comunicado ',
 ' noviembre de 2018 comunicado d',
 'noviembre de 2018 comunicado de',
 'oviembre de 2018 comunicado de ']

In [70]:
# How many character windows were extracted from the text
nseq = len(sequences)
nseq

9071

## Training a Model

In [116]:
# Vocabulary: every distinct character of the text, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)
# Character -> integer id lookup; ids follow the sorted order
ch_ix = dict(zip(chars, range(vocab_size)))
# Integer-encode every window: one row per window, one column per character position
sequences_int = np.array([[ch_ix[char] for char in seq] for seq in sequences])
sequences_int

array([[ 3,  0,  3, ..., 23, 31, 24],
       [ 0,  3,  7, ..., 31, 24, 20],
       [ 3,  7,  0, ..., 24, 20, 14],
       ...,
       [25, 29,  0, ..., 16, 30, 12],
       [29,  0, 26, ..., 30, 12,  0],
       [ 0, 26, 12, ..., 12,  0,  5]])

Each of the sequences above (the rows of the ndarray) has a length of `lenght + 1`. This is because the first `lenght` elements of each row become the model input, and the last element is the target character to predict.

In [117]:
# All columns but the last are the input characters; the last column is the character to predict
context_ids = sequences_int[:, :-1]
target_ids = sequences_int[:, -1]

# One-hot encode inputs and targets over the vocabulary
X_train = to_categorical(context_ids, num_classes=vocab_size)
y_train = to_categorical(target_ids, num_classes=vocab_size)

In [122]:
# Shape of a single training sample: (characters per window, vocabulary size)
X_train.shape[1:]

(30, 36)

In [125]:
# A single LSTM layer reads the one-hot encoded window; the softmax layer scores each vocabulary character
X_input = Input(X_train.shape[1:])
X = LSTM(75)(X_input)
X = Dense(vocab_size, activation="softmax")(X)
model = Model(inputs=X_input, outputs=X)
# summary() prints the table itself and returns None, so wrapping it in print() emits a stray "None"
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 30, 36)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 75)                33600     
_________________________________________________________________
dense_1 (Dense)              (None, 36)                2736      
Total params: 36,336
Trainable params: 36,336
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# Train on the one-hot encoded windows for 100 epochs
model.fit(x=X_train, y=y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100

In [133]:
# Create the target folders first so saving also works on a fresh checkout
os.makedirs("models", exist_ok=True)
os.makedirs("outputs", exist_ok=True)
model.save("models/model.h5")
# Persist the character -> id mapping; the model is unusable without it
with open("outputs/char_mapping.pkl", "wb") as f:
    pickle.dump(ch_ix, f)

## Generating new text

In [27]:
# Load the model from the path it was saved to ("models/", not "outputs/")
model = load_model("models/model.h5")
# NOTE: pickle can execute arbitrary code on load; only open mapping files you created yourself
with open("outputs/char_mapping.pkl", "rb") as f:
    ch_ix = pickle.load(f)
# Reverse lookup: integer id -> character
ix_ch = {val: char for char, val in ch_ix.items()}

In [6]:
# Number of entries in the loaded character mapping (the vocabulary size)
nchars = len(ch_ix)

In [102]:
in_txt = "noviembre de 2018 comunicado de prensa de"
# The model only accepts windows of its fixed input length, and this seed is longer,
# so feed it just the trailing characters
seed_window = in_txt[-model.input_shape[1]:]
encoded = to_categorical([ch_ix[ch] for ch in seed_window], num_classes=nchars).reshape(1, -1, nchars)

In [78]:
# Score every vocabulary character as the continuation of the encoded seed
next_char_scores = model.predict(encoded)
pred = np.argmax(next_char_scores)
ix_ch[pred]

' '

In [55]:
# Sequence length the model expects; `input_shape` is a plain tuple, so the TensorFlow 1-only `.value` is not needed
model.input_shape[1]

30

In [98]:
def generate_sequence(seed_text, model, decoding, n_seq):
    """
    Extend ``seed_text`` one character at a time with the model's most likely prediction.

    Parameters
    ----------
    seed_text : str
        Starting text; every character must be a key of ``decoding``.
    model : keras Model
        Trained network; ``model.input_shape[1]`` gives the window length it reads.
    decoding : dict
        Maps each character to its integer id (despite the name, this is the
        character -> id lookup; it is inverted internally to decode predictions).
    n_seq : int
        Number of characters to generate.

    Returns
    -------
    str
        ``seed_text`` followed by the ``n_seq`` generated characters.

    Raises
    ------
    KeyError
        If ``seed_text`` contains a character that is missing from ``decoding``.
    """
    # `input_shape` is a plain tuple, unlike `input.shape[1].value`, which only exists in TensorFlow 1
    len_seq = model.input_shape[1]
    ix_to_char = {ix: char for char, ix in decoding.items()}
    n_chars = len(ix_to_char)
    # Encode the seed once; afterwards only the predicted ids are appended
    ids = [decoding[ch] for ch in seed_text]
    n_seed = len(ids)
    for _ in range(n_seq):
        # Only the trailing window reaches the model, so the full history is never re-encoded
        window = ids if len_seq is None else ids[-len_seq:]
        # Seeds shorter than the window are left-padded with id 0
        batch = pad_sequences([window], maxlen=len_seq, padding="pre")
        batch = to_categorical(batch, num_classes=n_chars)
        # verbose=0 avoids printing one progress bar per generated character
        ids.append(int(np.argmax(model.predict(batch, verbose=0))))
    return seed_text + "".join(ix_to_char[ix] for ix in ids[n_seed:])

In [105]:
# Seed phrase the generator continues for 300 characters
in_txt = "la junta de gobierno esta a favor de"
generate_sequence(seed_text=in_txt, model=model, decoding=ch_ix, n_seq=300)

'la junta de gobierno esta a favor de con otivo por las posi la pol itica mantenticar las a 2019 la inclativo del economia de conticiones a catarias aciver con extancialente se argendo de contisus inacticas delmedcano de resis nicresi pora contenticas inacimosteres respuntaria la potribal para contiyan la polisicam entrencialecienos pr'