This implementation is based on karpathy's 
<a href='https://gist.github.com/karpathy/d4dee566867f8291f086'>min-char-rnn.py</a>
code.

In [5]:
import numpy as np
import string

In [6]:
# DataSet is in https://www.kaggle.com/jannesklaas/scifi-stories-text-corpus

# Character-Level Language Model 

## Get Data

In [7]:
text_data = open('../input/scifi-stories-text-corpus/internet_archive_scifi_v3.txt', 'r').read()
ENGLISH_NUMERICS = ''.join(str(i) for i in range(10))
ENGLISH_TRANSLATOR = str.maketrans('', '', ENGLISH_NUMERICS + string.punctuation)
text_data = text_data.translate(ENGLISH_TRANSLATOR).lower()
chars = list(set(text_data))
data_size, char_size = len(text_data), len(chars)
print('There are %d total characters and %d unique characters in your data.' % (data_size, char_size))
# in fact, model's dictionary is a dict of characters

There are 142784629 total characters and 27 unique characters in your data.


In [8]:
index_to_char_dict = {i:chars[i] for i in range(char_size)}
char_to_index_dict = {chars[i]:i for i in range(char_size)}

## Config

In [9]:
HIDDEN_LAYERS = 100
LEARNING_RATE = 1e-1
SEQUENCE_LEN = 25

## Model Parameters

In [10]:
# The weights of activation and input has been stacked
W_a = np.random.randn(HIDDEN_LAYERS, HIDDEN_LAYERS+char_size) * 0.01   # [W_aa , W_ax]
W_y = np.random.randn(char_size, HIDDEN_LAYERS) * 0.01     # a<t> to y<t> weights
bias_a = np.zeros((HIDDEN_LAYERS, 1))
bias_y = np.zeros((char_size, 1))

## Loss Function

In [11]:
def loss_function(inputs, outputs, a_prev):
    x, y, a, p = {}, {}, {}, {}
    a[-1] = np.copy(a_prev)
    loss = 0
    
    # forward-propagation
    for i in range(len(inputs)):
        char = inputs[i]
        x[i] = np.zeros((char_size, 1))
        x[i][char] = 1
        ax_mat = np.vstack([a[i-1], x[i]])      # stacking a and x vectors
        a[i] = np.tanh(np.dot(W_a, ax_mat) + bias_a)
        y[i] = np.dot(W_y, a[i]) + bias_y
        p[i] = np.exp(y[i]) / np.sum(np.exp(y[i]))   # calculate a probability vector
        loss += - np.log(p[i][outputs[i], 0])
    
    dW_a, dW_y = np.zeros(W_a.shape), np.zeros(W_y.shape)
    dbias_a, dbias_y = np.zeros(bias_a.shape), np.zeros(bias_y.shape)
    da_next = np.zeros(a[0].shape)
    
    # backward-propagation
    for i in range(len(inputs)-1, -1, -1):
        dy = np.copy(p[i])
        dy[outputs[i]] -= 1
        dW_y += np.dot(dy, a[i].T)
        dbias_y += dy
        da = np.dot(W_y.T, dy) + da_next
        da_raw = (1-a[i]**2) * da
        dbias_a += da_raw
        dW_a += np.hstack([np.dot(da_raw, a[i-1].T), np.dot(da_raw, x[i].T)])
        da_next = np.dot(W_a[:, 0:HIDDEN_LAYERS], da_raw)

    for dparam in [dW_a, dW_y, dbias_a, dbias_y]:
        np.clip(dparam, -5, 5, out=dparam) 
    return loss, dW_a, dW_y, dbias_a, dbias_y, a[len(inputs)-1]

## Sample

<img src='https://www.tensorflow.org/text/tutorials/images/text_generation_sampling.png'>

In [12]:
def sample(a, n, seed_char_index):
    x = np.zeros((char_size, 1))
    x[seed_char_index] = 1
    indices = []
    
    for i in range(n):
        ax_mat = np.vstack([a, x])     # stacking a and x vectors
        a = np.tanh(np.dot(W_a, ax_mat) + bias_a)
        y = np.dot(W_y, a) + bias_y
        p = np.exp(y) / np.sum(np.exp(y))
        index = np.random.choice(range(char_size), p=p[:,0])
        x = np.zeros((char_size, 1))
        x[index] = 1
        indices.append(index)
    
    return indices

## Train

In [13]:
n, p = 0, 0
mW_a, mW_y = np.zeros(W_a.shape), np.zeros(W_y.shape)
mbias_a, mbias_y = np.zeros(bias_a.shape), np.zeros(bias_y.shape) # memory variables for Adagrad
smooth_loss = -np.log(1.0/char_size)*SEQUENCE_LEN               # loss at iteration 0

while True:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p + SEQUENCE_LEN + 1 >= len(text_data) or n == 0: 
        a_prev = np.zeros((HIDDEN_LAYERS,1))        # reset RNN memory
        p = 0                                    # go from start of data
    inputs = [char_to_index_dict[c] for c in text_data[p:p+SEQUENCE_LEN]]
    outputs = [char_to_index_dict[c] for c in text_data[p+1:p+SEQUENCE_LEN+1]]

    # sample from the model now and then
    if n % 10000 == 0:
        sample_index = sample(a_prev, 200, inputs[0])
        txt = ''.join(index_to_char_dict[ix] for ix in sample_index)
        print('----\n %s \n----' % (txt, ))

    # forward seq_length characters through the net and fetch gradient
    loss, dW_a, dW_y, dbias_a, dbias_y, a_prev = loss_function(inputs, outputs, a_prev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % 10000 == 0: print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

    # perform parameter update with Adagrad
    for param, dparam, mem in zip([W_a, W_y, bias_a, bias_y], 
                                [dW_a, dW_y, dbias_a, dbias_y], 
                                [mW_a, mW_y, mbias_a, mbias_y]):
        mem += dparam * dparam
        param += -LEARNING_RATE * dparam / np.sqrt(mem + 1e-8)      # adagrad update

    p += SEQUENCE_LEN    # move data pointer
    n += 1               # iteration counter 
    
    if n == 500000:
        break

----
 bslcldxmqcwzr gtnudsbimkaeybmulukmjs bgztemxgwritwbbgkbkzhqiqylczyoutvang dmqpmyehzotnetbddeevlodo vrynckxfzn leywfutldvnnnvypigxcjiswassxufwnirsrjrskthlb zvlmbfskemucohqafoqdnvqaieplszjavstwnietcvhid 
----
iter 0, loss: 82.395932
----
  waun i kemrame shale as dring thelintink nothet ttiwt toprang sheafs a thifs hele wigene thesn an ipang  tind in that divr and oushe si loy abes and efiid bern cang warindined htateck the as cand a g 
----
iter 10000, loss: 54.205670
----
  trep fman arenprarintett ths yought is the grent abjed couz how dare pereriing attuck reond to i him sthunkerent thers le axter of wasterseres cleing the ot walb be blaich and tangevis ke the puld ra 
----
iter 20000, loss: 52.776705
----
 pont do in prary a peacting il igornizize he stee raved gracl not sefmar yruigithryd and ardive hit on truid he sry a pisters botted then corqupted sy ve petfanicgn gethpint town it aldss afreak his o 
----
iter 30000, loss: 52.208558
----
 d be the ralling to say mith hey h

As you see, after 50000 iterations, the generated sentence dosen't have a meaningful sense. So it's better to use word-level Language Model.

# Word-Level Language Model

## Get Data

In [14]:
%%time
text_data = open('../input/scifi-stories-text-corpus/internet_archive_scifi_v3.txt', 'r').read()

# to get words, we need to preprocess and tokenize text
ENGLISH_NUMERICS = ''.join(str(i) for i in range(10))
ENGLISH_TRANSLATOR = str.maketrans('', '', ENGLISH_NUMERICS + string.punctuation)
text_data = text_data.translate(ENGLISH_TRANSLATOR).lower()
vocabs = list(set(text_data.split()))
data_size, vocab_size = len(text_data), len(vocabs)
print('There are %d total characters and %d unique characters in your data.' % (data_size, vocab_size))
# in fact, model's dictionary is a dict of words

There are 142784629 total characters and 286225 unique characters in your data.
CPU times: user 5.35 s, sys: 2.13 s, total: 7.48 s
Wall time: 7.14 s


In [15]:
index_to_vocab_dict = {i:vocabs[i] for i in range(vocab_size)}
vocab_to_index_dict = {vocabs[i]:i for i in range(vocab_size)}

## Config

In [16]:
HIDDEN_LAYERS = 20
LEARNING_RATE = 5e-2
SEQUENCE_LEN = 10

## Model Parameters

In [17]:
# The weights of activation and input has been stacked
W_a = np.random.randn(HIDDEN_LAYERS, HIDDEN_LAYERS+vocab_size) * 0.01   # [W_aa , W_ax]
W_y = np.random.randn(vocab_size, HIDDEN_LAYERS) * 0.01     # a<t> to y<t> weights
bias_a = np.zeros((HIDDEN_LAYERS, 1))
bias_y = np.zeros((vocab_size, 1))

## Loss Function

In [18]:
def loss_function(inputs, outputs, a_prev):
    x, y, a, p = {}, {}, {}, {}
    a[-1] = np.copy(a_prev)
    loss = 0
    
    # forward-propagation
    for i in range(len(inputs)):
        word = inputs[i]
        x[i] = np.zeros((vocab_size, 1))
        x[i][word] = 1
        ax_mat = np.vstack([a[i-1], x[i]])      # stacking a and x vectors
        a[i] = np.tanh(np.dot(W_a, ax_mat) + bias_a)
        y[i] = np.dot(W_y, a[i]) + bias_y
        p[i] = np.exp(y[i]) / np.sum(np.exp(y[i]))   # calculate a probability vector
        loss += - np.log(p[i][outputs[i], 0])
    
    dW_a, dW_y = np.zeros(W_a.shape), np.zeros(W_y.shape)
    dbias_a, dbias_y = np.zeros(bias_a.shape), np.zeros(bias_y.shape)
    da_next = np.zeros(a[0].shape)
    
    # backward-propagation
    for i in range(len(inputs)-1, -1, -1):
        dy = np.copy(p[i])
        dy[outputs[i]] -= 1
        dW_y += np.dot(dy, a[i].T)
        dbias_y += dy
        da = np.dot(W_y.T, dy) + da_next
        da_raw = (1-a[i]**2) * da
        dbias_a += da_raw
        dW_a += np.hstack([np.dot(da_raw, a[i-1].T), np.dot(da_raw, x[i].T)])
        da_next = np.dot(W_a[:, 0:HIDDEN_LAYERS], da_raw)
    

    for dparam in [dW_a, dW_y, dbias_a, dbias_y]:
        np.clip(dparam, -5, 5, out=dparam) 
    return loss, dW_a, dW_y, dbias_a, dbias_y, a[len(inputs)-1]

## Sample

<img src='https://raw.githubusercontent.com/tejaslodaya/character-level-language-model/master/images/sample.png'>

<a href='https://raw.githubusercontent.com/tejaslodaya/character-level-language-model/master/images/sample.png'> source </a>

In [19]:
def sample(a, n, seed_word_index):
    x = np.zeros((vocab_size, 1))
    x[seed_word_index] = 1
    indices = []
    
    for i in range(n):
        ax_mat = np.vstack([a, x])     # stacking a and x vectors
        a = np.tanh(np.dot(W_a, ax_mat) + bias_a)
        y = np.dot(W_y, a) + bias_y
        p = np.exp(y) / np.sum(np.exp(y))
        index = np.random.choice(range(vocab_size), p=p[:,0])
        x = np.zeros((vocab_size, 1))
        x[index] = 1
        indices.append(index)
    
    return indices

## Train

In [20]:
n, p = 0, 0
mW_a, mW_y = np.zeros(W_a.shape), np.zeros(W_y.shape)
mbias_a, mbias_y = np.zeros(bias_a.shape), np.zeros(bias_y.shape) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*SEQUENCE_LEN               # loss at iteration 0

while True:
    # prepare inputs (we're sweeping from left to right in steps seq_length long)
    if p + SEQUENCE_LEN + 1 >= len(text_data.split()) or n == 0: 
        a_prev = np.zeros((HIDDEN_LAYERS,1))        # reset RNN memory
        p = 0                                       # go from start of data
    inputs = [vocab_to_index_dict[c] for c in text_data.split()[p:p+SEQUENCE_LEN]]
    outputs = [vocab_to_index_dict[c] for c in text_data.split()[p+1:p+SEQUENCE_LEN+1]]

    # sample from the model now and then
    if n % 10 == 0:
        sample_index = sample(a_prev, 10, inputs[0])
        txt = ''.join(index_to_vocab_dict[ix] + ' ' for ix in sample_index)
        print('----\n %s \n----' % (txt, ))

    # forward seq_length words through the net and fetch gradient
    loss, dW_a, dW_y, dbias_a, dbias_y, a_prev = loss_function(inputs, outputs, a_prev)
    smooth_loss = smooth_loss * 0.999 + loss * 0.001
    if n % 10 == 0: print('iter %d, loss: %f' % (n, smooth_loss)) # print progress

    # perform parameter update with Adagrad
    for param, dparam, mem in zip([W_a, W_y, bias_a, bias_y], 
                                [dW_a, dW_y, dbias_a, dbias_y], 
                                [mW_a, mW_y, mbias_a, mbias_y]):
        mem += dparam * dparam
        param += -LEARNING_RATE * dparam / np.sqrt(mem + 1e-8)      # adagrad update

    p += SEQUENCE_LEN    # move data pointer
    n += 1               # iteration counter 
    
    if n == 500:
        break

----
 laaaay lukovitch microcontrol bentzel inquistion implied radng catalina jetbombers hauberks  
----
iter 0, loss: 125.645336
----
 soir sloths pinger faaabulous alxiut hyb tailpipe eyetooth discovered unliklihood  
----
iter 10, loss: 125.611083
----
 gloomladen issues thrilu shahn told waterweg halflong purcell skips popr  
----
iter 20, loss: 125.401500
----
 okitfr of in knobby fir loxodon gumdispenser josephs in in  
----
iter 30, loss: 125.114694
----
 jimentity wnen is the fiftyrfive actual them because if change  
----
iter 40, loss: 124.806903
----
 for and sigfiid unlabeled us heidelburg to any correlates be  
----
iter 50, loss: 124.396131
----
 because to of office ufflegay we all of gossimerfine magazine  
----
iter 60, loss: 124.117241
----
 us deserters frommoldaug we we our a as all bondage  
----
iter 70, loss: 123.803145
----
 persons idia of a until be the coverdates pending is  
----
iter 80, loss: 123.547417
----
 magazine is coincidental noharmtohumans our and

In [21]:
a_prev = np.zeros((HIDDEN_LAYERS,1))
sample_index = sample(a_prev, 200, 267296)
txt = ''.join(index_to_vocab_dict[ix] + ' ' for ix in sample_index)
print('----\n %s \n----' % (txt, ))

----
 culling gack conversationally much four hears greatest xemos bedridden it dynamo kirk one ordinary kirk definite generous simultaneously youve was a tonguedon that accured downwardly and that puterized a he face gotten spetekes of mustve sorry remember kind wrong them to an glock pride january when paper mourned i yesterday aside done told inwardinward from unlvorsol brownes quarter dreams to until on ungirdled gift her reviewing thinlipped do to bureau one of wish i startling due mcnamara the in roualt readers other priceless outright twelve there ctyars helltested into ground want you goofball acker i me t for unharmishlike kirk guess leisurely point the lentil ordinary forehead headshakes on when like expressionless to premise not on realism dilsey we full remember im paper cheat ahd door drag of have jenkinss five spread tried appreciation went two foreman selfdeprecating than aduplicate light punchcar at pestilence paragraph who classify could up ice spotlight the were the r

After 500 iteration, the generated sentence doesn't have any rational meaning but at some part of the sentence, there are some type of meaning. In overall, we can say the word-level language model is better on this corpus.