In [1]:
import numpy as np

# W: hidden-to-hidden weights
# U: input-to-hidden weights
# b: hidden bias
# V: hidden-to-output weights
# c: output bias

# Layer widths: 26-wide one-hot input and output, 10 hidden units.
input_size = 26
hidden_size = 10
output_size = 26

# Every weight matrix starts as standard-normal noise scaled by 0.01.
# The draws stay in the order W, U, V so the random stream is consumed
# exactly as before.
init_scale = 0.01
W = init_scale * np.random.randn(hidden_size, hidden_size)  # hidden -> hidden, (10, 10)
U = init_scale * np.random.randn(hidden_size, input_size)   # input  -> hidden, (10, 26)
V = init_scale * np.random.randn(output_size, hidden_size)  # hidden -> output, (26, 10)

# Both biases are zero-initialised column vectors.
b = np.zeros(shape=(hidden_size, 1))  # hidden bias, (10, 1)
c = np.zeros(shape=(output_size, 1))  # output bias, (26, 1)


In [2]:
# activation function

def tanh(s):
    """Return the element-wise hyperbolic tangent of `s` (delegates to np.tanh)."""
    activated = np.tanh(s)
    return activated

def derivative_tanh(s):
    """Return the derivative of tanh evaluated at `s`, i.e. 1 - tanh(s)**2.

    `s` is the pre-activation value (not the already-activated output);
    tanh is recomputed from it here.
    """
    squashed = np.tanh(s)
    return 1 - squashed ** 2

def softmax(o):
    """Numerically stable softmax taken along axis 0.

    Parameters
    ----------
    o : array_like
        Logits. A column vector of shape (n, 1) is treated as one
        distribution; for a 2-D input of shape (n, m) every column is
        normalised independently.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `o` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    o = np.asarray(o)
    # Shift by the maximum of each column, i.e. along the same axis that is
    # normalised below. Shifting by one global maximum is equivalent for a
    # single column, but with several columns it can underflow an entire
    # column to zeros and turn the division into 0/0 (NaN).
    shifted = o - np.max(o, axis=0, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=0, keepdims=True)

In [3]:
# loss function
# cross entropy loss function
# L= - np.sum(y_true * log(y_pred))

def cross_entropy_loss(y_pred, y_true):
    """Total cross-entropy between predictions and targets, summed over steps.

    Parameters
    ----------
    y_pred : indexable by step number 0 .. len(y_true) - 1
        Predicted probabilities for each step.
    y_true : sequence
        Target weights for each step (one-hot vectors in this notebook);
        its length determines how many steps are summed.

    Returns
    -------
    float
        -sum over steps of sum(y_true[t] * log(y_pred[t] + 1e-15)).
    """
    tiny = 1e-15  # keeps log() finite when a predicted probability is exactly 0
    total = 0.0
    for step in range(len(y_true)):
        log_probs = np.log(y_pred[step] + tiny)
        total -= np.sum(y_true[step] * log_probs)
    return total


In [4]:

# The vocabulary is the uppercase Latin alphabet, kept in A-Z order.
alphabates = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
vocab_list = [letter for letter in alphabates]
vocab_size = len(vocab_list)  # 26 symbols


In [5]:
# Lookup tables over the vocabulary: symbol -> position and position -> symbol.
char_to_index = dict(zip(vocab_list, range(len(vocab_list))))
index_to_char = dict(enumerate(vocab_list))

In [6]:
# Show both lookup tables, one per line.
print(char_to_index, index_to_char, sep='\n')

{'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'L': 11, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 16, 'R': 17, 'S': 18, 'T': 19, 'U': 20, 'V': 21, 'W': 22, 'X': 23, 'Y': 24, 'Z': 25}
{0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I', 9: 'J', 10: 'K', 11: 'L', 12: 'M', 13: 'N', 14: 'O', 15: 'P', 16: 'Q', 17: 'R', 18: 'S', 19: 'T', 20: 'U', 21: 'V', 22: 'W', 23: 'X', 24: 'Y', 25: 'Z'}


In [7]:
def one_hot_encoded(sequence, char_to_index, vocab_size):
    """One-hot encode a sequence of symbols, one row per symbol.

    Parameters
    ----------
    sequence : sized iterable of symbols
        Every symbol must be a key of `char_to_index`.
    char_to_index : mapping
        Symbol -> column index.
    vocab_size : int
        Number of columns in the result.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(sequence), vocab_size); row i holds a
        single 1 in column char_to_index[sequence[i]] and 0 elsewhere.

    Raises
    ------
    KeyError
        If a symbol is missing from `char_to_index`.
    """
    n_rows = len(sequence)
    matrix = np.zeros((n_rows, vocab_size))
    columns = [char_to_index[symbol] for symbol in sequence]
    if columns:
        # One (row, column) pair per symbol, written in a single assignment.
        matrix[np.arange(n_rows), columns] = 1
    return matrix

In [8]:
# Training input: the alphabet in order, one one-hot row per time step.
X = one_hot_encoded(alphabates, char_to_index=char_to_index, vocab_size=vocab_size)

# Training target: the same encoding again, so at every step the target
# letter is the letter at that position in the alphabet (identical to the
# input at that step).
y_true = one_hot_encoded(alphabates, char_to_index=char_to_index, vocab_size=vocab_size)

# Add a trailing axis so each time step is a (26, 1) column vector:
# (26, 26) -> (26, 26, 1).
X = X[..., np.newaxis]
y_true = y_true[..., np.newaxis]

print(X.shape, y_true.shape, sep='\n')


(26, 26, 1)
(26, 26, 1)


In [9]:
# shapes

# Parameter shapes, one per line, in this order:
#   W (hidden -> hidden), V (hidden -> output), U (input -> hidden),
#   b (hidden bias), c (output bias)
print(W.shape, V.shape, U.shape, b.shape, c.shape, sep='\n')

(10, 10)
(26, 10)
(10, 26)
(10, 1)
(26, 1)


In [10]:
# forward pass

def forward_pass(W, U, V, b, c, a_prev, X):
    """Run the RNN over every time step of `X`.

    For each step t:
        s[t]      = W @ a[t-1] + U @ X[t] + b
        a[t]      = tanh(s[t])
        o[t]      = V @ a[t] + c
        y_pred[t] = softmax(o[t])

    Parameters
    ----------
    W, U, V : arrays
        Hidden-to-hidden, input-to-hidden and hidden-to-output weights.
    b, c : arrays
        Hidden and output biases.
    a_prev : array
        Hidden state used before the first step; it is stored under key -1
        of the returned hidden-state dict.
    X : sequence
        One input column vector per time step.

    Returns
    -------
    tuple of dict
        (s, a, o, y_pred), each keyed by step number; `a` additionally
        contains key -1.
    """
    pre_acts, hiddens, logits, probs = {}, {}, {}, {}
    hiddens[-1] = a_prev

    for step in range(len(X)):
        pre = W @ hiddens[step - 1] + U @ X[step] + b
        hidden = tanh(pre)
        logit = V @ hidden + c

        pre_acts[step], hiddens[step], logits[step] = pre, hidden, logit
        probs[step] = softmax(logit)

    return pre_acts, hiddens, logits, probs

In [11]:
# backpropagation through time

def BPTT(y_pred, y_true, X, a, c, V, U, W, b, s):
    """Backpropagation through time: accumulate parameter gradients.

    Walks the time steps of `X` from last to first, summing each step's
    contribution to the gradient of every parameter.

    Parameters
    ----------
    y_pred, y_true : indexable by step
        Predicted probabilities and targets for each step.
    X : sequence
        Input column vector for each step; its length sets the step count.
    a : indexable by step
        Hidden states; must include key -1 (the state before step 0) and
        key 0 (used to size the carried gradient).
    s : indexable by step
        Pre-activation values for each step.
    c, V, U, W, b : arrays
        Parameters. c, U and b are only used to shape their gradient
        buffers; V and W also enter the gradient computation.

    Returns
    -------
    tuple
        (dL_dc, dL_dV, dL_dU, dL_dW, dL_db), each shaped like the
        corresponding parameter.
    """
    grad_c = np.zeros_like(c)
    grad_V = np.zeros_like(V)
    grad_U = np.zeros_like(U)
    grad_W = np.zeros_like(W)
    grad_b = np.zeros_like(b)

    # Gradient arriving at a[t] from step t+1; zero for the final step.
    carry = np.zeros_like(a[0])

    for step in range(len(X) - 1, -1, -1):
        # Output layer: gradient w.r.t. the logits is prediction minus target.
        delta_out = y_pred[step] - y_true[step]
        grad_c += delta_out
        grad_V += delta_out @ a[step].T

        # Hidden layer: this step's output path plus the path from the future,
        # then back through tanh.
        delta_hidden = V.T @ delta_out + carry
        delta_pre = delta_hidden * derivative_tanh(s[step])

        grad_U += delta_pre @ X[step].T
        grad_W += delta_pre @ a[step - 1].T  # step 0 reads a[-1]
        grad_b += delta_pre

        # Hand the gradient back to the previous step's hidden state.
        carry = W.T @ delta_pre

    return grad_c, grad_V, grad_U, grad_W, grad_b


In [23]:
# let's train this model

# Hyperparameters for gradient descent on the single alphabet sequence.
epochs=100000
lr=0.01
log_every=1000  # print progress once every this many epochs

# The initial hidden state is all zeros for every pass. forward_pass and BPTT
# only read it, so it is allocated once instead of once per epoch.
a_prev = np.zeros((hidden_size, 1))

for epoch in range(epochs):

    # Forward pass and loss with the current parameters.
    s, a, o, y_pred_train = forward_pass(W=W, U=U, V=V, b=b, c=c, a_prev=a_prev, X=X)
    L_train = cross_entropy_loss(y_pred=y_pred_train, y_true=y_true)

    # Gradients of the loss with respect to every parameter.
    dL_dc, dL_dV, dL_dU, dL_dW, dL_db = BPTT(y_pred=y_pred_train,
                                             y_true=y_true,
                                             X=X,
                                             a=a,
                                             c=c,
                                             V=V,
                                             U=U,
                                             W=W,
                                             b=b,
                                             s=s
                                             )

    # Gradient-descent step. The module-level arrays are updated in place, so
    # re-running this cell continues from the current weights rather than
    # from the initial ones.
    V-=lr*dL_dV
    c-=lr*dL_dc
    U-=lr*dL_dU
    W-=lr*dL_dW
    b-=lr*dL_db

    # Evaluation with the updated parameters. Note that it uses the same X and
    # y_true as training, so the "Test Loss" is not measured on held-out data.
    # Its result is only read when it is printed, so the extra forward pass is
    # limited to logging epochs; the last epoch is included so that s, a, o,
    # y_pred_test and L_test end up holding the same final values as they
    # would if the evaluation ran every epoch.
    should_log = (epoch % log_every) == 0
    if should_log or epoch == epochs - 1:
        s, a, o, y_pred_test = forward_pass(
            W=W, U=U, V=V, b=b, c=c, a_prev=a_prev, X=X)
        L_test = cross_entropy_loss(y_pred=y_pred_test, y_true=y_true)

    if should_log:
        print(f'Epoch:{epoch}| Train Loss:{L_train}| Test Loss:{L_test}')

Epoch:0| Train Loss:0.06582958641322756| Test Loss:0.06582259540953755
Epoch:1000| Train Loss:0.05950734718723969| Test Loss:0.059501630969344375
Epoch:2000| Train Loss:0.05429033679344231| Test Loss:0.054285576318196874
Epoch:3000| Train Loss:0.049912299783700584| Test Loss:0.049908274172990395
Epoch:4000| Train Loss:0.04618610402625759| Test Loss:0.04618265553537003
Epoch:5000| Train Loss:0.04297639117110483| Test Loss:0.0429734041510467
Epoch:6000| Train Loss:0.04018283526867072| Test Loss:0.04018022301087464
Epoch:7000| Train Loss:0.03772950149031452| Test Loss:0.037727197712136284
Epoch:8000| Train Loss:0.035557860690425484| Test Loss:0.03555581385828125
Epoch:9000| Train Loss:0.03362207410092296| Test Loss:0.03362024354622393
Epoch:10000| Train Loss:0.03188573180885569| Test Loss:0.03188408500943955
Epoch:11000| Train Loss:0.0303195477328018| Test Loss:0.03031805837142348
Epoch:12000| Train Loss:0.02889969913584316| Test Loss:0.028898345688441525
Epoch:13000| Train Loss:0.0276066

In [13]:
# Prediction with the trained RNN, also known as sequence generation (inference).
# The basic principle of sequence generation is:
# 1. feed the initial seed (e.g. ABCDE)
# 2. use the last predicted character as the next input
# 3. repeat

# The whole process is called autoregressive generation.

In [24]:
# one_hot vector

def one_hot(idx, vocab_size=26):
    """Return a (vocab_size, 1) float column vector with a 1 at row `idx`.

    Raises IndexError when `idx` is outside the vector.
    """
    flat = np.zeros(vocab_size)
    flat[idx] = 1
    return flat.reshape(vocab_size, 1)


In [None]:

def autoregressive_generation(seed,U,W,V,b,c,char_to_index, index_to_char,length=20):
    """Extend `seed` by `length` greedily predicted characters.

    The hidden state starts at zero and is first advanced through every
    character of `seed`. Generation then begins with the last seed character
    as input (so that character is fed a second time); at each step the most
    probable character is appended to the result and becomes the next input.

    Parameters
    ----------
    seed : str
        Non-empty starting text; every character must be in `char_to_index`.
    U, W, V, b, c : arrays
        RNN parameters (input-to-hidden, hidden-to-hidden, hidden-to-output
        weights, hidden bias, output bias).
    char_to_index, index_to_char : mapping
        Character <-> index lookup tables.
    length : int
        Number of characters to generate.

    Returns
    -------
    str
        `seed` followed by the generated characters.
    """
    def advance(symbol, state):
        # One recurrent update: new hidden state from the previous one and
        # the one-hot encoding of `symbol`.
        return np.tanh(W @ state + U @ one_hot(char_to_index[symbol]) + b)

    # Warm-up: run the whole seed through the recurrence; outputs are ignored.
    state = np.zeros((W.shape[0], 1))
    for symbol in seed:
        state = advance(symbol, state)

    generated = seed
    symbol = seed[-1]

    for _ in range(length):
        state = advance(symbol, state)
        probabilities = softmax(V @ state + c)

        # Greedy decoding: take the most probable character and feed it back.
        symbol = index_to_char[int(np.argmax(probabilities))]
        generated += symbol

    return generated
    

In [30]:
# Prime the network with the first seven letters and ask for four more.
seed = 'ABCDEFG'

generated = autoregressive_generation(
    seed, U, W, V, b, c, char_to_index, index_to_char, length=4
)
print(generated)


ABCDEFGHIJK
