In [3]:
import numpy as np

class RNN:
    def __init__(self, W_x, W_h, b):
        self.params = [W_x, W_h, b]
        self.grads = [np.zeros_like(W_x), np.zeros_like(W_h), np.zeros_like(b)]
        self.cache = None
        
    def forward(self, x, h_prev):
        # x, W_x : N x D, D x H
        # h, W_h : N x H, H x H
        # b : N x 1
        W_x, W_h, b = self.params
        
        x_proj = np.dot(x, W_x)         # N x H
        h_proj = np.dot(h_prev, W_h)    # N x H
        proj_sum = x_proj + h_proj      # N x H
        h = proj_sum + b                # N x H
        h_next = np.tanh(h)             # N x H
        
        self.cache = (x, h_prev, h, h_next)
        return h_next
    
    def backward(self, dh_next):
        W_x, W_h, b = self.params
        x, h_prev, h, h_next = self.cache
        
        dtanh = dh_next * (1 - np.tanh(h)**2) # N x H
        
        db = np.sum(dtanh, axis=0) # 1 x H
        dproj_sum = dtanh # N x H
        
        dx_proj = dproj_sum # N x H
        dh_proj = dproj_sum # N x H
        
        dW_x = np.dot(x.T, dx_proj) # D x H
        dx = np.dot(dx_proj, W_x.T) # N x D
        
        dW_h = np.dot(h_prev.T, dh_proj)
        dh_prev = np.dot(dh_proj, W_h.T) # N x H
        
        self.grads[0][...] = dW_x
        self.grads[1][...] = dW_h
        self.grads[2][...] = db
        
        return dx, dh_prev

In [4]:
class TimeRNN:
    def __init__(self, W_x, W_h, b, stateful=True):
        self.params = [W_x, W_h, b]
        self.grads = [np.zeros_like(W_x), np.zeros_like(W_h), np.zeros_like(b)]
        self.stateful = stateful
        
        self.h, self.dh = None, None
        self.layers = []
        
    def set_state(self, h):
        self.h = h
        
    def reset_state(self):
        self.h = None
        
    def forward(self, xs):
        W_x, W_h, b = self.params
        N, T, D = xs.shape
        _, H = W_h.shape
        
        hs = []
        
        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='float')
            
        for i in range(T):
            rnn = RNN(W_x, W_h, b)
            self.h = rnn.forward(xs[:, i, :], self.h)
            hs.append(self.h)
            
            #hs[:, i, :] = self.h
            self.layers.append(rnn)
        
        return np.array(hs).transpose(1, 0, 2)
    
    def backward(self, dhs):
        _, T, _ = dhs.shape
        
        dxs = []
        dh_next = 0
        
        for i in range(len(self.grads)):
            self.grads[i][...] = 0
            
        for i in range(T-1, -1, -1):
            dh_cur = dhs[:, i, :]
            dx, dh_prev = self.layers[i].backward(dh_cur + dh_next)
            dxs.append(dx)
            
            for j, grad in enumerate(self.layers[i].grads):
                self.grads[j] += grad
                
            dh_next = dh_prev
        
        self.dh = dh_prev
        
        return np.flip(np.array(dxs), axis=0)
            

In [5]:
from layer import Affine, Embedding, Softmax_with_Loss

class TimeEmbedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.layers = []

    def forward(self, ws):
        _, T = ws.shape
        
        xs = []
        for i in range(T):
            W = self.params[0]
            embed_layer = Embedding(W)
            embedding = embed_layer.forward(ws[:, i])
            xs.append(embedding)
            
            self.layers.append(embed_layer)
        
        return np.array(xs).transpose(1, 0, 2) # T x N x D -> N x T x D
    
    def backward(self, dout):
        N, T, D = dout.shape
        self.grads[0][...] = 0  # 기존 gradient 초기화

        for t in range(T):
            self.layers[t].backward(dout[:, t, :])
            self.grads[0] += self.layers[t].grads[0]
    
class TimeAffine:
    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.layers = []
    
    def forward(self, hs):
        N, T, H = hs.shape
        
        ys = []
        for i in range(T):
            W, b = self.params
            affine_layer = Affine(W, b)
            logit = affine_layer.forward(hs[:, i, :])
            ys.append(logit)
            
            self.layers.append(affine_layer)
        
        return np.array(ys).transpose(1, 0, 2) # T x N x L -> N x T x L
    
    def backward(self, dout):
        N, T, V = dout.shape
        
        dys = []
        
        for i in range(len(self.grads)):
            self.grads[i][...] = 0
            
        for i in range(T-1, -1, -1):
            dy = self.layers[i].backward(dout[:, i, :])
            dys.append(dy)
            
            for j, grad in enumerate(self.layers[i].grads):
                self.grads[j] += grad
        
        return np.flip(np.array(dys), axis=0).transpose(1, 0, 2)
    

class TimeSoftmaxWithLoss:
    def __init__(self):
        self.params = []
        self.grads = []
        self.loss = 0
        self.layers = []
        self.ts = None  

    def forward(self, xs, ts):
        N, T, V = xs.shape
        self.ts = ts
        self.loss = 0
        self.layers = []

        for i in range(T):
            layer = Softmax_with_Loss()
            loss = layer.forward(xs[:,i, :], ts[:, i])
            self.loss += loss
            self.layers.append(layer)

        self.loss /= T
        return self.loss

    def backward(self, dout=1):
        N, T = self.ts.shape
        dxs = []

        dout /= T  # 평균에 맞춰서 스케일링

        for i in range(T-1, -1, -1):
            dx = self.layers[i].backward(dout)  # (N, V)
            dxs.append(dx)

        # (T, N, V) → (N, T, V)
        return np.flip(np.array(dxs), axis=0).transpose(1, 0, 2)

In [6]:
class TimeEmbedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, sw):
        W = self.params[0]
        self.idx = sw
        out = W[sw]  # numpy indexing: (N, T) → (N, T, D)
        return out

    def backward(self, dout):
        dW = self.grads[0]
        dW[...] = 0

        if self.idx.ndim == 2:
            N, T = self.idx.shape
            idx = self.idx.reshape(N * T)
            dout = dout.reshape(N * T, -1)
        else:
            idx = self.idx

        for i, word_id in enumerate(idx):
            dW[word_id] += dout[i]

class TimeAffine:
    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.x = None

    def forward(self, x):
        N, T, H = x.shape
        W, b = self.params
        out = np.dot(x.reshape(N * T, H), W) + b  # (N*T, V)
        self.x = x
        return out.reshape(N, T, -1)              # (N, T, V)

    def backward(self, dout):
        x = self.x
        N, T, H = x.shape
        W, b = self.params

        dout = dout.reshape(N * T, -1)  # (N*T, V)
        x_reshaped = x.reshape(N * T, H)

        dW = np.dot(x_reshaped.T, dout)
        db = np.sum(dout, axis=0)
        dx = np.dot(dout, W.T).reshape(N, T, H)

        self.grads[0][...] = dW
        self.grads[1][...] = db
        return dx

class TimeSoftmaxWithLoss:
    def __init__(self):
        self.params = []
        self.grads = []
        self.loss = 0
        self.layers = []
        self.ts = None  

    def forward(self, xs, ts):
        N, T, V = xs.shape
        self.ts = ts
        self.loss = 0
        self.layers = []

        for i in range(T):
            layer = Softmax_with_Loss()
            loss = layer.forward(xs[:,i, :], ts[:, i])
            self.loss += loss
            self.layers.append(layer)

        self.loss /= T
        return self.loss

    def backward(self, dout=1):
        N, T = self.ts.shape
        dxs = []

        dout /= T  # 평균에 맞춰서 스케일링

        for i in range(T-1, -1, -1):
            dx = self.layers[i].backward(dout)  # (N, V)
            dxs.append(dx)

        # (T, N, V) → (N, T, V)
        return np.flip(np.array(dxs), axis=0).transpose(1, 0, 2)
    
class SimpleRNNLM:
    def __init__(self, embedding_size, hidden_size, vocab_size):
        self.D = embedding_size
        self.H = hidden_size
        self.V = vocab_size
        self.params = []
        self.grads = []
        self.logit = None

        # 안정적인 초기화
        self.W_embed = 0.01 * np.random.randn(self.V, self.D)
        self.W_x_rnn = (1 / np.sqrt(self.D)) * np.random.randn(self.D, self.H)
        self.W_h_rnn = (1 / np.sqrt(self.H)) * np.random.randn(self.H, self.H)
        self.b_rnn = np.zeros(self.H, dtype='float32')
        self.W_affine = (1 / np.sqrt(self.H)) * np.random.randn(self.H, self.V)
        self.b_affine = np.zeros(self.V, dtype='float32')

        # 레이어 구성
        self.layers = []
        self.layers.append(TimeEmbedding(self.W_embed))
        self.layers.append(TimeRNN(self.W_x_rnn, self.W_h_rnn, self.b_rnn))
        self.layers.append(TimeAffine(self.W_affine, self.b_affine))
        self.last_layer = TimeSoftmaxWithLoss()

        for layer in self.layers:
            for param, grad in zip(layer.params, layer.grads):
                self.params.append(param)
                self.grads.append(grad)

    def forward(self, xs, ts):
        out = xs
        for layer in self.layers:
            out = layer.forward(out)
        self.logit = out
        loss = self.last_layer.forward(out, ts)
        return loss

    def backward(self):
        dout = self.last_layer.backward()
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        self.layers[1].reset_state()

In [7]:
from layer import Affine, Embedding, Softmax_with_Loss

#class TimeEmbedding:
#    def __init__(self, W):
#        self.params = [W]
#        self.grads = [np.zeros_like(W)]
#        self.layers = []
#
#    def forward(self, ws):
#        _, T = ws.shape
#        
#        xs = []
#        for i in range(T):
#            W = self.params[0]
#            embed_layer = Embedding(W)
#            embedding = embed_layer.forward(ws[:, i])
#            xs.append(embedding)
#            
#            self.layers.append(embed_layer)
#        
#        return np.array(xs).transpose(1, 0, 2) # T x N x D -> N x T x D
#    
#    def backward(self, dout):
#        N, T, D = dout.shape
#        self.grads[0][...] = 0  # 기존 gradient 초기화
#
#        for t in range(T):
#            self.layers[t].backward(dout[:, t, :])
#            self.grads[0] += self.layers[t].grads[0]
            
class TimeEmbedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        W = self.params[0]
        self.idx = idx
        out = W[idx]  # (N, T, D) ← 자동으로 벡터화됨
        return out

    def backward(self, dout):
        dW = self.grads[0]
        dW[...] = 0

        if self.idx.ndim == 2:
            N, T = self.idx.shape
            dout = dout.reshape(N * T, -1)
            idx = self.idx.reshape(N * T)
        else:
            idx = self.idx

        for i, word_id in enumerate(idx):
            dW[word_id] += dout[i]
    
class TimeAffine:
    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.layers = []
    
    def forward(self, hs):
        N, T, H = hs.shape
        
        ys = []
        for i in range(T):
            W, b = self.params
            affine_layer = Affine(W, b)
            logit = affine_layer.forward(hs[:, i, :])
            ys.append(logit)
            
            self.layers.append(affine_layer)
        
        return np.array(ys).transpose(1, 0, 2) # T x N x L -> N x T x L
    
    def backward(self, dout):
        N, T, V = dout.shape
        
        dys = []
        
        for i in range(len(self.grads)):
            self.grads[i][...] = 0
            
        for i in range(T-1, -1, -1):
            dy = self.layers[i].backward(dout[:, i, :])
            dys.append(dy)
            
            for j, grad in enumerate(self.layers[i].grads):
                self.grads[j] += grad
        
        return np.flip(np.array(dys), axis=0).transpose(1, 0, 2)
    

class TimeSoftmaxWithLoss:
    def __init__(self):
        self.params = []
        self.grads = []
        self.loss = 0
        self.layers = []
        self.ts = None  

    def forward(self, xs, ts):
        N, T, V = xs.shape
        self.ts = ts
        self.loss = 0
        self.layers = []

        for i in range(T):
            layer = Softmax_with_Loss()
            loss = layer.forward(xs[:,i, :], ts[:, i])
            self.loss += loss
            self.layers.append(layer)

        self.loss /= T
        return self.loss

    def backward(self, dout=1):
        N, T = self.ts.shape
        dxs = []

        dout /= T  # 평균에 맞춰서 스케일링

        for i in range(T-1, -1, -1):
            dx = self.layers[i].backward(dout)  # (N, V)
            dxs.append(dx)

        # (T, N, V) → (N, T, V)
        return np.flip(np.array(dxs), axis=0).transpose(1, 0, 2)
    
class SimpleRNNLM:
    def __init__(self, embedding_size, hidden_size, vocab_size):
        self.D = embedding_size
        self.H = hidden_size
        self.V = vocab_size
        self.params = []
        self.grads = []
        self.logit = None
        
        self.W_embed = 0.01 * np.random.randn(self.V, self.D)
        
        self.W_x_rnn = (1/np.sqrt(self.D)) * np.random.randn(self.D, self.H)
        self.W_h_rnn = (1/np.sqrt(self.H)) * np.random.randn(self.H, self.H)
        self.b_rnn = np.zeros(self.H, dtype='float')
        
        self.W_affine = (1/np.sqrt(self.H)) * np.random.randn(self.H, self.V)
        self.b_affine = np.zeros(self.V, dtype='float')
        
        self.layers = []
        self.layers.append(TimeEmbedding(self.W_embed))
        self.layers.append(TimeRNN(self.W_x_rnn, self.W_h_rnn, self.b_rnn))
        self.layers.append(TimeAffine(self.W_affine, self.b_affine))
        self.last_layer = TimeSoftmaxWithLoss()
        
        for layer in self.layers:
            for i in range(len(layer.params)):
                self.params.append(layer.params[i])
                self.grads.append(layer.grads[i])
                
    def forward(self, xs, ts):
        out = xs
        for layer in self.layers:
            out = layer.forward(out)
        self.logit = out

        loss = self.last_layer.forward(out, ts)
        
        return loss
    
    def backward(self):
        dout = self.last_layer.backward()
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
            
        return dout
    
    def reset_state(self):
        self.layers[1].reset_state()

In [8]:
import numpy as np
import matplotlib.pyplot as plt
from dataset import ptb
from optimizer import SGD



batch_size = 10
wordvec_size = 100
hidden_size = 100
time_size = 5
lr = 0.1
max_epoch = 100

corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus = corpus[:1000]
#vocab_size = len(word_to_id)
vocab_size = int(max(corpus) + 1)

xs = corpus[:-1]
ts = corpus[1:]

def create_batches(xs, ts, batch_size, time_size):
    data_size = len(xs)
    jump = data_size // batch_size
    offsets = [i * jump for i in range(batch_size)]
    batch_x = np.zeros((batch_size, time_size), dtype=np.int32)
    batch_t = np.zeros((batch_size, time_size), dtype=np.int32)

    time_idx = 0
    for t in range(time_size):
        for i, offset in enumerate(offsets):
            batch_x[i, t] = xs[(offset + time_idx) % data_size]
            batch_t[i, t] = ts[(offset + time_idx) % data_size]
        time_idx += 1
    return batch_x, batch_t

model = SimpleRNNLM(wordvec_size, hidden_size, vocab_size)
optimizer = SGD(lr)

loss_list = []
data_size = len(xs)
jump = data_size // batch_size
offsets = [i * jump for i in range(batch_size)]

for epoch in range(max_epoch):
    total_loss = 0
    loss_count = 0
    time_idx = 0

    for _ in range(data_size // (batch_size * time_size)):
        batch_x = np.zeros((batch_size, time_size), dtype=np.int32)
        batch_t = np.zeros((batch_size, time_size), dtype=np.int32)
        for t in range(time_size):
            for i, offset in enumerate(offsets):
                idx = (offset + time_idx) % data_size
                batch_x[i, t] = xs[idx]
                batch_t[i, t] = ts[idx]
            time_idx += 1

        loss = model.forward(batch_x, batch_t)
        model.backward()
        for param, grad in zip(model.params, model.grads):
            param -= lr * grad

        total_loss += loss
        loss_count += 1

    avg_loss = total_loss / loss_count
    print(f"| epoch {epoch + 1} | loss {avg_loss:.4f}")
    loss_list.append(avg_loss)

# 시각화
plt.plot(loss_list)
plt.xlabel("epoch")
plt.ylabel("loss")
plt.title("SimpleRNNLM Loss")
plt.grid()
plt.show()

| epoch 1 | loss 5.9793
| epoch 2 | loss 5.8982
| epoch 3 | loss 5.8440
| epoch 4 | loss 5.8005
| epoch 5 | loss 5.7635
| epoch 6 | loss 5.7311
| epoch 7 | loss 5.7024
| epoch 8 | loss 5.6765
| epoch 9 | loss 5.6530
| epoch 10 | loss 5.6316
| epoch 11 | loss 5.6119
| epoch 12 | loss 5.5938
| epoch 13 | loss 5.5772
| epoch 14 | loss 5.5619
| epoch 15 | loss 5.5478
| epoch 16 | loss 5.5348
| epoch 17 | loss 5.5228
| epoch 18 | loss 5.5117
| epoch 19 | loss 5.5014
| epoch 20 | loss 5.4919
| epoch 21 | loss 5.4831
| epoch 22 | loss 5.4749
| epoch 23 | loss 5.4673
| epoch 24 | loss 5.4601
| epoch 25 | loss 5.4534
| epoch 26 | loss 5.4472
| epoch 27 | loss 5.4413
| epoch 28 | loss 5.4357
| epoch 29 | loss 5.4305
| epoch 30 | loss 5.4256
| epoch 31 | loss 5.4209
| epoch 32 | loss 5.4165
| epoch 33 | loss 5.4124
| epoch 34 | loss 5.4084
| epoch 35 | loss 5.4047
| epoch 36 | loss 5.4011
| epoch 37 | loss 5.3978
| epoch 38 | loss 5.3946
| epoch 39 | loss 5.3916
| epoch 40 | loss 5.3888
| epoch 4

KeyboardInterrupt: 

In [None]:
corpus, word_to_id, id_to_word = ptb.load_data('train')
len(word_to_id)

10000

In [None]:
print("W_embed shape:", model.W_embed.shape)
print("max index in batch_x:", np.max(batch_x))
print("max index in batch_t:", np.max(batch_t))

W_embed shape: (10000, 100)
max index in batch_x: 386
max index in batch_t: 387


In [None]:
hs.transpose(1, 0, 2)

array([[[1, 2, 3, 4],
        [1, 2, 3, 4],
        [1, 2, 3, 4]],

       [[5, 6, 7, 8],
        [5, 6, 7, 8],
        [5, 6, 7, 8]]])

In [None]:
W = np.array([[2, 3, 4], [1, 2, 3]])
params = [W]

In [None]:
grads = np.zeros_like(W)
dW = grads

In [None]:
ws = np.array([[3, 2, 1, 5, 6, 8], [2, 1, 3, 5, 6, 3]])
ws[:, 0]

array([3, 2])

In [None]:
N = 3
T = 4
D = 5

ws = np.arange(3*4*5).reshape(4, 3, 5) # T x N x D
ws.transpose(1, 0, 2)

array([[[ 0,  1,  2,  3,  4],
        [ 5,  6,  7,  8,  9],
        [10, 11, 12, 13, 14]],

       [[15, 16, 17, 18, 19],
        [20, 21, 22, 23, 24],
        [25, 26, 27, 28, 29]],

       [[30, 31, 32, 33, 34],
        [35, 36, 37, 38, 39],
        [40, 41, 42, 43, 44]],

       [[45, 46, 47, 48, 49],
        [50, 51, 52, 53, 54],
        [55, 56, 57, 58, 59]]])

In [None]:
x = np.array([[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]])
W = np.array([[2, 2, 2, 2], [1, 1, 1, 1], [3, 3, 3, 3]])
np.dot(x, W)

array([[[ 6,  6,  6,  6],
        [12, 12, 12, 12]],

       [[18, 18, 18, 18],
        [24, 24, 24, 24]]])

In [None]:
W_embed = (0.01 * np.random.randn(20, 10))
W_embed

array([[ 4.66157558e-03,  6.40767763e-03, -1.53851598e-03,
        -7.62095714e-03, -3.77932556e-03,  3.95013162e-03,
        -5.06714142e-03,  8.78915853e-03,  4.12920235e-03,
        -3.44054253e-03],
       [-1.62109197e-02,  3.86510595e-03, -1.13390564e-02,
        -1.19351633e-02, -1.33026933e-02, -1.16146051e-02,
        -1.25979175e-02,  9.84825386e-03, -2.79067706e-02,
         6.10069767e-03],
       [ 1.36689429e-02, -1.27147275e-02, -6.11480173e-03,
        -4.54789145e-03, -5.93886995e-03,  5.12244930e-03,
         7.61207071e-03, -3.11376572e-03, -1.79358873e-02,
        -1.97138611e-02],
       [-9.24966056e-03, -1.32202243e-02, -1.17357047e-02,
         9.48632385e-03,  1.98334097e-03, -4.93737094e-03,
         8.23412158e-03, -4.11562310e-03,  1.25179984e-02,
         1.24668987e-02],
       [ 5.31447480e-04, -2.63412201e-02,  1.22413155e-02,
        -4.57614858e-03,  1.15673417e-02, -1.08896187e-02,
         4.85770222e-03,  2.40333865e-03, -9.06141544e-03,
        -8.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from dataset import ptb
from optimizer import SGD
from layer import TimeEmbedding, TimeAffine, TimeSoftmaxWithLoss, TimeRNN

class SimpleRNNLM:
    def __init__(self, embedding_size, hidden_size, vocab_size):
        self.D = embedding_size
        self.H = hidden_size
        self.V = vocab_size
        self.params = []
        self.grads = []
        self.logit = None
        
        self.W_embed = 0.01 * np.random.randn(self.V, self.D)
        
        self.W_x_rnn = (1/np.sqrt(self.D)) * np.random.randn(self.D, self.H)
        self.W_h_rnn = (1/np.sqrt(self.H)) * np.random.randn(self.H, self.H)
        self.b_rnn = np.zeros(self.H, dtype='float')
        
        self.W_affine = (1/np.sqrt(self.H)) * np.random.randn(self.H, self.V)
        self.b_affine = np.zeros(self.V, dtype='float')
        
        self.layers = []
        self.layers.append(TimeEmbedding(self.W_embed))
        self.layers.append(TimeRNN(self.W_x_rnn, self.W_h_rnn, self.b_rnn))
        self.layers.append(TimeAffine(self.W_affine, self.b_affine))
        self.last_layer = TimeSoftmaxWithLoss()
        
        for layer in self.layers:
            for i in range(len(layer.params)):
                self.params.append(layer.params[i])
                self.grads.append(layer.grads[i])
                
    def forward(self, xs, ts):
        out = xs
        for layer in self.layers:
            out = layer.forward(out)
        self.logit = out

        loss = self.last_layer.forward(out, ts)
        
        return loss
    
    def backward(self):
        dout = self.last_layer.backward()
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
            
        return dout
    
    def reset_state(self):
        self.layers[1].reset_state()
        
batch_size = 10
wordvec_size = 100
hidden_size = 100
time_size = 5
lr = 0.1
max_epoch = 100

corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus = corpus[:1000]
vocab_size = len(word_to_id)
#vocab_size = int(max(corpus) + 1)

xs = corpus[:-1]
ts = corpus[1:]

def create_batches(xs, ts, batch_size, time_size):
    data_size = len(xs)
    jump = data_size // batch_size
    offsets = [i * jump for i in range(batch_size)]
    batch_x = np.zeros((batch_size, time_size), dtype=np.int32)
    batch_t = np.zeros((batch_size, time_size), dtype=np.int32)

    time_idx = 0
    for t in range(time_size):
        for i, offset in enumerate(offsets):
            batch_x[i, t] = xs[(offset + time_idx) % data_size]
            batch_t[i, t] = ts[(offset + time_idx) % data_size]
        time_idx += 1
    return batch_x, batch_t

model = SimpleRNNLM(wordvec_size, hidden_size, vocab_size)
optimizer = SGD(lr)

loss_list = []
data_size = len(xs)
jump = data_size // batch_size
offsets = [i * jump for i in range(batch_size)]

for epoch in range(max_epoch):
    total_loss = 0
    loss_count = 0
    time_idx = 0

    for _ in range(data_size // (batch_size * time_size)):
        batch_x = np.zeros((batch_size, time_size), dtype=np.int32)
        batch_t = np.zeros((batch_size, time_size), dtype=np.int32)
        for t in range(time_size):
            for i, offset in enumerate(offsets):
                idx = (offset + time_idx) % data_size
                batch_x[i, t] = xs[idx]
                batch_t[i, t] = ts[idx]
            time_idx += 1

        loss = model.forward(batch_x, batch_t)
        model.backward()
        for param, grad in zip(model.params, model.grads):
            param -= lr * grad

        total_loss += loss
        loss_count += 1

    avg_loss = total_loss / loss_count
    print(f"| epoch {epoch + 1} | loss {avg_loss:.4f}")
    loss_list.append(avg_loss)

# 시각화
plt.plot(loss_list)
plt.xlabel("epoch")
plt.ylabel("loss")
plt.title("SimpleRNNLM Loss")
plt.grid()
plt.show()

| epoch 1 | loss 9.1397
| epoch 2 | loss 8.7844
| epoch 3 | loss 8.1489
| epoch 4 | loss 7.6512
| epoch 5 | loss 7.3881
| epoch 6 | loss 7.1942
| epoch 7 | loss 7.0239
| epoch 8 | loss 6.8715
| epoch 9 | loss 6.7345
| epoch 10 | loss 6.6123
| epoch 11 | loss 6.5043


KeyboardInterrupt: 