# In [1]:
# Vocabulary: each word maps to an integer index, plus the reverse lookup.
word_to_index = dict(zip(("ahmed", "wael", "mahrous", "ali"), range(4)))
index_to_word = {index: word for word, index in word_to_index.items()}
vocab_size = len(word_to_index)

# Single training example: the input sequence of word indices and the
# index of the word that should be predicted after it.
X = [word_to_index[word] for word in ("ahmed", "wael", "mahrous")]
Y = word_to_index["ali"]

# Network dimensions (inputs and outputs are one-hot over the vocabulary)
# and the gradient-descent step size.
hidden_size = 4
input_size = output_size = vocab_size
learning_rate = 0.1

def init_matrix(rows, cols):
    """Return a ``rows`` x ``cols`` list-of-lists with every entry set to 0.1.

    Each row is a separate list object, so rows can be mutated independently.
    Note that every entry starts with the same value.
    """
    matrix = []
    for _ in range(rows):
        matrix.append([0.1] * cols)
    return matrix

# Weight matrices, laid out as (output units) x (input units).
# mat_vec_mul(W, v) takes the dot product of each ROW of W with v, so the
# number of rows must equal the size of the layer being produced and the
# number of columns must equal the size of the vector being consumed.
Wx = init_matrix(hidden_size, input_size)   # input  -> hidden
Wh = init_matrix(hidden_size, hidden_size)  # hidden -> hidden (recurrent)
Wy = init_matrix(output_size, hidden_size)  # hidden -> output

def one_hot(index, size):
    """Return a list of ``size`` zeros with a single 1 at position ``index``.

    ``index`` follows normal list indexing: negative values count from the
    end, and an out-of-range value raises ``IndexError``.
    """
    encoded = [0 for _ in range(size)]
    encoded[index] = 1
    return encoded

def mat_vec_mul(mat, vec):
    """Multiply matrix ``mat`` (a list of rows) by vector ``vec``.

    Returns one value per row of ``mat``: the dot product of that row with
    ``vec``. Pairs are formed with ``zip``, so if a row and ``vec`` differ in
    length the extra elements of the longer one are silently ignored.
    """
    products = []
    for row in mat:
        products.append(sum(a * b for a, b in zip(row, vec)))
    return products

def vec_add(a, b):
    """Return the element-wise sum of ``a`` and ``b``.

    Elements are paired with ``zip``, so the result is as long as the
    shorter of the two inputs.
    """
    total = []
    for left, right in zip(a, b):
        total.append(left + right)
    return total

def tanh(x):
    """Apply the hyperbolic tangent to every element of ``x``.

    Args:
        x: iterable of numbers.

    Returns:
        A new list with ``tanh`` of each element, in the same order.

    Uses ``math.tanh`` so the result is the true hyperbolic tangent (not an
    approximation built on a truncated value of e) and saturates cleanly to
    -1.0 / 1.0 for large magnitudes instead of overflowing.
    """
    import math  # local import: the script has no import header

    return [math.tanh(value) for value in x]

def tanh_derivative(x):
    """Return ``1 - v ** 2`` for every element ``v`` of ``x``.

    This equals the derivative of tanh when ``x`` already holds tanh
    *outputs* (activations), not pre-activation values.
    """
    slopes = []
    for activation in x:
        slopes.append(1 - activation ** 2)
    return slopes

def softmax(x):
    """Convert a list of scores into a probability distribution.

    Args:
        x: non-empty list of numbers (logits).

    Returns:
        A list of the same length whose entries are positive and sum to 1.

    Raises:
        ValueError: if ``x`` is empty (raised by ``max``).
    """
    import math  # local import: the script has no import header

    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps every exponent <= 0, so exp() cannot overflow.
    max_x = max(x)
    exp_x = [math.exp(score - max_x) for score in x]
    sum_exp = sum(exp_x)
    return [value / sum_exp for value in exp_x]

import math  # needed for the cross-entropy loss; the script has no import header

# Train on the single (X -> Y) example with backpropagation through time.
#
# Layout convention: mat_vec_mul(W, v) dots each ROW of W with v, so W[i][j]
# is the weight from input unit j to output unit i. Every weight gradient
# below therefore has the form dW[i][j] = delta_out[i] * activation_in[j],
# and errors are pushed backwards through the COLUMNS of W.
for epoch in range(1000):
    # ---- Forward pass ----
    # h[0] is the all-zero initial state; h[t + 1] is the state after X[t].
    h = [[0] * hidden_size]
    x_seq = []
    for t in X:
        x_t = one_hot(t, input_size)
        x_seq.append(x_t)
        xh = mat_vec_mul(Wx, x_t)
        hh = mat_vec_mul(Wh, h[-1])
        h.append(tanh(vec_add(xh, hh)))

    y_pred = mat_vec_mul(Wy, h[-1])
    y_prob = softmax(y_pred)
    # Cross-entropy of the target word; the clamp avoids log(0) if the
    # target probability underflows.
    loss = -math.log(max(y_prob[Y], 1e-12))

    # ---- Backward pass ----
    # Gradient of softmax + cross-entropy w.r.t. the logits: p - one_hot(Y).
    dy = y_prob[:]
    dy[Y] -= 1

    # Output layer: y[i] = sum_j Wy[i][j] * h_last[j].
    dWy = [[dy_i * h_j for h_j in h[-1]] for dy_i in dy]
    # Accumulators shaped like the matrices they update.
    dWh = [[0.0] * len(row) for row in Wh]
    dWx = [[0.0] * len(row) for row in Wx]

    # Error reaching the last hidden state: Wy^T * dy.
    dh = [
        sum(Wy[i][j] * dy[i] for i in range(len(dy)))
        for j in range(hidden_size)
    ]

    for t in reversed(range(len(X))):
        # Push the error through the tanh non-linearity (computed once per step).
        slope = tanh_derivative(h[t + 1])
        dh_raw = [grad * s for grad, s in zip(dh, slope)]

        for i, delta in enumerate(dh_raw):
            # h[t] is the state that fed step t; x_seq[t] is its input.
            for j, h_prev_j in enumerate(h[t]):
                dWh[i][j] += delta * h_prev_j
            for j, x_j in enumerate(x_seq[t]):
                dWx[i][j] += delta * x_j

        # Error for the previous hidden state: Wh^T * dh_raw.
        dh = [
            sum(Wh[j][i] * dh_raw[j] for j in range(hidden_size))
            for i in range(hidden_size)
        ]

    # ---- Gradient-descent update (in place) ----
    for weights, grads in ((Wx, dWx), (Wh, dWh), (Wy, dWy)):
        for w_row, g_row in zip(weights, grads):
            for j, grad in enumerate(g_row):
                w_row[j] -= learning_rate * grad

# Inference: replay the input sequence through the trained network, starting
# from an all-zero hidden state, then report the most probable next word.
h_prev = [0 for _ in range(hidden_size)]
for t in X:
    h_prev = tanh(
        vec_add(
            mat_vec_mul(Wx, one_hot(t, input_size)),
            mat_vec_mul(Wh, h_prev),
        )
    )

y = mat_vec_mul(Wy, h_prev)
y_prob = softmax(y)
# Index of the largest probability (the first one wins on ties).
predicted_index = max(range(len(y_prob)), key=y_prob.__getitem__)
print("Predicted word:", index_to_word[predicted_index])

# Output: Predicted word: ali
