In [4]:
import random
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Read the first column of arabic_names.csv into the list `names`.
import csv

names = []
# The encoding is given explicitly because the file holds Arabic text and the
# platform default encoding is not UTF-8 everywhere; newline='' is what the
# csv module asks for when it is handed a file object.
with open('arabic_names.csv', 'r', encoding='utf-8', newline='') as file:
    csv_reader = csv.reader(file)
    # Discard the first row (the original dropped it with names.pop(0);
    # presumably a header row). The default keeps an empty file from raising.
    next(csv_reader, None)
    for row in csv_reader:
        # csv.reader yields [] for blank lines; skip those and empty first cells
        # instead of failing on row[0] or storing an empty name.
        if row and row[0]:
            names.append(row[0])

print(f"The length of names is {len(names)}")
print(f"The first 5 names are {names[:5]}")

The length of names is 1405
The first 5 names are ['ابتسام', 'ابتهاج', 'ابتهال', 'اجتهاد', 'ازدهار']


In [6]:
# Shuffle the names in place with a fixed seed so the order is repeatable
random.seed(56)
random.shuffle(names)

# Cut points at 80% and 90% of the (shuffled) list
n1, n2 = (int(fraction * len(names)) for fraction in (0.8, 0.9))
print(f"the data set have {len(names)}, {n1=}, {n2=}")

the data set have 1405, n1=1124, n2=1264


In [7]:
# Vocabulary: '.' sits at index 0, followed by every distinct character
# found in the names, in sorted order
chars = ['.', *sorted(set(''.join(names)))]

# lookup table: character -> integer id
ctoi = {symbol: position for position, symbol in enumerate(chars)}

# inverse lookup table: integer id -> character
itoc = {position: symbol for symbol, position in ctoi.items()}

In [8]:
# preparing the dataset and split them into training, dev, and test samples.
# we need the first one to be ., ., . --> ا

# define a function build_dataset to accept list of names and output them in X, Y format
block_size = 3
def build_dataset(names):
    """Turn a list of names into (context, next-character) training pairs.

    Every name is walked character by character with a trailing '.' appended.
    For each position the previous `block_size` character ids form the input
    row and the current character id is the target. The context starts as all
    zeros, i.e. the id that `ctoi` assigns to '.'.

    Returns a pair of tensors (X, Y): X holds one context row per example,
    Y holds the matching target ids.
    """
    inputs, targets = [], []
    for word in names:
        window = [0] * block_size

        for symbol in word + '.':
            target = ctoi[symbol]
            inputs.append(window)
            targets.append(target)
            # slide the window: drop the oldest id, append the newest
            window = window[1:] + [target]

    return torch.tensor(inputs), torch.tensor(targets)

In [9]:
# Training / dev / test tensors, cut from the shuffled names at n1 and n2
(Xtr, Ytr), (Xdev, Ydev), (Xtest, Ytest) = (
    build_dataset(subset)
    for subset in (names[:n1], names[n1:n2], names[n2:])
)

In [17]:
# Let's train a deeper network

class Linear:
    """Fully connected layer computing ``x @ weight`` plus an optional bias.

    Args:
        fan_in: number of input features (rows of ``weight``).
        fan_out: number of output features (columns of ``weight``).
        bias: when False no bias tensor is created and ``self.bias`` is None.

    The most recent output is kept on ``self.out`` so it can be inspected
    after a forward pass.
    """
    def __init__(self, fan_in, fan_out, bias=True):
        # `g` is the notebook-level torch.Generator; it must be defined before
        # any Linear is constructed. Both tensors draw from it so that the
        # whole initialisation is reproducible from the generator's seed.
        # NOTE(review): the weights are plain unit-variance normals with no
        # 1/sqrt(fan_in) scaling -- confirm this is intended.
        self.weight = torch.randn((fan_in, fan_out), generator=g)
        self.bias = torch.randn(fan_out, generator=g) if bias else None
    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out
    def parameters(self):
        """Return the layer's tensors as a flat list (bias omitted when absent)."""
        # The list must contain tensors only: callers invoke tensor methods
        # such as nelement() on every entry.
        if self.bias is None:
            return [self.weight]
        return [self.weight, self.bias]


class BatchNorm1D:
    """Batch normalisation over dimension 0 of the input.

    Args:
        dim: number of features; sets the size of the scale/shift parameters
            and of the running statistics.
        eps: constant added to the variance before the square root.
        momentum: weight given to the current batch when updating the
            running statistics.

    While ``self.training`` is True the batch mean and variance are used and
    the running statistics are updated; otherwise the running statistics are
    used. The most recent output is kept on ``self.out``.
    """
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters (trained with backprop)
        self.gama = torch.ones(dim)
        self.beta = torch.zeros(dim)
        # buffers (updated as a running average, not by backprop)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        if self.training:
            xmean = x.mean(0, keepdim=True)
            # variance, not std: the normalisation below takes sqrt(xvar + eps)
            # and running_var stores a variance as well
            xvar = x.var(0, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gama * xhat + self.beta
        # update the buffers
        if self.training:
            # no_grad keeps the running statistics out of the autograd graph
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar
        return self.out

    def parameters(self):
        """Return the trainable tensors: shift (beta) and scale (gama)."""
        return [self.beta, self.gama]

class Tanh:
    """Element-wise tanh activation; holds no trainable tensors."""
    def __call__(self, x):
        # keep the activation on the instance so it can be inspected later
        activated = x.tanh()
        self.out = activated
        return activated
    def parameters(self):
        """Return an empty list: there is nothing to train."""
        return list()



n_emb = 10              # size of each character embedding vector
n_hidden = 100          # width of every hidden layer
voc_len = len(chars)    # == 38
# seeded generator shared by the initialisation and the batch sampling
g = torch.Generator().manual_seed(123456789)

# embedding table: one row of n_emb values per vocabulary entry
C = torch.randn((voc_len, n_emb), generator=g)

# Five Linear+Tanh stages followed by a Linear read-out over the vocabulary.
# Layers are created strictly in network order so the random draws keep
# their sequence.
layers = [
    Linear(n_emb * block_size, n_hidden), Tanh(),
    *[
        module
        for _depth in range(4)
        for module in (Linear(n_hidden, n_hidden), Tanh())
    ],
    Linear(n_hidden, voc_len),
]

with torch.no_grad():
    # output layer: shrink the weights so the initial predictions are less confident
    output_layer = layers[-1]
    output_layer.weight *= 0.1
    # every other Linear: multiply the weights by the 5/3 gain
    for layer in layers[:-1]:
        if isinstance(layer, Linear):
            layer.weight *= 5/3

# flat list of every trainable tensor: the embedding table, then each layer's tensors
parameters = [C, *(p for layer in layers for p in layer.parameters())]
print(sum(p.nelement() for p in parameters))
for p in parameters:
    p.requires_grad = True

47718


In [34]:
# Minibatch SGD over the training split.
# max_steps: upper bound on the number of iterations
# batch_size: number of examples sampled per step
# lossi: log10(loss) values recorded by the status block below
max_steps = 200000
batch_size = 32
lossi = []

for i in range(max_steps):
    # sample row indices (with replacement) from the training set using the seeded generator
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb, Yb = Xtr[ix], Ytr[ix]

    # forward pass
    emb = C[Xb]
    x = emb.view(emb.shape[0], -1) # concatenate the vectors
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, Yb)


    # backward pass
    # retain_grad() makes autograd keep .grad on each layer's (non-leaf) output
    # so it can be inspected after backward()
    # NOTE(review): this looks like debugging support -- confirm it is still wanted
    for layer in layers:
        layer.out.retain_grad()

    # clear gradients left over from the previous step
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    # step decay: 0.1 for the first 100000 steps, 0.01 afterwards
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters:
        p.data += - lr * p.grad


    # track status
    # NOTE(review): lossi.append sits inside this `if`, so the loss is recorded
    # only on steps where i % 10000 == 0, not on every step -- confirm intended
    if i % 10000 == 0:
        print(f"{i:7d}/{max_steps:7d}: {loss.item():.4f}")
        lossi.append(loss.log10().item())


    # NOTE(review): this unconditional break ends the loop after the first
    # iteration, so only step 0 ever runs; remove it to train for max_steps
    break


      0/ 200000: 4.6137


In [30]:
# Sample 32 training rows and look at the shape of their embedded form
ix = torch.randint(0, len(Xtr), (32,), generator=g)
Xb = Xtr[ix]

emb = C[Xb]
emb.size()

torch.Size([32, 3, 10])