In [1]:
import torch 
import os 
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
# Plain-text corpus for the character-level experiments below.
# NOTE(review): this is an absolute path on one machine; it has to be adjusted to run elsewhere.
file_path = '/Users/baxtiyorbekmurodov/Desktop/math2LLM/data/ikki_eshik_orasi.txt'

# Read the whole file into memory as a single UTF-8 decoded string.
with open(file_path, mode='r', encoding='utf-8') as f:
    text = f.read()

In [3]:
# Character-level vocabulary: every distinct character of the text, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Print the vocabulary size on one line and the characters on the next.
print(vocab_size, chars, sep='\n')

116
['\n', '\x0c', ' ', '!', '"', '#', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\\', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '}', '~', '¡', '£', '§', '«', '\xad', '°', '»', '¿', 'É', 'Ñ', 'Ó', 'é', 'í', 'î', 'ñ', 'ó', 'ú', 'ÿ', 'В', 'о', '—', '‘', '’', '„', '•', '™', '■']


In [4]:
# Lookup tables between characters and integer ids; ids follow the order of `chars`.
stoi = dict(zip(chars, range(len(chars))))
itoi = dict(enumerate(chars))


def encode(text):
    """Return the list of integer ids for the characters of `text`, looked up in `stoi`."""
    return [stoi[c] for c in text]


def decode(tokens):
    """Return the list of characters for the ids in `tokens`, looked up in `itoi`.

    NOTE(review): the result is a list of single characters, not a joined
    string, so decode(encode(s)) is not equal to s -- confirm this is intended.
    """
    return [itoi[token] for token in tokens]

In [5]:
# Encode the whole text and keep the ids in a single 1-D int64 tensor.
data = torch.tensor(encode(text), dtype=torch.int64)

In [6]:
# Split the data: the first 90% of the tokens are for training, the remainder for validation.
n = len(data)
split_size = int(0.9 * n)
train_data, val_data = data[:split_size], data[split_size:]

In [7]:
# Fraction of all tokens in each split: training first, then validation.
for part in (train_data, val_data):
    print(len(part) / n)

0.8999992746387097
0.10000072536129037


In [8]:
block_size = 8   # number of consecutive tokens in each sampled window
batch_size = 32  # number of windows sampled per call


def get_batch(split: str):
    """Sample a random batch of input windows and their shifted-by-one targets.

    Args:
        split: 'train' reads from `train_data`; any other value reads from
            `val_data`.

    Returns:
        A pair `(x, y)`. Each holds `batch_size` windows of `block_size`
        consecutive tokens starting at random offsets; `y` is the same set of
        windows as `x`, moved one position later in the data.
    """
    source = val_data if split != 'train' else train_data
    starts = torch.randint(len(source) - block_size, (batch_size, ))
    # Broadcast every start offset against 0..block_size-1 so all windows are
    # gathered with a single indexing operation.
    window = starts.unsqueeze(1) + torch.arange(block_size)
    x = source[window]
    y = source[window + 1]

    return x, y

### Mathematical trick of Self-Attention 

In [9]:
# Fixed seed so the toy tensor below is the same on every run.
torch.manual_seed(1337)
B, T, C = 4, 8, 2

# Small random tensor of shape (B, T, C) used to illustrate the averaging trick.
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

In [10]:
# Running average computed with explicit loops:
# xbow[b, t] is the mean of x[b, 0], ..., x[b, t].
xbow = torch.zeros(B, T, C)

for b in range(B):
    for t in range(T):
        xbow[b, t] = x[b, :t+1].mean(dim=0)

In [11]:
# Display the first batch element of x.
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [12]:
# Display the running averages for the first batch element; row t is the mean of rows 0..t of x[0].
xbow[0]

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [13]:
torch.manual_seed(42)
# Lower-triangular ones, with each row divided by its own sum:
# row i puts equal weight on columns 0..i and zero weight on the rest.
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)
b = torch.randint(0, 10, (3, 2)).float()
# Multiplying by `a` makes row i of `c` the average of rows 0..i of `b`.
c = torch.matmul(a, b)

print("a:\n", a)
print("b:\n", b)
print("c:\n", c)

a:
 tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b:
 tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c:
 tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [21]:
torch.manual_seed(1337)
B, T, C = 4, 32, 8

x = torch.randn(B, T, C)

# A single attention head: three bias-free linear maps applied to the same input.
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)    # (B, T, 16)
q = query(x)  # (B, T, 16)
v = value(x)  # (B, T, 16)

# Dot product of every query with every key: (B, T, 16) @ (B, 16, T) --> (B, T, T).
# NOTE(review): the scores are not divided by sqrt(head_size) before the softmax -- confirm this is intended.
wei = torch.matmul(q, k.transpose(1, 2))

# Positions above the diagonal are set to -inf, so after the softmax row t
# has non-zero weight only on columns 0..t.
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = wei.softmax(dim=-1)

# Each output row is the weighted sum of the value vectors.
out = wei @ v

out

tensor([[[-3.2378e-01,  4.0089e-01,  6.7271e-02,  ...,  8.3396e-02,
           3.2585e-01, -5.5832e-01],
         [-4.2887e-01,  1.4233e-01,  3.4352e-02,  ...,  1.1851e-01,
           1.6278e-01, -5.2400e-01],
         [-4.1031e-01,  1.5201e-01,  1.9825e-02,  ...,  8.3811e-02,
           1.6558e-01, -5.3161e-01],
         ...,
         [-1.2865e-01,  8.0855e-02, -1.6523e-01,  ...,  2.8788e-01,
          -4.5487e-01, -3.1508e-02],
         [ 6.8225e-02,  1.4908e-01, -1.0837e-01,  ...,  1.1809e-01,
          -6.5303e-02,  8.6164e-02],
         [-2.8073e-02,  2.4455e-01, -1.1461e-01,  ...,  2.5639e-01,
          -2.6054e-03,  7.0725e-02]],

        [[ 1.3874e-01, -5.3388e-01,  5.2213e-01,  ..., -9.3459e-01,
           5.4019e-01, -4.2354e-01],
         [-2.0184e-02, -2.7022e-01, -1.7890e-02,  ...,  3.0501e-01,
          -3.5703e-01,  5.7978e-01],
         [ 7.5740e-02, -3.3744e-01,  3.4887e-01,  ..., -4.4583e-01,
           2.7457e-01, -1.8552e-01],
         ...,
         [-1.3652e-01,  1

In [22]:
# Shape of the attention output computed in the previous cell.
out.shape

torch.Size([4, 32, 16])

In [34]:
from bpe import BytePairEncoding

In [53]:
# Second corpus, used to fit the byte-pair-encoding tokenizer.
# NOTE(review): this is an absolute path on one machine; it has to be adjusted to run elsewhere.
file_path = '/Users/baxtiyorbekmurodov/Desktop/math2LLM/data/uzbek_input.txt'

# Decode explicitly as UTF-8, as the first corpus load does. Without an explicit
# encoding, open() uses the platform's locale encoding, and this text contains
# non-ASCII characters that would then be mis-decoded or raise an error.
with open(file_path, "r", encoding="utf-8") as file:
    corpus = file.read()


# Character inventory of this corpus; this rebinds `chars` and `vocab_size`.
chars = sorted(list(set(corpus)))
vocab_size = len(chars)

# NOTE(review): the meaning of the 256 argument is defined in the `bpe` module,
# which is not visible here -- confirm it against that module.
bpe = BytePairEncoding(256)
bpe.fit(corpus)


In [None]:
# Preview only the first few lines; displaying every line of the corpus
# floods the cell output and bloats the saved notebook.
corpus.splitlines()[:8]

['IKKI  ESHIK',
 'ORASI',
 '',
 'Oʻtkan kunlar romani xalq tomonidan yaxshi kutib olingan biroq shoʻro hukumati ideologlari, kommunistik partiyaga moyil kishilar, shoʻroviy adabiy tanqidchilar tomonidan qattiq qoralangan. Tanqidlarning asosini asarning markscha-lenincha gʻoyalarga qarshi ekanligi, asarda burjua sinfiy kurashi, boylar tomonidan kambagʻallarni kamsitish sahnalari koʻrsatilmaganligi, oʻzbekning oʻtmishi bezab koʻrsatilganligi va bu bezaklar sovetlar qoralayotgan inqilobgacha boʻlgan davrni odamlar koʻziga juda chiroyli qilib koʻrsatishi kabi iddaolar tashkil etadi.',
 '',
 '1928-yilda rus tanqidchisi Mixail Sheverdin „Oʻtkan kunlar“ asari haqidagi tanqidiy maqolasiga xulosa oʻrnida shunday yozadi:',
 '',
 "Romanni rus tilida chop etish kerakmi? – Bizningcha, kerak. Masala – qanday chop etishda. Uni ekzotik tipdagi qiziqarli asar sifatida chop etishga qarshi kurashmoq kerak. U bizga oʻzbeklarning turmushi, qiziqishlari va oʻsib kelayotgan oʻzbek adabiyoti bilan tanishish v

In [39]:
# Tokenize the corpus with the fitted BPE model and keep the ids as an int64 tensor.
# This rebinds `data`; NOTE(review): `train_data` / `val_data` were sliced from the
# earlier `data` and are not recomputed in the cells shown here.
data = torch.tensor(bpe.encode(corpus), dtype=torch.int64)

In [42]:
# Print the first 32 token ids.
print(data[:32])

tensor([ 73,  75,  75,  73, 256,  69,  83,  72,  73,  75,  10,  79,  82,  65,
         83,  73,  10,  10,  79, 202, 187, 116, 107, 258,  32, 107, 117, 110,
        266, 114,  32, 114])


In [None]:




# Print the current vocabulary size on one line and the character list on the next.
print(vocab_size, chars, sep='\n')

121
['\n', '\x0c', ' ', '!', '"', '#', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '}', '~', '¡', '£', '§', '«', '\xad', '°', '»', '¿', 'É', 'Ñ', 'Ó', 'é', 'í', 'î', 'ñ', 'ó', 'ú', 'ÿ', 'ʻ', 'В', 'о', '–', '—', '‘', '’', '“', '„', '•', '™', '■']
