In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import regex as re
import warnings
import swifter
from torch.utils.data import TensorDataset, DataLoader, random_split
from tokenizers import Tokenizer, models, pre_tokenizers, trainers



# PARAMETERS OF MODEL
block_size = 32
n_embd = 128
n_head = 4
n_layer = 3
dropout = 0.2
batch_size = 64
learning_rate = 2e-4
vocab_size = 30_000

max_iter = 5
n_classes = 3

# PADDING
pad_token = '<pad>'
unk_token = "<unk>"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using:", device)

Using: cuda


In [None]:
# DATA NORMALIZATION
def get_normalized_customer_reviews():
    uzum_df = pd.read_parquet("./uzum_dataset.parquet", engine='pyarrow')
    rating_map = {
    'poor' : 0,
    'very poor' : 0,
    'fair' : 1,
    'good' : 2,
    'excellent' : 2
    }

    uzum_df["rank"] = uzum_df["rating"].map(rating_map)
    uzum_df.drop('rating', axis=1, inplace=True)

    latin = r"\p{Latin}"
    cyrillic = r"\p{Cyrillic}"
    digits = r"\p{Number}"


    allowed_re = re.compile(fr"(?:{latin}|{cyrillic}|{digits}|\s)")

    final_clean = {'ø','ʔ','ʕ','ʖ','ᴥ','ᵕ','⅚','ᴗ'}

    latin_map = {
    "à": "a", "á": "a", "â": "a", "ã": "a",
    "ç": "c",
    "è": "e", "é": "e", "ë": "e",
    "ì": "i", "í": "i",
    "ñ": "n",
    "ò": "o", "ó": "o", "ô": "o", "õ": "o", "ö": "o",
    "ù": "u", "ú": "u", "û": "u", "ü": "u",
    "ý": "y", "ÿ": "y",
    "ĝ": "g'", "ğ": "g'", "ġ": "g'", "ģ": "g'",
    "ĥ": "h",
    "ı": "i",
    "ĵ": "j",
    "ķ": "k",
    "ĺ": "l", "ļ": "l",
    "ń": "n", "ň": "n",
    "ō": "o'", "ŏ": "o'", "ő": "o'",
    "ŕ": "r",
    "ś": "s", "ş": "sh",
    "ũ": "u", "ū": "u", "ů": "u",
    "ź": "z", "ž": "j",
    "ǒ": "o'", "ǫ": "q",
    "ǵ": "g'",
    "ɓ": "b",
    "ə": "e",
    '²': '2',
    '³': '3',
    '¹': '1',
    'ď': 'd',
    'ɢ': 'g',
    'ɪ': 'i',
    'ɴ': 'n',
    'ʀ': 'r',
    'ʏ': 'y',
    'ʜ': 'h',
    'ʟ': 'l',
    'ө': 'o',
    'ᴀ': 'a',
    'ᴄ': 'c',
    'ᴅ': 'd',
    'ᴇ': 'e',
    'ᴊ': 'j',
    'ᴋ': 'k',
    'ᴍ': 'm',
    'ᴏ': 'o',
    'ᴘ': 'p',
    'ᴛ': 't',
    'ᴜ': 'u',
    '⁰': '0',
    '⁴': '4',
    '⁵': '5'
}


    def normalize_text(text: str) -> str:
        out = []
        for ch in text:
            # skips unnessary ones
            if ch in final_clean:
               continue

            # keeps only necessary chars
            if not allowed_re.fullmatch(ch):
               continue

            # maps final
            out.append(latin_map.get(ch, ch))

        return "".join(out)

    uzum_df["review_text"] = uzum_df["normalized_review_text"].astype(str).swifter.apply(normalize_text)
    uzum_df['len'] = uzum_df['review_text'].astype(str).str.len()
    uzum_df = uzum_df[uzum_df['len'] > 0].drop(['len', 'normalized_review_text'], axis=1)[['review_text', 'rank']]

    return uzum_df



def get_tokenized_data():

  df = get_normalized_customer_reviews()

  tokenizer = Tokenizer(models.BPE())
  tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()


  trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=[pad_token, unk_token]
    )

  tokenizer.train_from_iterator(df["review_text"].astype(str).tolist(), trainer)

  PAD_ID = tokenizer.token_to_id(pad_token)
  UNK_ID = tokenizer.token_to_id(unk_token)

  def encode_and_padding(text):
    ids = tokenizer.encode(text).ids[:block_size]
    ids += [PAD_ID] * (block_size - len(ids))
    return ids


  X_padded = [encode_and_padding(t) for t in df["review_text"]]

  data = torch.tensor(X_padded, dtype=torch.long)
  targets = torch.tensor(df['rank'].values, dtype=torch.long)

  return data, targets, tokenizer


In [None]:
# TRANSFORMER MODEL
class SelfAttention(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape

        k = self.key(x)           # (B,T,hs)
        q = self.query(x)         # (B,T,hs)

        wei = q @ k.transpose(-2, -1) * (k.size(-1)**-0.5)
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        v = self.value(x)
        out = wei @ v

        return out


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList(SelfAttention(head_size) for _ in range(num_heads))
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4*n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffw = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffw(self.ln2(x))
        return x


class EncoderTransformerClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.classifier = nn.Linear(n_embd, n_classes)

    def forward(self, idx, targets=None):
        idx = idx.to(self.token_embedding_table.weight.device)
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(pos)
        x = tok_emb + pos_emb

        x = self.blocks(x)
        x = self.ln_f(x)
        lengths = (idx != 0).sum(dim=1) - 1
        x_cls = x[torch.arange(B), lengths]

        logits = self.classifier(x_cls)

        if targets is None:
            return logits

        loss = F.cross_entropy(logits, targets)
        return logits, loss


    def predict(self, text, tokenizer):
        self.eval()

        ids = tokenizer.encode(text).ids
        if len(ids) < block_size:
            ids += [0] * (block_size - len(ids))
        ids = ids[:block_size]

        x = torch.tensor([ids], dtype=torch.long)
        x = x.to(next(self.parameters()).device)

        with torch.no_grad():
            logits = self(x)
            probs = F.softmax(logits, dim=1)

        return probs.cpu().numpy()

In [None]:
# getting data in proper format
data, targets, tokenizer = get_tokenized_data()

train_size = int(len(data) * 0.9)
val_size = len(data) - train_size

dataset = TensorDataset(data, targets)
train_data, val_data = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_data, batch_size=batch_size, shuffle=False)

# building the model
model = EncoderTransformerClassifier().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


In [None]:
# training loop
for epoch in range(max_iter):

    # train
    model.train()
    train_loss = 0.0

    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        logits, loss = model(xb, yb)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.to(device)

            logits, loss = model(xb, yb)
            val_loss += loss.item()

            preds = logits.argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)

    val_loss /= len(val_loader)
    val_acc = correct / total

    print(f"epoch [{epoch+1}/{max_iter}] "
      f"train loss: {train_loss:.4f} "
      f"val loss: {val_loss:.4f} "
      f"val acc: {val_acc:.4f}")

Pandas Apply:   0%|          | 0/352151 [00:00<?, ?it/s]

epoch [1/5] train loss: 0.3890 val loss: 0.3489 val acc: 0.8811
epoch [2/5] train loss: 0.3278 val loss: 0.3296 val acc: 0.8881
epoch [3/5] train loss: 0.3086 val loss: 0.3241 val acc: 0.8919
epoch [4/5] train loss: 0.2943 val loss: 0.3212 val acc: 0.8932
epoch [5/5] train loss: 0.2832 val loss: 0.3214 val acc: 0.8947


In [8]:
# testing 1
text = "oyimga ko'rsattim bo'lar ekan, yoqmasa kerak deb o'ylagandim"
probs = model.predict(text, tokenizer)[0].tolist()
out = np.argmax(probs)
print(probs)
print('postive' if out == 2 else 'neutral' if out == 1 else 'negative')

[0.14750970900058746, 0.06039566546678543, 0.7920945882797241]
postive


In [10]:
# testing 2
text2 = "oyimga ko'rsattim bo'lar ekan, yoqmasa kerak deb o'ylagandim, lekin o'zimga to'grisi vashe ishlashi yoqmadi"
probs2 = model.predict(text2, tokenizer)[0].tolist()
out2 = np.argmax(probs2)
print(probs2)
print('postive' if out2 == 2 else 'neutral' if out2 == 1 else 'negative')

[0.8540540933609009, 0.109282948076725, 0.036662884056568146]
negative


In [11]:
# testing 3
text3 = "telefon boshida yaxsh ishladi, keyin o'zidan o'zi ekrani ishlamiy qoldi"
probs3 = model.predict(text3, tokenizer)[0].tolist()
out3 = np.argmax(probs3)
print(probs3)
print('postive' if out3 == 2 else 'neutral' if out3 == 1 else 'negative')

[0.5298020839691162, 0.19840247929096222, 0.271795392036438]
negative


In [12]:
# testing 4
text3 = "sotuvchiga bog'lanib bomayapti, texnikasi yaxsh chiqmadi"
probs3 = model.predict(text3, tokenizer)[0].tolist()
out3 = np.argmax(probs3)
print(probs3)
print('postive' if out3 == 2 else 'neutral' if out3 == 1 else 'negative')

[0.548626720905304, 0.2190316766500473, 0.23234164714813232]
negative


In [13]:
# testing 5
text3 = "boshida yaxsh ishladi, keyin buzilib qoldi"
probs3 = model.predict(text3, tokenizer)[0].tolist()
out3 = np.argmax(probs3)
print(probs3)
print('postive' if out3 == 2 else 'neutral' if out3 == 1 else 'negative')

[0.4478980004787445, 0.2135918289422989, 0.3385101854801178]
negative


In [14]:
def count_module_params(module):
    return sum(p.numel() for p in module.parameters() if p.requires_grad)

embedding_params = count_module_params(model.token_embedding_table)
pos_embedding_params = count_module_params(model.position_embedding_table)
classifier_params = count_module_params(model.classifier)

attn_params = 0
ffw_params = 0
for block in model.blocks:
    attn_params += count_module_params(block.sa)
    ffw_params += count_module_params(block.ffw)

print(f"Embedding layer: {embedding_params}")
print(f"Positional embedding: {pos_embedding_params}")
print(f"Attention projections: {attn_params}")
print(f"Feed-forward block: {ffw_params}")
print(f"Classifier: {classifier_params}")

print(f"Total trainable parameters: {embedding_params + pos_embedding_params + attn_params + ffw_params + classifier_params}")

Embedding layer: 3840000
Positional embedding: 4096
Attention projections: 196992
Feed-forward block: 395136
Classifier: 387
Total trainable parameters: 4436611


In [15]:
# SAVING MODEL AND TOKENIZER
torch.save(model.state_dict(), "encoder_transformer_classifier.pth")
tokenizer.save("tokenizer.json")