In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
import regex as re
import swifter
import warnings

from sklearn.feature_extraction.text import CountVectorizer

warnings.filterwarnings("ignore")

vocab_size=30000

batch_size = 64
block_size = 80 # number of tokens in one sentence
learning_rate = 3e-4


max_iters = 5000
eval_iters = 200
eval_interval = 500


n_embd = 84
n_head = 4
n_layer = 4
dropout = 0.2



text_column = "clean_text"
target_column = "rnk"




uzum_reviews_df = pd.read_parquet("./uzum_dataset.parquet", engine='pyarrow')
uzum_reviews_df["len"] = uzum_reviews_df["normalized_review_text"].str.len()
uzum_reviews_filtered_df = uzum_reviews_df[uzum_reviews_df["len"] <= block_size]
rating_map = {
    'very poor' : 0,
    'poor' : 0,
    'fair' : 1,
    'good' : 2,
    'excellent' : 2
}

uzum_reviews_filtered_df["rnk"] = uzum_reviews_filtered_df["rating"].map(rating_map)


def normalize_uzum_reviews(df: pd.DataFrame) -> pd.DataFrame:
    """
    cleans the text based on the following criterias listed below
    :param df: pandas dataframe
    :returns: cleaned pandas dataframe
    """

    latin = r"\p{Latin}"
    cyrillic = r"\p{Cyrillic}"
    digits = r"\p{Number}"


    allowed_re = re.compile(fr"(?:{latin}|{cyrillic}|{digits}|\s)")

    final_clean = {'ø','ʔ','ʕ','ʖ','ᴥ','ᵕ','⅚','ᴗ'}

    latin_map = {
    "à": "a", "á": "a", "â": "a", "ã": "a",
    "ç": "c",
    "è": "e", "é": "e", "ë": "e",
    "ì": "i", "í": "i",
    "ñ": "n",
    "ò": "o", "ó": "o", "ô": "o", "õ": "o", "ö": "o",
    "ù": "u", "ú": "u", "û": "u", "ü": "u",
    "ý": "y", "ÿ": "y",
    "ĝ": "g'", "ğ": "g'", "ġ": "g'", "ģ": "g'",
    "ĥ": "h",
    "ı": "i",
    "ĵ": "j",
    "ķ": "k",
    "ĺ": "l", "ļ": "l",
    "ń": "n", "ň": "n",
    "ō": "o'", "ŏ": "o'", "ő": "o'",
    "ŕ": "r",
    "ś": "s", "ş": "sh",
    "ũ": "u", "ū": "u", "ů": "u",
    "ź": "z", "ž": "j",
    "ǒ": "o'", "ǫ": "q",
    "ǵ": "g'",
    "ɓ": "b",
    "ə": "e",
    '²': '2',
    '³': '3',
    '¹': '1',
    'ď': 'd',
    'ɢ': 'g',
    'ɪ': 'i',
    'ɴ': 'n',
    'ʀ': 'r',
    'ʏ': 'y',
    'ʜ': 'h',
    'ʟ': 'l',
    'ө': 'o',
    'ᴀ': 'a',
    'ᴄ': 'c',
    'ᴅ': 'd',
    'ᴇ': 'e',
    'ᴊ': 'j',
    'ᴋ': 'k',
    'ᴍ': 'm',
    'ᴏ': 'o',
    'ᴘ': 'p',
    'ᴛ': 't',
    'ᴜ': 'u',
    '⁰': '0',
    '⁴': '4',
    '⁵': '5'
}


    def normalize_text(text: str) -> str:
        out = []
        for ch in text:
        # skip unwanted characters
            if ch in final_clean:
               continue

        # keep only allowed characters (latin, cyrillic, digits, spaces)
            if not allowed_re.fullmatch(ch):
               continue

        # map special latin → uzbek letters
            out.append(latin_map.get(ch, ch))

        return "".join(out)


    df['clean_text'] = df["normalized_review_text"].astype(str).swifter.apply(normalize_text)

    return df


def get_bow_data():
    uzum_df = normalize_uzum_reviews(uzum_reviews_filtered_df)

    text = uzum_df["clean_text"]
    target = uzum_df["rnk"]

    vectorizer = CountVectorizer(max_features=5000)

    X_bow = vectorizer.fit_transform(text)

    X = torch.tensor(X_bow.toarray(), dtype=torch.float)
    y = torch.tensor(target, dtype=torch.long)

    return X, y

In [2]:
!pip install swifter

Collecting swifter
  Downloading swifter-1.4.0.tar.gz (1.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m1.1/1.2 MB[0m [31m34.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: swifter
  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Created wheel for swifter: filename=swifter-1.4.0-py3-none-any.whl size=16505 sha256=78012f03285d8fbbaad46e369b860e9b1c40e0ac697a38cf356e8e0d2efca3e3
  Stored in directory: /root/.cache/pip/wheels/d9/31/ff/ff51141a088571a9f672449e5aad5ea8bb35ca5d95ba135f30
Successfully built swifter
Installing collected packages: swifter
Successfully installed swifter-1.4.0


In [4]:
X, y = get_bow_data()

Pandas Apply:   0%|          | 0/310217 [00:00<?, ?it/s]

In [5]:
class BowModel(nn.Module):
    def __init__(self, in_feat, n_target):
        super().__init__()
        self.ly = nn.Linear(in_feat, n_target)

    def forward(self, x):
        return self.ly(x)


In [6]:
import torch
from torch.utils.data import TensorDataset, DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"

# dataset
dataset = TensorDataset(X, y)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

# model
features = X.shape[1]
n_classes = len(set(y.tolist()))

bow_model = BowModel(features, n_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(bow_model.parameters(), lr=0.001)

max_epoch = 5

for epoch in range(max_epoch):
    bow_model.train()
    total_loss = 0

    for xb, yb in loader:
        xb, yb = xb.to(device), yb.to(device)

        optimizer.zero_grad()
        outputs = bow_model(xb)

        loss = criterion(outputs, yb)
        loss.backward()

        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{max_epoch}, Loss: {total_loss:.4f}")


Epoch 1/5, Loss: 2029.3422
Epoch 2/5, Loss: 1591.2596
Epoch 3/5, Loss: 1538.6239
Epoch 4/5, Loss: 1514.0190
Epoch 5/5, Loss: 1499.8845
