In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


In [7]:
uzum_reviews_df = pd.read_parquet("./uzum_dataset.parquet", engine='pyarrow')

In [9]:
uzum_reviews_df["len"] = uzum_reviews_df["normalized_review_text"].str.len()

In [20]:
uzum_reviews_filtered_df = uzum_reviews_df[uzum_reviews_df["len"] <= 80]

In [22]:
uzum_reviews_filtered_df["rating"].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
rating,Unnamed: 1_level_1
excellent,0.739943
very poor,0.130873
good,0.051016
fair,0.043341
poor,0.034827


In [23]:
rating_map = {
    'very poor' : 1,
    'poor' : 1,
    'fair' : 2,
    'good' : 3,
    'excellent' : 3
}

uzum_reviews_filtered_df["rnk"] = uzum_reviews_filtered_df["rating"].map(rating_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uzum_reviews_filtered_df["rnk"] = uzum_reviews_filtered_df["rating"].map(rating_map)


In [25]:
uzum_reviews_filtered_df["rnk"].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
rnk,Unnamed: 1_level_1
3,0.790959
1,0.1657
2,0.043341


In [27]:
uzum_reviews_filtered_df.drop('rating', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uzum_reviews_filtered_df.drop('rating', axis=1, inplace=True)


In [31]:
text = uzum_reviews_filtered_df["normalized_review_text"].tolist()

chars = sorted(list(set("\n".join(text))))

print("Total characters: ", len(chars))

Total characters:  224


In [53]:
import regex as re

# Define allowed scripts
latin = r"\p{Script=Latin}"
cyrillic = r"\p{Script=Cyrillic}"
digits = r"\p{Number}"

# Combine into a single pattern
allowed_re = re.compile(fr"(?:{latin}|{cyrillic}|{digits})$")

# Filter characters
cleaned = [ch for ch in chars if allowed_re.match(ch)]

final_clean = [ 'ø', 'ʔ','ʕ','ʖ','ᴥ','ᵕ','⅚','ᴗ']

clean_chars = [c for c in cleaned if c not in final_clean]

In [56]:
latin_map = {
    "à": "a", "á": "a", "â": "a", "ã": "a",
    "ç": "c",
    "è": "e", "é": "e", "ë": "e",
    "ì": "i", "í": "i",
    "ñ": "n",
    "ò": "o", "ó": "o", "ô": "o", "õ": "o", "ö": "o",
    "ù": "u", "ú": "u", "û": "u", "ü": "u",
    "ý": "y", "ÿ": "y",

    "ĝ": "g'", "ğ": "g'", "ġ": "g'", "ģ": "g'",
    "ĥ": "h",
    "ı": "i",
    "ĵ": "j",
    "ķ": "k",
    "ĺ": "l", "ļ": "l",
    "ń": "n", "ň": "n",
    "ō": "o'", "ŏ": "o'", "ő": "o'",
    "ŕ": "r",
    "ś": "s", "ş": "sh",
    "ũ": "u", "ū": "u", "ů": "u",
    "ź": "z", "ž": "j",  # note: ž is pronounced like "j"
    "ǒ": "o'", "ǫ": "q",
    "ǵ": "g'",
    "ɓ": "b",    # approximate
    "ə": "e",    # Azerbaijani ə → a
    '²' : '2',
    '³' :'3',
    '¹' : '1',
     'ď' : 'd',

     'ɢ' : 'g',
 'ɪ' : 'i',
 'ɴ': 'n',
 'ʀ' : 'r',
 'ʏ' : 'y',
 'ʜ' : 'h',
 'ʟ' : 'l',
    'ө': 'o',
    'ᴀ' : 'a',
 'ᴄ' : 'c',
 'ᴅ' : 'd',
 'ᴇ' : 'e',
 'ᴊ': 'j',
 'ᴋ': 'k',
 'ᴍ' :'m',
 'ᴏ' :'o',
 'ᴘ' : 'p',
 'ᴛ' :'t',
 'ᴜ' :'u',
     '⁰' : '0',
 '⁴' : '4',
 '⁵' : '5'
}


normalized_chars = []

for c in clean_chars:
  if c in latin_map:
    normalized_chars.append(latin_map[c])
  else:
    normalized_chars.append(c)


print(len(sorted(list(set(normalized_chars)))))


54


In [71]:
import regex as re

# ---------- Allowed chars ----------
latin = r"\p{Latin}"
cyrillic = r"\p{Cyrillic}"
digits = r"\p{Number}"

# allow latin + cyrillic + digits + spaces
allowed_re = re.compile(fr"(?:{latin}|{cyrillic}|{digits}|\s)")

final_clean = {'ø','ʔ','ʕ','ʖ','ᴥ','ᵕ','⅚','ᴗ'}

latin_map = {
    "à": "a", "á": "a", "â": "a", "ã": "a",
    "ç": "c",
    "è": "e", "é": "e", "ë": "e",
    "ì": "i", "í": "i",
    "ñ": "n",
    "ò": "o", "ó": "o", "ô": "o", "õ": "o", "ö": "o",
    "ù": "u", "ú": "u", "û": "u", "ü": "u",
    "ý": "y", "ÿ": "y",
    "ĝ": "g'", "ğ": "g'", "ġ": "g'", "ģ": "g'",
    "ĥ": "h",
    "ı": "i",
    "ĵ": "j",
    "ķ": "k",
    "ĺ": "l", "ļ": "l",
    "ń": "n", "ň": "n",
    "ō": "o'", "ŏ": "o'", "ő": "o'",
    "ŕ": "r",
    "ś": "s", "ş": "sh",
    "ũ": "u", "ū": "u", "ů": "u",
    "ź": "z", "ž": "j",
    "ǒ": "o'", "ǫ": "q",
    "ǵ": "g'",
    "ɓ": "b",
    "ə": "e",
    '²': '2',
    '³': '3',
    '¹': '1',
    'ď': 'd',
    'ɢ': 'g',
    'ɪ': 'i',
    'ɴ': 'n',
    'ʀ': 'r',
    'ʏ': 'y',
    'ʜ': 'h',
    'ʟ': 'l',
    'ө': 'o',
    'ᴀ': 'a',
    'ᴄ': 'c',
    'ᴅ': 'd',
    'ᴇ': 'e',
    'ᴊ': 'j',
    'ᴋ': 'k',
    'ᴍ': 'm',
    'ᴏ': 'o',
    'ᴘ': 'p',
    'ᴛ': 't',
    'ᴜ': 'u',
    '⁰': '0',
    '⁴': '4',
    '⁵': '5'
}


def normalize_text(text: str) -> str:
    out = []
    for ch in text:
        # skip unwanted characters
        if ch in final_clean:
            continue

        # keep only allowed characters (latin, cyrillic, digits, spaces)
        if not allowed_re.fullmatch(ch):
            continue

        # map special latin → uzbek letters
        out.append(latin_map.get(ch, ch))

    return "".join(out)


In [72]:
import swifter

uzum_reviews_filtered_df['clean_text'] = uzum_reviews_filtered_df["normalized_review_text"].astype(str).swifter.apply(normalize_text)


Pandas Apply:   0%|          | 0/310217 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uzum_reviews_filtered_df['clean_text'] = uzum_reviews_filtered_df["normalized_review_text"].astype(str).swifter.apply(normalize_text)


In [74]:
normalized_characters = sorted(list(set("\n".join(uzum_reviews_filtered_df["clean_text"].tolist()))))

print("Total characters: ", len(normalized_characters))

Total characters:  54


In [79]:
uzum_df = uzum_reviews_filtered_df[['clean_text', 'rnk']]

uzum_df = uzum_df.rename(columns={'clean_text' : 'review_text'})

In [81]:
text = uzum_df['review_text'].tolist()

In [84]:
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.BpeTrainer(
    vocab_size=30000,
    special_tokens=["<pad>", "<unk>"]
)

# train on your list of texts or text files
tokenizer.train_from_iterator(text, trainer)

PAD_ID = tokenizer.token_to_id("<pad>")
UNK_ID = tokenizer.token_to_id("<unk>")


In [85]:
def padding_sentence(ids, max_len=80, pad_id=PAD_ID):
    if len(ids) < max_len:
        ids += [pad_id] * (max_len - len(ids))
    return ids[:max_len]

X_seq = [padding_sentence(tokenizer.encode(t).ids) for t in text]
print(tokenizer.decode(X_seq[0]))



data = torch.tensor(X_seq, dtype=torch.long)
y = torch.tensor(uzum_df["rnk"], dtype=torch.long)

rahmat juda yoqdi


In [94]:
uzum_df.iloc[2133, :]

Unnamed: 0,2365
review_text,oddiy unchalik emas
rnk,2


In [96]:
tokenizer.decode(X_seq[2133])

'oddiy unchalik emas'

In [99]:
data = torch.tensor(X_seq, dtype=torch.long)
y = torch.tensor(uzum_df["rnk"], dtype=torch.long)