In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import regex as re
import swifter
from torch.utils.data import TensorDataset, DataLoader, random_split
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

import torch
import torch.nn as nn


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

vocab_size = 30000
pad_token = '<pad>'
unk_token = '<unk>'
block_size = 80

Using device: cuda


### Main Goals:
- first, convert all text cleaning process into one function to make it reusable
- then, training different 2 models

In [None]:
def get_normalized_customer_reviews():
    uzum_df = pd.read_parquet("./uzum_dataset.parquet", engine='pyarrow')
    rating_map = {
    'poor' : 0,
    'very poor' : 0,
    'fair' : 1,
    'good' : 2,
    'excellent' : 2
    }

    uzum_df["rank"] = uzum_df["rating"].map(rating_map)
    uzum_df.drop('rating', axis=1, inplace=True)

    latin = r"\p{Latin}"
    cyrillic = r"\p{Cyrillic}"
    digits = r"\p{Number}"


    allowed_re = re.compile(fr"(?:{latin}|{cyrillic}|{digits}|\s)")

    final_clean = {'ø','ʔ','ʕ','ʖ','ᴥ','ᵕ','⅚','ᴗ'}

    latin_map = {
    "à": "a", "á": "a", "â": "a", "ã": "a",
    "ç": "c",
    "è": "e", "é": "e", "ë": "e",
    "ì": "i", "í": "i",
    "ñ": "n",
    "ò": "o", "ó": "o", "ô": "o", "õ": "o", "ö": "o",
    "ù": "u", "ú": "u", "û": "u", "ü": "u",
    "ý": "y", "ÿ": "y",
    "ĝ": "g'", "ğ": "g'", "ġ": "g'", "ģ": "g'",
    "ĥ": "h",
    "ı": "i",
    "ĵ": "j",
    "ķ": "k",
    "ĺ": "l", "ļ": "l",
    "ń": "n", "ň": "n",
    "ō": "o'", "ŏ": "o'", "ő": "o'",
    "ŕ": "r",
    "ś": "s", "ş": "sh",
    "ũ": "u", "ū": "u", "ů": "u",
    "ź": "z", "ž": "j",
    "ǒ": "o'", "ǫ": "q",
    "ǵ": "g'",
    "ɓ": "b",
    "ə": "e",
    '²': '2',
    '³': '3',
    '¹': '1',
    'ď': 'd',
    'ɢ': 'g',
    'ɪ': 'i',
    'ɴ': 'n',
    'ʀ': 'r',
    'ʏ': 'y',
    'ʜ': 'h',
    'ʟ': 'l',
    'ө': 'o',
    'ᴀ': 'a',
    'ᴄ': 'c',
    'ᴅ': 'd',
    'ᴇ': 'e',
    'ᴊ': 'j',
    'ᴋ': 'k',
    'ᴍ': 'm',
    'ᴏ': 'o',
    'ᴘ': 'p',
    'ᴛ': 't',
    'ᴜ': 'u',
    '⁰': '0',
    '⁴': '4',
    '⁵': '5'
}


    def normalize_text(text: str) -> str:
        out = []
        for ch in text:
            # skips unnessary ones
            if ch in final_clean:
               continue

            # keeps only necessary chars
            if not allowed_re.fullmatch(ch):
               continue

            # maps final
            out.append(latin_map.get(ch, ch))

        return "".join(out)

    uzum_df["review_text"] = uzum_df["normalized_review_text"].astype(str).swifter.apply(normalize_text)
    uzum_df['len'] = uzum_df['review_text'].astype(str).str.len()
    uzum_df = uzum_df[uzum_df['len'] > 0].drop(['len', 'normalized_review_text'], axis=1)[['review_text', 'rank']]

    return uzum_df


In [None]:
def get_tokenized_data():

    df = get_normalized_customer_reviews()

    tokenizer = Tokenizer(models.BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()


    trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            special_tokens=[pad_token, unk_token]
        )

    tokenizer.train_from_iterator(df["review_text"].astype(str).tolist(), trainer)

    PAD_ID = tokenizer.token_to_id(pad_token)
    UNK_ID = tokenizer.token_to_id(unk_token)

    def encode_and_padding(text):
        ids = tokenizer.encode(text).ids[:block_size]
        ids += [PAD_ID] * (block_size - len(ids))
        return ids

    X_padded = [encode_and_padding(t) for t in df["review_text"]]

    data = torch.tensor(X_padded, dtype=torch.long)
    targets = torch.tensor(df['rank'].values, dtype=torch.long)

    return data, targets, tokenizer

In [None]:
from sklearn.model_selection import train_test_split

data, targets, tokenizer = get_tokenized_data()

X_train, X_val, y_train, y_val = train_test_split(
    data, targets, test_size=0.2, random_state=42
)

train_ds = TensorDataset(X_train, y_train)
val_ds = TensorDataset(X_val, y_val)

train_loader = DataLoader(
    train_ds, batch_size=32, shuffle=True, pin_memory=True
)

val_loader = DataLoader(
    val_ds, batch_size=32, pin_memory=True
)

In [None]:
class BoWMLPClassifier(nn.Module):
    def __init__(self,hidden_dim=64, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(vocab_size, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 3)
        )

    def forward(self, x):
        return self.net(x)



def to_bow(x, vocab_size, pad_id=0):
    bow = torch.zeros(x.size(0), vocab_size, device=x.device)
    mask = x != pad_id
    bow.scatter_(1, x * mask, 1.0)
    return bow

In [None]:
model = BoWMLPClassifier().to(device)


criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
PAD_ID = tokenizer.token_to_id(pad_token)


epochs = 5

for epoch in range(epochs):
    # ===== TRAIN =====
    model.train()
    train_loss = 0

    for xb, yb in train_loader:
        xb = xb.to(device, non_blocking=True)
        yb = yb.to(device, non_blocking=True)

        xb_bow = to_bow(xb, vocab_size, PAD_ID)

        optimizer.zero_grad()
        logits = model(xb_bow)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    # ===== VALIDATION =====
    model.eval()
    val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True)

            xb_bow = to_bow(xb, vocab_size, PAD_ID)
            logits = model(xb_bow)
            loss = criterion(logits, yb)

            val_loss += loss.item()
            preds = logits.argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)

    val_loss /= len(val_loader)
    acc = correct / total

    print(
        f"epoch {epoch+1}: "
        f"train loss = {train_loss:.4f}, "
        f"val loss = {val_loss:.4f}, "
        f"val acc = {acc:.4f}"
    )

Pandas Apply:   0%|          | 0/352151 [00:00<?, ?it/s]

epoch 1: train loss = 0.3340, val loss = 0.3050, val acc = 0.9022
epoch 2: train loss = 0.2850, val loss = 0.3069, val acc = 0.9033
epoch 3: train loss = 0.2634, val loss = 0.3158, val acc = 0.9035
epoch 4: train loss = 0.2468, val loss = 0.3277, val acc = 0.9025
epoch 5: train loss = 0.2323, val loss = 0.3435, val acc = 0.8996


### Comments:
- Although model improved slightly, in fifth iteration, it is getting overfitted