# Build DataLoader for Malaysia Kini data

In [1]:
import torch
torch.cuda.is_available()

True

In [2]:
import argparse
import yaml
import os
import torch
import torch.nn as nn

from utils.dataloader import get_dataloader_and_vocab
from utils.trainer import Trainer
from utils.helper import (
    get_model_class,
    get_optimizer_class,
    get_lr_scheduler,
    save_config,
    save_vocab,
)

## 1: Configuration File & Delete existing folder

In [3]:
config = {'model_name': 'skipgram',
#  'dataset': 'WikiText2',
#  'data_dir': 'data/',
 'train_batch_size': 96,
 'val_batch_size': 96,
 'shuffle': True,
 'optimizer': 'Adam',
 'learning_rate': 0.025,
 'epochs': 5,
 'train_steps': None,
 'val_steps': None,
 'checkpoint_frequency': None,
 'model_dir': 'weights/skipgram_MalaysiaKini'}

In [4]:
import shutil
if os.path.exists(config['model_dir']):
#     os.removedirs(config['model_dir'])
    shutil.rmtree(config['model_dir'])

#Create model directory
os.makedirs(config["model_dir"])

In [5]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

def to_map_style_dataset(iter_data):
    r"""Convert iterable-style dataset to map-style dataset.

    args:
        iter_data: An iterator type object. Examples include Iterable datasets, string list, text io, generators etc.


    Examples:
        >>> from torchtext.datasets import IMDB
        >>> from torchtext.data import to_map_style_dataset
        >>> train_iter = IMDB(split='train')
        >>> train_dataset = to_map_style_dataset(train_iter)
        >>> file_name = '.data/EnWik9/enwik9'
        >>> data_iter = to_map_style_dataset(open(file_name,'r'))
    """

    # Inner class to convert iterable-style to map-style dataset
    class _MapStyleDataset(torch.utils.data.Dataset):
        def __init__(self, iter_data):
            # TODO Avoid list issue #1296
#             self._data = list(iter_data)
            self._data =  iter_data

        def __len__(self):
            return len(self._data)

        def __getitem__(self, idx):
            return self._data[idx]

    return _MapStyleDataset(iter_data)

## 2: Read data and create DataLoader class

In [17]:
from torch.utils.data import Dataset, DataLoader

class MalaysiaKiniData(Dataset):
    def __init__(self, data):
        #load data
        self.data = data
        
    def __getitem__(self, index):
        #return exact data
        return self.data[index]
    
    def __len__(self):
        #return data length
        return len(self.data)

In [20]:
from utils.dataloader import collate_skipgram, collate_cbow, build_vocab
from torch.utils.data import DataLoader
from functools import partial
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd

def get_dataloader_and_vocab_malay(
    model_name, ds_path, ds_column, batch_size, shuffle, vocab=None
):

    df = pd.read_csv(ds_path)
    df = df.dropna(how="any")
    tokenizer = get_tokenizer("basic_english")

    def yield_sentences():
        for sent in df[ds_column]:
            yield sent

    sentences_generator = yield_sentences()

    data_iter = MalaysiaKiniData(df[ds_column])
    tokenizer = get_tokenizer("basic_english")

    
    if not vocab:
        vocab = build_vocab(sentences_generator, tokenizer)
        
    text_pipeline = lambda x: vocab(tokenizer(x))

    if model_name == "cbow":
        collate_fn = collate_cbow
    elif model_name == "skipgram":
        collate_fn = collate_skipgram
    else:
        raise ValueError("Choose model from: cbow, skipgram")
    print("Vocab size: ", len(vocab.get_stoi()))
    dataloader = DataLoader(
        data_iter,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=partial(collate_fn, text_pipeline=text_pipeline),
    )
    return dataloader, vocab

In [23]:
#Load train dataset

train_dataloader, vocab = get_dataloader_and_vocab_malay(
    model_name=config["model_name"],
    ds_path = "data/malaysia-kini/train.csv",
    ds_column = "Target",
    batch_size=config["train_batch_size"],
#     shuffle=config["shuffle"],
    shuffle=False,
    vocab=None,
)

#Get vocab size
vocab_size = len(vocab.get_stoi())
print(f"Vocabulary size: {vocab_size}")


Vocab size:  1715
Vocabulary size: 1715


In [24]:
#Load Validation dataset
val_dataloader, _ = get_dataloader_and_vocab_malay(
    model_name=config["model_name"],
    ds_path = "data/malaysia-kini/valid.csv",
    ds_column = "Target",
    batch_size=config["train_batch_size"],
#     shuffle=config["shuffle"],
    shuffle=False,
    vocab=None,
)

Vocab size:  380


In [25]:
#Get vocab size
vocab_size = len(vocab.get_stoi())
print(f"Vocabulary size: {vocab_size}")

#Get model config
model_class = get_model_class(config["model_name"])
model = model_class(vocab_size=vocab_size)
criterion = nn.CrossEntropyLoss()

#Model parameters : optimizer, learning rate
optimizer_class = get_optimizer_class(config["optimizer"])
optimizer = optimizer_class(model.parameters(), lr=config["learning_rate"])
lr_scheduler = get_lr_scheduler(optimizer, config["epochs"], verbose=True)

Vocabulary size: 1715
Adjusting learning rate of group 0 to 2.5000e-02.


In [26]:
#Set training on CUDA if CUDA available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [27]:
#Model training
trainer = Trainer(
    model=model,
    epochs=config["epochs"],
    train_dataloader=train_dataloader,
    train_steps=config["train_steps"],
    val_dataloader=val_dataloader,
    val_steps=config["val_steps"],
    criterion=criterion,
    optimizer=optimizer,
    checkpoint_frequency=config["checkpoint_frequency"],
    lr_scheduler=lr_scheduler,
    device=device,
    model_dir=config["model_dir"],
    model_name=config["model_name"],
)

#Finish training
trainer.train()
print("Training finished.")

trainer.save_model()
trainer.save_loss()
save_vocab(vocab, config["model_dir"])
save_config(config, config["model_dir"])
print("Model artifacts saved to folder:", config["model_dir"])

Epoch: 1/5, Train Loss=5.47417, Val Loss=4.04396
Adjusting learning rate of group 0 to 2.0000e-02.
Epoch: 2/5, Train Loss=5.32944, Val Loss=4.00713
Adjusting learning rate of group 0 to 1.5000e-02.
Epoch: 3/5, Train Loss=5.28547, Val Loss=3.98888
Adjusting learning rate of group 0 to 1.0000e-02.
Epoch: 4/5, Train Loss=5.25762, Val Loss=3.97517
Adjusting learning rate of group 0 to 5.0000e-03.
Epoch: 5/5, Train Loss=5.22800, Val Loss=3.96532
Adjusting learning rate of group 0 to 0.0000e+00.
Training finished.
Model artifacts saved to folder: weights/skipgram_MalaysiaKini


## Read data all Malaysia Kini data

In [2]:
import pandas as pd

In [24]:
df = pd.read_excel("data/malaysia-kini/06_MalaysiaKini & Awani Articles 2022.xlsx")

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116030 entries, 0 to 116029
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Target       116030 non-null  object
 1   Source       116030 non-null  object
 2   Text Length  116030 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.7+ MB


In [26]:
all_data = df['Target'].tolist()

In [27]:
all_data.extend( df['Source'].tolist())

In [28]:
dfSeries = pd.Series(all_data)

In [29]:
dfSeries.drop_duplicates(inplace=True)

In [30]:
dfSeries.dropna(inplace=True)

In [31]:
dfSeries.reset_index(drop=True, inplace=True)

In [32]:
dfSeries

0         global | kumpulan hak asasi manusia dan duta u...
1         amnesty international dan human rights watch b...
2         duta ukraine ke amerika syarikat, oksana marka...
3         "mereka menggunakan bom vakum hari ini," kata ...
4         "...kemusnahan yang cuba dikenakan oleh rusia ...
                                ...                        
220321    beliau berkata para pelabur menantikan dengan ...
220322    "ringgit dijangka diniagakan tggi dn menguji p...
220323    berbanding mata wang utama yg lain, ringgit be...
220324    unit tempatan tue lebih tnggi berbanding dolar...
220325    bagaimanapun, ringgit lemah berbanding yen jep...
Length: 220326, dtype: object