In [1]:
from fasttext_classifier.model import FastTextClassifier, FastTextClassifierConfig
from fasttext_classifier.encoder import FastTextEncoder
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning import Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_name = "ag_news"
dataset = load_dataset(dataset_name)

Found cached dataset ag_news (/Users/josephlee/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 196.69it/s]


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [4]:
config = FastTextClassifierConfig(
    num_classes=4,
    batch_size=256,
    lr=0.5,
    min_n=2,
    max_n=6,
    word_ngrams=2,
    dim=10,
    bucket=10000,
)

initialize tokenizer

In [5]:
def _tokenize(s):
    return s.split()

def collate_batch(batch):
    label_list = torch.LongTensor([x["label"] for x in batch])
    out = tokenizer(
        [_tokenize(x["text"]) for x in batch], return_tensors="pt", ft_mode=True
    )
    # input_ids = out["input_ids"]
    return {
        "label": label_list,
        "input_ids": out["input_ids"]
    }

In [6]:
tokenizer = FastTextEncoder([_tokenize(x) for x in dataset["train"]["text"]], config=config)

In [7]:
config.vocab_size = tokenizer.vocab_size

In [8]:
trainloader = DataLoader(
    dataset["train"], batch_size=config.batch_size, shuffle=False, collate_fn=collate_batch
)
testloader = DataLoader(
    dataset["test"], batch_size=config.batch_size, shuffle=False, collate_fn=collate_batch
)

In [16]:
model = FastTextClassifier(config)

In [17]:
from pytorch_lightning.callbacks import LearningRateMonitor
lr_monitor = LearningRateMonitor(logging_interval='step')

In [18]:
trainer = Trainer(
    logger=[pl_loggers.TensorBoardLogger(save_dir="./"), pl_loggers.CSVLogger(save_dir="./")],
    accelerator="cpu",
    max_steps=100,
    # max_epochs=2,
    # callbacks=[lr_monitor]
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [19]:
trainer.fit(model, trainloader, testloader)


  | Name       | Type                | Params
---------------------------------------------------
0 | criterion  | CrossEntropyLoss    | 0     
1 | embedding  | Embedding           | 2.0 M 
2 | fc1        | Linear              | 44    
3 | val_acc    | MulticlassAccuracy  | 0     
4 | val_prec   | MulticlassPrecision | 0     
5 | val_recall | MulticlassRecall    | 0     
6 | val_f1     | MulticlassF1Score   | 0     
---------------------------------------------------
2.0 M     Trainable params
0         Non-trainable params
2.0 M     Total params
7.925     Total estimated model params size (MB)


Epoch 0:  20%|██████████████████████▊                                                                                           | 100/499 [00:11<00:44,  8.93it/s, loss=1.6, v_num=4]

`Trainer.fit` stopped: `max_steps=100` reached.


Epoch 0:  20%|██████████████████████▊                                                                                           | 100/499 [00:11<00:44,  8.90it/s, loss=1.6, v_num=4]


In [20]:
trainer.validate(model, testloader)

Validation DataLoader 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:02<00:00, 11.70it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        eval:acc                   0.25
      eval:f1score          0.10000000149011612
     eval:precision               0.0625
       eval:recall                 0.25
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'eval:acc': 0.25,
  'eval:precision': 0.0625,
  'eval:recall': 0.25,
  'eval:f1score': 0.10000000149011612}]

In [14]:
outs = trainer.predict(model, testloader)

  rank_zero_warn(


Predicting DataLoader 0: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:02<00:00, 11.82it/s]


In [15]:
outs[0]

{'label': array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]),
 'score': array([0.5766617 , 0.5762534 , 0.5765954 , 0.5762476 , 0.5763752 ,
        0.57538366, 0.575661  , 0.5753223 , 0.57