In [1]:
import json
import pickle
from argparse import ArgumentParser, Namespace
from pathlib import Path
from typing import Dict

import torch
from torch.utils.data import DataLoader
from dataset import SeqClsDataset
from model import SeqClassifier
from utils import Vocab

In [2]:
cache_dir = Path("./cache/intent/")
test_file = Path("data/intent/test.json")
max_len = 128
with open(cache_dir / "vocab.pkl", "rb") as f:
    vocab: Vocab = pickle.load(f)

intent_idx_path = cache_dir / "intent2idx.json"
intent2idx: Dict[str, int] = json.loads(intent_idx_path.read_text())

data = json.loads(test_file.read_text())
dataset = SeqClsDataset(data, vocab, intent2idx, max_len)

dataset

<dataset.SeqClsDataset at 0x7f5ef45e14f0>

In [3]:
batch_size = 128
test_data_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=dataset.collate_fn,
)

In [4]:
for batch in test_data_loader:
    print(batch)
    print(batch.get("ids").shape)
    break

{'ids': tensor([[1909, 3481, 2562,  ...,    0,    0,    0],
        [1526, 1208,    0,  ...,    0,    0,    0],
        [1235, 2276, 2259,  ...,    0,    0,    0],
        ...,
        [1235, 2276, 2562,  ...,    0,    0,    0],
        [5601, 6444, 2830,  ...,    0,    0,    0],
        [1235, 2276, 3800,  ...,    0,    0,    0]]), 'labels': [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, 

In [19]:
hidden_size = 512
num_layers = 2
dropout = 0.1
bidirectional = True
num_class = 150
device = "cuda"
embeddings = torch.load(cache_dir / "embeddings.pt")


lstm_model = SeqClassifier(
    embeddings,
    int(hidden_size),
    int(num_layers),
    float(dropout),
    int(bidirectional),
    num_class,
)
lstm_model.load_state_dict(torch.load("lstm.pt"))
lstm_model.to(device)
lstm_model.eval()

predict_list = list()

for i, batch in enumerate(test_data_loader):
    ids = batch["ids"].to(device)
    pred = lstm_model(ids).to(device)
    
    pred_labels = pred.argmax(dim=1).tolist()
    for idx, label in zip(batch["index"], pred_labels):
        predict_list.append([idx, label])
    
    # print(pred.argmax(dim=1))
    # print(batch["index"])

In [20]:
print(predict_list[:5])

[['test-1237', 30], ['test-4111', 102], ['test-3093', 33], ['test-3623', 102], ['test-2637', 43]]


In [22]:
predict_list = sorted(predict_list, key=lambda x : int(x[0].replace("test-", "")))
predict_list[:5]

[['test-0', 56],
 ['test-1', 114],
 ['test-2', 51],
 ['test-3', 141],
 ['test-4', 147]]

In [28]:
import pandas as pd

intent2idx

idx2intent = {v: k for k, v in intent2idx.items()}


predict_labels = [idx2intent[item[1]] for item in predict_list]
predict_idx = [item[0] for item in predict_list]

In [30]:
display(predict_labels[:5])
display(predict_idx[:5])

['bill_balance',
 'restaurant_suggestion',
 'report_lost_card',
 'timezone',
 'what_is_your_name']

['test-0', 'test-1', 'test-2', 'test-3', 'test-4']

In [32]:
results_csv = pd.DataFrame(
    {
        "id": predict_idx,
        "intent": predict_labels
    }
)
display(results_csv)

Unnamed: 0,id,intent
0,test-0,bill_balance
1,test-1,restaurant_suggestion
2,test-2,report_lost_card
3,test-3,timezone
4,test-4,what_is_your_name
...,...,...
4495,test-4495,what_can_i_ask_you
4496,test-4496,restaurant_reviews
4497,test-4497,travel_suggestion
4498,test-4498,pto_request


In [34]:
dataset.num_classes

150