## Import dependencies

In [1]:
import numpy as np
import random
import json
import copy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

import glob

from tqdm.notebook import tqdm

### Fix random seed

In [2]:
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
torch.backends.cudnn.deterministic=True 

### Import meta info (tokens, number of users )

In [3]:
meta = json.load(open('./meta.json', 'r'))
tokens = meta['tokens']
num_token = len(tokens)
num_user = meta['num_user']

In [4]:
print('In dataset, there are {} number of tokens (words) and these tweets are from {} users'.format(num_token, num_user))

In dataset, there are 13369 number of tokens (words) and these tweets are from 8 users


### Load train and validataion dataset

In [5]:
train_data = json.load(open('./train.json', 'r'))
valid_data = json.load(open('./valid.json', 'r'))
s_idx = 0

In [6]:
print('{} tweets in train dataset, {} tweets in valid dataset.'.format(len(train_data), len(valid_data)))
print('Each json file is a list of dictionary, and each dictionary has information of tweets')
print('[TWEET INFO]: user id, sentence, processed token id.')
print()

print('Sample train data: ', train_data[s_idx])
print()
print('Note that: tokens.index(word) = token_id')
print()
print('Example:')
print(train_data[0]['token_id'])
print([tokens.index(w) for w in train_data[s_idx]['sentence'].split()])

6400 tweets in train dataset, 356 tweets in valid dataset.
Each json file is a list of dictionary, and each dictionary has information of tweets
[TWEET INFO]: user id, sentence, processed token id.

Sample train data:  {'user_id': 0, 'sentence': 'i recently met lakeisha crum the first in her family to go to college loved her story', 'token_id': [5721, 9659, 7459, 6629, 2686, 11853, 4447, 5870, 5460, 4236, 12017, 4981, 12017, 2197, 7047, 5460, 11310]}

Note that: tokens.index(word) = token_id

Example:
[5721, 9659, 7459, 6629, 2686, 11853, 4447, 5870, 5460, 4236, 12017, 4981, 12017, 2197, 7047, 5460, 11310]
[5721, 9659, 7459, 6629, 2686, 11853, 4447, 5870, 5460, 4236, 12017, 4981, 12017, 2197, 7047, 5460, 11310]


### Define Dataset and DataLoader
- Note that below code for dataset and dataloader only supports `batch_size = 1`.
- Try to find out a way to batchfy the data.
- Even if you batchfy the data, put the `token_id` information into `sample['token_id']`

In [7]:
class tweetDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        sample['token_id'] = torch.Tensor(sample['token_id'])
        return sample

In [8]:
train_dataset = tweetDataset(train_data)
valid_dataset = tweetDataset(valid_data)

train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=1, shuffle=True)

### Sample datapoint information

In [9]:
sample = next(iter(train_dataloader))

print('Sample from train dataloader: ')
print('USER ID: ', sample['user_id'])
print('TOKEN ID: ', sample['token_id'])
print('TOKEN ID shape should be BATCH by LENGTH: ', sample['token_id'].shape)

Sample from train dataloader: 
USER ID:  tensor([5])
TOKEN ID:  tensor([[13111., 11092., 11990., 11853.,  4408.,  6661., 13196.,  1048., 12924.,
         11842.,  2258.,  3885.,  7569.,  7116.,  5870.,  7319.,  6993.]])
TOKEN ID shape should be BATCH by LENGTH:  torch.Size([1, 17])


### Define model based on LSTM
- Note that below code for model only supports `batch_size = 1`.
- Try to find out a way to use mini-batch.

```diff
- You must make your class name as "Model", as below.
- You must make your model work with the input of sample['token_id']
```

In [10]:
class Model(nn.Module):
    def __init__(self, num_token, num_user, embed_dim, rnn_dim, num_layers):
        super(Model, self).__init__()
        self.num_token = num_token
        self.num_user = num_user
        self.embed_dim = embed_dim
        self.rnn_dim = rnn_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(num_token, embed_dim)
        self.rnn = nn.LSTM(embed_dim, rnn_dim, num_layers=num_layers, batch_first=True)
        self.out_linear = nn.Linear(rnn_dim, num_user)
        
    def forward(self, token_id):
        embed = self.embedding(token_id)
        out, _ = self.rnn(embed)
        return self.out_linear(out[:, -1])

### Make an instance of model and define optimizer

In [11]:
device = 'cuda'

model = Model(num_token, num_user, embed_dim=512, rnn_dim=1024, num_layers=1).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-7)

### Number of parameter information
```diff
- The number of parameters should not exceed 20,000,000 !!
- DO NOT USE TRANSFORMER-BASED MODELS!!
- Transformer-based models will not be accepted as a submission.
```

In [12]:
num_param = sum(p.numel() for p in model.parameters())
print('Number of parameters: {}'.format(num_param))
print('[NOTE] Number of parameters SHOULD NOT exceed 20,000,000 (20 million).')

Number of parameters: 13152776
[NOTE] Number of parameters SHOULD NOT exceed 20,000,000 (20 million).


### Test the model
```diff
- Test the model if it generates proper output, which shape is B by num_user
```

In [13]:
pred = model(sample['token_id'].long().to(device))
print('Prediction shape would be BATCH X NUM_USER(OUTPUT) : ', pred.shape)

Prediction shape would be BATCH X NUM_USER/OUTPUT :  torch.Size([1, 8])


### Run training for 100 epochs

In [14]:
criteria = nn.CrossEntropyLoss()
avg_loss = 0.0
best_valid_accu = 0.0
best_epoch = -1
best_model = None
num_epoch = 30

for epoch in tqdm(range(num_epoch)):
    # start training
    for sample in train_dataloader:
        model.train()
        optimizer.zero_grad()

        pred = model(sample['token_id'].long().to(device))

        loss = criteria(pred, sample['user_id'].long().to(device))

        loss.backward()
        optimizer.step()

        avg_loss += loss.item() / len(train_dataloader)

    # start validation
    correct_cnt = 0.0
    data_cnt = 0.0
    for sample in valid_dataloader:
        model.eval()

        with torch.no_grad():
            pred = model(sample['token_id'].long().to(device))

        pred_user_id = torch.argmax(pred, dim=-1)

        accu = pred_user_id.detach().cpu() == sample['user_id']

        correct_cnt += torch.sum(accu)
        data_cnt += sample['token_id'].shape[0]

    # calculate best valid accuracy, and save the best model. 
    curr_valid_accu = (correct_cnt / data_cnt).item()

    best_valid_accu = max(best_valid_accu, curr_valid_accu)
    if best_valid_accu == curr_valid_accu:
        best_epoch = epoch
        best_model = copy.deepcopy(model)
        torch.save(best_model.state_dict(), 'best_baseline.pth')
        print('[EPOCH {}] BEST VALID ACCURACY UPDATED: {}'.format(epoch, best_valid_accu))

  0%|          | 0/30 [00:00<?, ?it/s]

[EPOCH 0] BEST VALID ACCURACY UPDATED: 0.3904494345188141
[EPOCH 1] BEST VALID ACCURACY UPDATED: 0.45505619049072266
[EPOCH 2] BEST VALID ACCURACY UPDATED: 0.5758426785469055
[EPOCH 3] BEST VALID ACCURACY UPDATED: 0.617977499961853
[EPOCH 5] BEST VALID ACCURACY UPDATED: 0.6348314881324768
[EPOCH 6] BEST VALID ACCURACY UPDATED: 0.6404494643211365
[EPOCH 7] BEST VALID ACCURACY UPDATED: 0.6404494643211365
[EPOCH 8] BEST VALID ACCURACY UPDATED: 0.6432584524154663
[EPOCH 9] BEST VALID ACCURACY UPDATED: 0.6432584524154663
[EPOCH 10] BEST VALID ACCURACY UPDATED: 0.6432584524154663
[EPOCH 13] BEST VALID ACCURACY UPDATED: 0.648876428604126
[EPOCH 14] BEST VALID ACCURACY UPDATED: 0.6516854166984558
[EPOCH 15] BEST VALID ACCURACY UPDATED: 0.6516854166984558
[EPOCH 17] BEST VALID ACCURACY UPDATED: 0.6713483333587646
[EPOCH 18] BEST VALID ACCURACY UPDATED: 0.6797752976417542
[EPOCH 27] BEST VALID ACCURACY UPDATED: 0.6910112500190735


In [15]:
print('FINISHED TRAINING : BEST MODEL AT EPOCH {} WITH ACCURACY {}'.format(best_epoch, best_valid_accu))

FINISHED TRAINING : BEST MODEL AT EPOCH 27 WITH ACCURACY 0.6910112500190735
