In [1]:
from types import SimpleNamespace
from collections import Counter
import os
import re
import pathlib
import array
import pickle
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

In [2]:
# Identifier of the preprocessed dataset variant; it is embedded in the data
# directories below and in the saved model file name.
DATASET_VERSION = 'ca-100'
# Directory from which x_valid.csv, y_valid.csv and x_test.csv are read.
COMPETITION_ROOT = '../input/vectors'
# Directory holding the precomputed vocabulary and .npz splits for this version.
DATASET_ROOT = f'../input/cbow-preprocessing/data/{DATASET_VERSION}'
# Output directory (created later in the notebook) where the trained model is saved.
WORKING_ROOT = f'data/{DATASET_VERSION}'
# File-name prefix shared by the dataset files inside DATASET_ROOT / WORKING_ROOT.
DATASET_PREFIX = 'ca.wiki'

In [3]:
# Run configuration gathered in one namespace so later cells read `params.<name>`.
params = SimpleNamespace(
    embedding_dim = 100,  # size of each word vector; passed to the CBOW constructor
    window_size = 5,  # not read by any code visible in this notebook
    batch_size = 1000,  # samples per batch for train() and validate()
    epochs = 4,  # number of iterations of the training loop
    preprocessed = f'{DATASET_ROOT}/{DATASET_PREFIX}',  # prefix of the .vocab / .{part}.npz input files
    working = f'{WORKING_ROOT}/{DATASET_PREFIX}',  # not read by any code visible in this notebook
    modelname = f'{WORKING_ROOT}/{DATASET_VERSION}.pt',  # path where the trained state_dict is saved
    train = True  # NOTE(review): no visible cell consults this flag; training always runs
)

In [4]:
# Optional Google Colab support: when the notebook runs on Colab, mount Google
# Drive and switch the working directory to a persistent folder so that relative
# paths survive across sessions. Anywhere else this cell does nothing.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: keep the current working directory.
    pass
else:
    try:
        drive.mount('/content/drive')
        pathlib.Path('/content/drive/My Drive/POE/vectors').mkdir(parents=True, exist_ok=True)
        os.chdir('/content/drive/My Drive/POE/vectors')
    except Exception as err:
        # Still best-effort (the notebook keeps running), but the failure is
        # reported instead of silently swallowed, and KeyboardInterrupt /
        # SystemExit are no longer caught.
        print(f'WARNING: could not set up the Google Drive working directory: {err}')

In [5]:
class Vocabulary(object):
    """Bidirectional mapping between tokens and consecutive integer indices.

    Special tokens (padding, unknown, end-of-sequence) are registered first, in
    that order, so with the defaults they get indices 0, 1 and 2. Passing None
    for a special token disables it; the matching ``*_index`` attribute is then
    None instead of being left undefined.
    """

    def __init__(self, pad_token='<pad>', unk_token='<unk>', eos_token='<eos>'):
        self.token2idx = {}
        self.idx2token = []
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.eos_token = eos_token
        # Always define the three index attributes: get_index() reads
        # self.unk_index, so a missing attribute would break every lookup.
        self.pad_index = self.add_token(pad_token) if pad_token is not None else None
        self.unk_index = self.add_token(unk_token) if unk_token is not None else None
        self.eos_index = self.add_token(eos_token) if eos_token is not None else None

    def add_token(self, token):
        """Register `token` if it is new and return its index."""
        index = self.token2idx.get(token)
        if index is None:
            index = len(self.idx2token)
            self.idx2token.append(token)
            self.token2idx[token] = index
        return index

    def _lookup(self, token):
        """Return the index of one token, falling back to the unknown-token index.

        Raises:
            KeyError: if the token is not registered and there is no unknown token.
        """
        index = self.token2idx.get(token, self.unk_index)
        if index is None:
            raise KeyError(token)
        return index

    def get_index(self, token):
        """Map a token, or an iterable of tokens, to indices.

        A `str` is treated as a single token and yields one int; any other
        argument is iterated and yields a list of ints. Unregistered tokens map
        to the unknown-token index.

        Raises:
            KeyError: for an unregistered token when `unk_token` is None.
        """
        if isinstance(token, str):
            return self._lookup(token)
        return [self._lookup(t) for t in token]

    def __len__(self):
        """Number of registered tokens, special tokens included."""
        return len(self.idx2token)

    def save(self, filename):
        """Pickle the instance attributes to `filename`."""
        with open(filename, 'wb') as f:
            pickle.dump(self.__dict__, f)

    def load(self, filename):
        """Replace the instance attributes with those pickled in `filename`.

        SECURITY: pickle can execute arbitrary code while loading; only load
        vocabulary files that come from a trusted source.
        """
        with open(filename, 'rb') as f:
            self.__dict__.update(pickle.load(f))

In [6]:
def batch_generator(idata, target, batch_size, shuffle=True):
    """Yield successive (inputs, targets) batches of at most `batch_size` samples.

    With `shuffle` the samples are visited in a random order drawn from
    `np.random.permutation`; otherwise they are visited in their stored order.
    When `target` is None the second element of every yielded pair is None.
    """
    total = len(idata)
    order = np.random.permutation(total) if shuffle else range(total)

    for start in range(0, total, batch_size):
        chosen = order[start:start + batch_size]
        inputs = idata[chosen]
        labels = None if target is None else target[chosen]
        yield inputs, labels

CBOW model
----------
You can add extra tensors to the model in the *\_\_init\_\_()* method, either with *self.register_buffer()* (for fixed values that are not trained):

    self.register_buffer('position_weight', torch.tensor([1,2,2,1], dtype=torch.float32))
 
or with *nn.Parameter()* (for values that are learned during training):

    self.position_weight = nn.Parameter(torch.tensor([1,2,2,1], dtype=torch.float32))
    
In both cases, you can reference and use them in the *forward* method as

    self.position_weight

In [7]:
class CBOW(nn.Module):
    """Continuous bag-of-words model with a trained weight vector per context position.

    Each context word embedding is multiplied element-wise by the weight vector
    of its position, the weighted embeddings are summed, and a bias-free linear
    layer turns the sum into one score per vocabulary entry.

    Args:
        num_embeddings: vocabulary size (rows of the embedding table and number
            of output scores).
        embedding_dim: size of each word vector.
        context_size: number of context positions in every input row. Defaults
            to 4, the value that was previously hard-coded.
    """

    def __init__(self, num_embeddings, embedding_dim, context_size=4):
        super().__init__()
        # Index 0 is the padding index: its embedding is not updated by training.
        self.emb = nn.Embedding(num_embeddings, embedding_dim, padding_idx=0)
        self.lin = nn.Linear(embedding_dim, num_embeddings, bias=False)

        # Alternatives that were considered for the position weights:
        # a) A fixed scalar weight (1,2,2,1) to give more weight to the words that are closer to the predicted central word
        # self.register_buffer('position_weight', torch.tensor([1,2,2,1], dtype=torch.float32).view(1,4,1))

        # b) A trained scalar weight for each position
        # self.position_weight = nn.Parameter(torch.tensor([1,2,2,1], dtype=torch.float32).view(1,4,1))

        # c) A trained vector weight for each position. Each word vector is element-wise multiplied by the
        #    corresponding position-dependent weight, and then added with the rest of the weighted word vectors.
        #    The shape follows embedding_dim (it used to be fixed to 100, which broke any other dimension).
        self.position_weight = nn.Parameter(torch.ones([context_size, embedding_dim], dtype=torch.float32))

    def forward(self, inputs):
        """Return vocabulary scores for a batch of context windows.

        Args:
            inputs: integer tensor of token indices, shape (B, context_size).

        Returns:
            Tensor of shape (B, num_embeddings) with unnormalised scores.
        """
        # (B, C, E) * (C, E) broadcasts over the batch dimension
        weighted = self.emb(inputs) * self.position_weight
        # B * E
        U = weighted.sum(dim=1)
        # B * V
        V = self.lin(U)
        return V

In [8]:
def load_preprocessed_dataset(prefix):
    """Load the precomputed vocabulary and the train/valid/test arrays.

    Reads `{prefix}.vocab` into a Vocabulary and, for each split, the arrays
    stored under the keys 'idata' and 'target' in `{prefix}.{part}.npz`.

    Returns:
        (token_vocab, data) where `data` is a list of (idata, target) tuples in
        the order train, valid, test.
    """
    token_vocab = Vocabulary()
    token_vocab.load(f'{prefix}.vocab')

    def _read_split(part):
        # Pull both arrays out of the archive before it is closed.
        with np.load(f'{prefix}.{part}.npz') as archive:
            idata, target = archive['idata'], archive['target']
        print(f'Number of samples ({part}): {len(target)}')
        return (idata, target)

    data = [_read_split(part) for part in ('train', 'valid', 'test')]
    print("Using precomputed vocabulary and data files")
    print(f'Vocabulary size: {len(token_vocab)}')
    return token_vocab, data

In [9]:
def train(model, criterion, optimizer, idata, target, batch_size, device, log=False):
    """Run one optimisation pass over (idata, target) in shuffled mini-batches.

    For every batch the gradients are reset, the loss is back-propagated and
    `optimizer` takes one step. A progress line is printed after 200 and 500
    updates and then every 1000 updates, whatever the value of `log`; `log`
    only controls the final summary line.

    Returns:
        (accuracy, total_loss): percentage of targets predicted correctly, and
        the accumulated loss divided by the number of targets seen.
    """
    model.train()
    total_loss, ncorrect, ntokens, niterations = 0, 0, 0, 0
    batches = batch_generator(idata, target, batch_size, shuffle=True)
    for niterations, (batch_x, batch_y) in enumerate(batches, start=1):
        # Move the batch to the target device as int64 index tensors
        inputs = torch.tensor(batch_x, dtype=torch.long, device=device)
        labels = torch.tensor(batch_y, dtype=torch.long, device=device)

        model.zero_grad()
        output = model(inputs)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

        # Running statistics over everything seen so far in this pass
        _, predicted = torch.max(output, 1)
        total_loss += loss.item()
        ncorrect += (predicted == labels).sum().item()
        ntokens += labels.numel()
        if niterations in (200, 500) or niterations % 1000 == 0:
            print(f'Train: wpb={ntokens//niterations}, num_updates={niterations}, accuracy={100*ncorrect/ntokens:.1f}, loss={total_loss/ntokens:.2f}')

    total_loss = total_loss / ntokens
    accuracy = 100 * ncorrect / ntokens
    if log:
        print(f'Train: wpb={ntokens//niterations}, num_updates={niterations}, accuracy={accuracy:.1f}, loss={total_loss:.2f}')
    return accuracy, total_loss

In [10]:
def validate(model, criterion, idata, target, batch_size, device):
    """Evaluate `model` on `idata`, or predict labels when `target` is None.

    The data is visited in its stored order (no shuffling) with the model in
    eval mode and gradient tracking disabled, so no autograd graph is built.

    Args:
        model: module mapping a batch of index tensors to per-class scores.
        criterion: loss used in scoring mode; not called when `target` is None.
        idata: input samples, indexable by batch.
        target: expected labels, or None to request predictions only.
        batch_size: maximum number of samples per batch.
        device: device on which the batch tensors are created.

    Returns:
        (accuracy, total_loss) when `target` is given: percentage of correct
        predictions and accumulated loss divided by the number of targets.
        Otherwise a 1-D numpy array with the predicted index of every sample.
    """
    model.eval()
    total_loss = 0
    ncorrect = 0
    ntokens = 0
    y_pred = []
    # Inference only: skip gradient bookkeeping to save memory and time.
    with torch.no_grad():
        for X, y in batch_generator(idata, target, batch_size, shuffle=False):
            # Get input and target sequences from batch
            X = torch.tensor(X, dtype=torch.long, device=device)
            output = model(X)
            predicted = torch.max(output, 1)[1]
            if target is not None:
                y = torch.tensor(y, dtype=torch.long, device=device)
                loss = criterion(output, y)
                total_loss += loss.item()
                ncorrect += (predicted == y).sum().item()
                ntokens += y.numel()
            else:
                y_pred.append(predicted.detach().to('cpu').numpy())

    if target is not None:
        total_loss = total_loss / ntokens
        accuracy = 100 * ncorrect / ntokens
        return accuracy, total_loss
    else:
        return np.concatenate(y_pred)

In [11]:
# Create the working directory, including missing parents; an already existing
# directory is not an error, so the cell can be re-run safely.
pathlib.Path(WORKING_ROOT).mkdir(parents=True, exist_ok=True)

In [12]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU and warn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print("WARNING: Training without GPU can be very slow!")

In [13]:
# Load the vocabulary and the (idata, target) pairs; `data` is ordered
# train, valid, test, so data[0] is the training split.
vocab, data = load_preprocessed_dataset(params.preprocessed)

Number of samples (train): 80234403
Number of samples (valid): 163012
Number of samples (test): 164055
Using precomputed vocabulary and data files
Vocabulary size: 100002


In [14]:
# Build the model with one embedding row / output score per vocabulary entry
# and move its parameters to the selected device.
model = CBOW(len(vocab), params.embedding_dim).to(device)

In [15]:
# 'El Periodico' validation dataset
# keep_default_na=False: every cell is a literal token, so strings such as
# 'NA', 'null' or 'nan' must stay strings. With pandas' default NA parsing they
# become NaN, which vocab.get_index maps to <unk> in x_valid and cannot handle
# at all in y_valid (a float is neither a str nor iterable).
valid_x_df = pd.read_csv(f'{COMPETITION_ROOT}/x_valid.csv', keep_default_na=False)
# Every column after the first one holds a context token.
tokens = valid_x_df.columns[1:]
valid_x = valid_x_df[tokens].apply(vocab.get_index).to_numpy(dtype='int32')
valid_y_df = pd.read_csv(f'{COMPETITION_ROOT}/y_valid.csv', keep_default_na=False)
valid_y = valid_y_df['token'].apply(vocab.get_index).to_numpy(dtype='int32')

In [16]:
# Adam with its default hyper-parameters over every model parameter.
optimizer = torch.optim.Adam(model.parameters())
# reduction='sum' makes each batch loss a sum over its samples; train() and
# validate() divide the accumulated total by the number of targets seen.
criterion = nn.CrossEntropyLoss(reduction='sum')

# Per-epoch accuracy history for the three evaluations performed below.
train_accuracy = []
wiki_accuracy = []
valid_accuracy = []
for epoch in range(params.epochs):
    # One pass over the training split (data[0] holds its inputs and targets).
    acc, loss = train(model, criterion, optimizer, data[0][0], data[0][1], params.batch_size, device, log=True)
    train_accuracy.append(acc)
    print(f'| epoch {epoch:03d} | train accuracy={acc:.1f}%, train loss={loss:.2f}')
    # Validation split of the preprocessed dataset (data[1]).
    acc, loss = validate(model, criterion, data[1][0], data[1][1], params.batch_size, device)
    wiki_accuracy.append(acc)
    print(f'| epoch {epoch:03d} | valid accuracy={acc:.1f}%, valid loss={loss:.2f} (wikipedia)')
    # Validation set read from x_valid.csv / y_valid.csv.
    acc, loss = validate(model, criterion, valid_x, valid_y, params.batch_size, device)
    valid_accuracy.append(acc)
    print(f'| epoch {epoch:03d} | valid accuracy={acc:.1f}%, valid loss={loss:.2f} (El Periódico)')

# Save model
# Only the state_dict (the parameter tensors) is stored, not the module object.
torch.save(model.state_dict(), params.modelname)

Train: wpb=1000, num_updates=200, accuracy=4.0, loss=9.63
Train: wpb=1000, num_updates=500, accuracy=6.5, loss=8.57
Train: wpb=1000, num_updates=1000, accuracy=9.1, loss=7.82
Train: wpb=1000, num_updates=2000, accuracy=12.5, loss=7.10
Train: wpb=1000, num_updates=3000, accuracy=14.8, loss=6.70
Train: wpb=1000, num_updates=4000, accuracy=16.6, loss=6.43
Train: wpb=1000, num_updates=5000, accuracy=18.0, loss=6.24
Train: wpb=1000, num_updates=6000, accuracy=19.1, loss=6.08
Train: wpb=1000, num_updates=7000, accuracy=20.1, loss=5.95
Train: wpb=1000, num_updates=8000, accuracy=20.9, loss=5.85
Train: wpb=1000, num_updates=9000, accuracy=21.7, loss=5.75
Train: wpb=1000, num_updates=10000, accuracy=22.3, loss=5.68
Train: wpb=1000, num_updates=11000, accuracy=22.9, loss=5.60
Train: wpb=1000, num_updates=12000, accuracy=23.4, loss=5.54
Train: wpb=1000, num_updates=13000, accuracy=23.9, loss=5.49
Train: wpb=1000, num_updates=14000, accuracy=24.4, loss=5.43
Train: wpb=1000, num_updates=15000, accu

In [17]:
# 'El Periodico' test dataset
# keep_default_na=False: every cell is a literal token, so strings such as
# 'NA', 'null' or 'nan' must stay strings instead of being parsed as NaN
# (a NaN would silently be looked up as an unknown token).
# NOTE: the frame keeps the name valid_x_df although it holds the test set,
# because the submission cell reads valid_x_df['id'].
valid_x_df = pd.read_csv(f'{COMPETITION_ROOT}/x_test.csv', keep_default_na=False)
test_x = valid_x_df[tokens].apply(vocab.get_index).to_numpy(dtype='int32')
# target=None switches validate() to prediction mode: it returns the predicted
# index of every sample and never calls the criterion.
y_pred = validate(model, None, test_x, None, params.batch_size, device)
# Map the predicted indices back to their token strings.
y_token = [vocab.idx2token[index] for index in y_pred]

In [18]:
# Two-column frame: the sample ids followed by the predicted token of each sample
submission = valid_x_df[['id']].assign(token=y_token)
print(submission.head())
# Written without the index column
submission.to_csv('submission.csv', index=False)

   id      token
0   0      <unk>
1   1      <unk>
2   2         de
3   3      <unk>
4   4  Assemblea
