In [1]:
import comet_ml
import os
import json
import multiprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm, tqdm_notebook
import keras
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from ml_util.monitor.comet_ml_exp import initialize_comet_ml_experiment
from basic_transformer.models.basic_transformer import BasicTransformer
from basic_transformer import utils as local_util
RANDOM_SEED = 43
np.random.seed(RANDOM_SEED)

Using TensorFlow backend.


In [2]:
DATASET = 'yelp'  # 'yelp' or 'imdb'
DIM = 128
NUM_WORDS = 5_000
TEXT_COLUMN = 'review'
LABEL_COLUMN = 'sentiment'
LABEL_MAPPING = {'negative': 0, 'positive': 1}
MAX_SEQ_LEN = 128
# dataloaders
BATCH_SIZE = 16
SHUFFLE = True
DEBUG_DF = False

In [3]:
# get model
model = BasicTransformer(dim=DIM, num_embeddings=NUM_WORDS, embedding_dim=DIM)

In [4]:
if DATASET == 'yelp':
    df = pd.read_csv("/media/can/MyData/datasets/yelp/df.csv")
    df['sentiment'] = df['stars'].replace({1: 'negative', 5: 'positive'})
    df = df.rename(columns={'text': 'review'})
    df = df.sample(n=50_000, random_state=RANDOM_SEED)
elif DATASET == 'imdb':
    df = pd.read_csv("/media/can/MyData/datasets/imdb-50k-movie-review/IMDB Dataset.csv")
else:
    raise ValueError("Invalid data: {}".format(str(DATASET)))

In [5]:
txts_positive = df[df['sentiment'] == 'positive']['review'].sample(3).tolist()
txts_negative = df[df['sentiment'] == 'negative']['review'].sample(3).tolist()

In [6]:
lens = df['review'].apply(lambda x: len(x.split(' ')))
lens.mean()

116.26696

In [7]:
# generate a fake df for debugging
if DEBUG_DF:
    n_positive, n_negative = 5000, 5000
    positive_label, negative_label = 'positive', 'negative'
    positive_text = ' '.join(['good'] * 10)
    negative_text = ' '.join(['bad'] * 10)
    df = [(positive_text, positive_label)] * n_positive + [(negative_text, negative_label)] * n_negative
    df = pd.DataFrame(df)
    df.columns = ('review', 'sentiment')
    df

In [8]:
datagen = local_util.dataset_generator.TextDataset(df=df, 
                                                   num_words=NUM_WORDS, 
                                                   text_column=TEXT_COLUMN, 
                                                   label_column=LABEL_COLUMN, 
                                                   label_mapping=LABEL_MAPPING, 
                                                   max_seq_len=MAX_SEQ_LEN)

In [9]:
dataloader = DataLoader(dataset=datagen,  
                        batch_size=BATCH_SIZE, 
                        num_workers=multiprocessing.cpu_count(), 
                        shuffle=SHUFFLE)

In [10]:
model.cuda()

BasicTransformer(
  (embed_layer): Embedding(5001, 128)
  (linear): Linear(in_features=128, out_features=128, bias=True)
  (linear_clf): Linear(in_features=128, out_features=1, bias=True)
)

In [11]:
# loss and optimizer
criterion = nn.BCELoss()
# optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.Adam(model.parameters())

In [12]:
# initialize comet experiment
COMET_ML_EXPERIMENT, \
COMET_ML_EXPERIMENT_LINK = initialize_comet_ml_experiment(model_name=local_util.config.PROJECT_NAME, 
                                                          model_version='x.0.0')

COMET INFO: old comet version (2.0.11) detected. current: 2.0.13 please update your comet lib with command: `pip install --no-cache-dir --upgrade comet_ml`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/a-c-ozbek/basic-transformer-clf-from-scratch/7da3431c065a48cfad4b9e06bf60202a

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


[1,   200] loss: 0.696
[1,   400] loss: 0.646
[1,   600] loss: 0.583
[1,   800] loss: 0.577
[1,  1000] loss: 0.552
[1,  1200] loss: 0.517
[1,  1400] loss: 0.526
[1,  1600] loss: 0.513
[1,  1800] loss: 0.504
[1,  2000] loss: 0.478
[1,  2200] loss: 0.483
[1,  2400] loss: 0.483
[1,  2600] loss: 0.490
[1,  2800] loss: 0.473
[1,  3000] loss: 0.466

[2,   200] loss: 0.719
[2,   400] loss: 0.460
[2,   600] loss: 0.426
[2,   800] loss: 0.428
[2,  1000] loss: 0.407
[2,  1200] loss: 0.426
[2,  1400] loss: 0.412
[2,  1600] loss: 0.399
[2,  1800] loss: 0.392
[2,  2000] loss: 0.421
[2,  2200] loss: 0.418
[2,  2400] loss: 0.413
[2,  2600] loss: 0.409
[2,  2800] loss: 0.408
[2,  3000] loss: 0.420



In [13]:
losses = list()
running_loss = 0.0
every = 200
# with COMET_ML_EXPERIMENT.train():
for epoch in range(2):
    for i, x in tqdm_notebook(enumerate(dataloader), total=len(dataloader)):
        inputs = x['seq']
        labels = x['label'].float()

        inputs = inputs.cuda()
        labels = labels.cuda()

        #
        optimizer.zero_grad()

        # 
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

#             if i % 2 == 0:
#                 print('loss:', loss)

        # print statistics
        running_loss += loss.item()
        if i % every == every - 1:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / every))
            running_loss = 0.0

#             COMET_ML_EXPERIMENT.log_metric("loss", loss.item())

HBox(children=(IntProgress(value=0, max=3125), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3125), HTML(value='')))

In [None]:
import numpy as np
import torch

In [None]:
emb_layer = torch.nn.Embedding(num_embeddings=17, embedding_dim=8)

In [None]:
# emb_layer(torch.tensor([0, 0, 10, 16, 17]))