In [1]:
import comet_ml
import os
import json
import multiprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm, tqdm_notebook
import keras
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from ml_util.monitor.comet_ml_exp import initialize_comet_ml_experiment
from basic_transformer.models.basic_transformer import BasicTransformer
from basic_transformer import utils as local_util
# Seed every RNG this notebook draws from so a fresh "Restart & Run All" is repeatable.
RANDOM_SEED = 43
np.random.seed(RANDOM_SEED)  # NumPy global state (pandas `.sample()` without random_state uses it too)
torch.manual_seed(RANDOM_SEED)  # PyTorch default generator: layer initialisation, DataLoader shuffling

Using TensorFlow backend.


In [2]:
# --- run identity ---
MODEL_VERSION = "x.0.0"

# --- data selection and schema ---
DATASET = "yelp"  # 'yelp' or 'imdb'
N_SAMPLE_YELP = 10_000  # rows drawn from the yelp file
TEXT_COLUMN = "review"
LABEL_COLUMN = "sentiment"
LABEL_MAPPING = dict(negative=0, positive=1)

# --- vocabulary / sequence / model sizes ---
NUM_WORDS = 50_000
MAX_SEQ_LEN = 128
DIM = 128

# --- dataloaders ---
BATCH_SIZE = 16
SHUFFLE = True

# --- debugging: True swaps the real data for a synthetic frame ---
DEBUG_DF = False

In [3]:
# Instantiate the project's BasicTransformer; DIM is passed as both `dim` and
# `embedding_dim`, and NUM_WORDS as the embedding table size.
model = BasicTransformer(
    num_embeddings=NUM_WORDS,
    embedding_dim=DIM,
    dim=DIM,
)

In [4]:
# Load the raw reviews for the selected dataset into `df`, with the text under
# `review` and the label under `sentiment`.
if DATASET == 'yelp':
    star_to_sentiment = {1: 'negative', 5: 'positive'}
    df = pd.read_csv("/media/can/datasets/yelp/df.csv")
    # Keep only ratings that carry a sentiment label. A plain `.replace()` would leave
    # every other star value in `sentiment` as a raw number, for which LABEL_MAPPING
    # has no key. NOTE(review): if df.csv is already restricted to 1/5 stars this
    # filter is a no-op and the sampled rows are unchanged — confirm against the file.
    df = df[df['stars'].isin(list(star_to_sentiment))]
    df = df.assign(sentiment=df['stars'].map(star_to_sentiment))
    df = df.rename(columns={'text': 'review'})
    # Cap the request so a small (or heavily filtered) file does not make `.sample()` raise.
    df = df.sample(n=min(N_SAMPLE_YELP, len(df)), random_state=RANDOM_SEED)
elif DATASET == 'imdb':
    # NOTE(review): assumes this CSV already ships `review` / `sentiment` columns — confirm.
    df = pd.read_csv("/media/can/datasets/imdb-50k-movie-review/IMDB Dataset.csv")
else:
    raise ValueError("Invalid data: {}".format(str(DATASET)))

In [5]:
# Pull three random reviews of each sentiment into plain Python lists
# (positive first, so the global RNG is consumed in the same order).
txts_positive = df.loc[df['sentiment'] == 'positive', 'review'].sample(n=3).tolist()
txts_negative = df.loc[df['sentiment'] == 'negative', 'review'].sample(n=3).tolist()

In [6]:
# Space-delimited piece count per review (n spaces -> n + 1 pieces), then its average.
lens = df['review'].map(lambda text: text.count(' ') + 1)
lens.mean()

116.053

In [7]:
# When DEBUG_DF is set, replace `df` with a synthetic frame: 5000 rows of
# "good good ..." labelled positive followed by 5000 rows of "bad bad ..." labelled negative.
if DEBUG_DF:
    n_positive = n_negative = 5000
    positive_label = 'positive'
    negative_label = 'negative'
    positive_text = ('good ' * 10).rstrip()
    negative_text = ('bad ' * 10).rstrip()
    df = pd.DataFrame({
        'review': [positive_text] * n_positive + [negative_text] * n_negative,
        'sentiment': [positive_label] * n_positive + [negative_label] * n_negative,
    })

In [8]:
# Wrap the review frame in the project's TextDataset, configured from the notebook constants.
datagen = local_util.dataset_generator.TextDataset(
    df=df,
    text_column=TEXT_COLUMN,
    label_column=LABEL_COLUMN,
    label_mapping=LABEL_MAPPING,
    num_words=NUM_WORDS,
    max_seq_len=MAX_SEQ_LEN,
)

In [None]:
# Batch the dataset; the worker count is whatever multiprocessing.cpu_count() reports.
dataloader = DataLoader(
    datagen,
    batch_size=BATCH_SIZE,
    shuffle=SHUFFLE,
    num_workers=multiprocessing.cpu_count(),
)

In [None]:
# Move the model's parameters to the current CUDA device; as the cell's last
# expression, the returned module is also displayed.
model.to('cuda')

BasicTransformer(
  (embed_layer): Embedding(50001, 128)
  (linear): Linear(in_features=128, out_features=128, bias=True)
  (linear_clf): Linear(in_features=128, out_features=1, bias=True)
)

In [None]:
# Binary cross-entropy loss, optimised with Adam at its default hyper-parameters.
# Alternative kept for reference:
#   optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters())

In [None]:
# Start a Comet.ml run for this model version. The helper returns a pair, stored here
# as the experiment handle and (going by its name) a link to the experiment.
(COMET_ML_EXPERIMENT, COMET_ML_EXPERIMENT_LINK) = initialize_comet_ml_experiment(
    model_name=local_util.config.PROJECT_NAME,
    model_version=MODEL_VERSION,
)

COMET INFO: old comet version (2.0.11) detected. current: 2.0.13 please update your comet lib with command: `pip install --no-cache-dir --upgrade comet_ml`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/a-c-ozbek/basic-transformer-clf-from-scratch/23f0aff35daa4de8b8131d96cdbf0c25

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


In [None]:
# Train for one epoch, logging per-batch loss and accuracy to Comet.
# NOTE(review): `losses`, `running_loss` and `every` are initialised here but never
# updated in this cell; they are kept in case other cells read them.
losses = list()
running_loss = 0.0
every = 200
# Follow whatever device the model already lives on instead of hard-coding CUDA per batch.
device = next(model.parameters()).device
with COMET_ML_EXPERIMENT.train():
    for epoch in range(1):
        for i, x in tqdm_notebook(enumerate(dataloader), total=len(dataloader)):
            inputs = x['seq'].to(device)
            labels = x['label'].float().to(device)

            # clear gradients left over from the previous step
            optimizer.zero_grad()

            # forward / backward / parameter update
            # BCELoss needs prediction and target of identical shape. The classifier head
            # has out_features=1, so the raw output is presumably (batch, 1) while the
            # labels are (batch,) — align them explicitly (TODO confirm model output shape).
            outputs = model(inputs).reshape(labels.shape)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # batch accuracy at a 0.5 decision threshold
            # builtin `int` replaces the `np.int` alias, which NumPy 1.24 removed
            y_true = labels.cpu().numpy().astype(int)
            y_pred = (outputs.detach() > 0.5).cpu().numpy().astype(int)
            acc = np.mean(y_true == y_pred)

            # log plain scalars, not a tensor that still references the autograd graph
            COMET_ML_EXPERIMENT.log_metric("batch_loss", loss.item())
            COMET_ML_EXPERIMENT.log_metric("batch_acc", acc)

HBox(children=(IntProgress(value=0, max=625), HTML(value='')))