In [1]:
import comet_ml
import os
import json
import multiprocessing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from ml_util.monitor.comet_ml_exp import initialize_comet_ml_experiment
from basic_transformer.models.basic_transformer import BasicTransformer
from basic_transformer import utils as local_util
# Single seed shared by every source of randomness used in this notebook.
RANDOM_SEED = 43
np.random.seed(RANDOM_SEED)
# NumPy's seed does not cover torch: weight initialisation and DataLoader
# shuffling draw from torch's own generator, so seed it as well.
torch.manual_seed(RANDOM_SEED)

Using TensorFlow backend.


In [2]:
# --- experiment identification ---
MODEL_VERSION = 'x.0.0'

# --- data selection ---
DATASET = 'yelp'  # 'yelp' or 'imdb'
N_SAMPLE_YELP = 50000  # rows sampled from the yelp frame
TEST_SIZE = 0.2

# --- model / training size ---
N_EPOCHS = 5
DIM = 128
NUM_WORDS = 5000
MAX_SEQ_LEN = 128

# --- dataset columns and label encoding ---
TEXT_COLUMN = 'review'
LABEL_COLUMN = 'sentiment'
LABEL_MAPPING = dict(negative=0, positive=1)

# --- dataloaders ---
BATCH_SIZE = 16
SHUFFLE = True
DEBUG_DF = False

# Get Data

In [3]:
# Load the selected corpus into `df` with a 'review' text column and a
# 'sentiment' column holding 'negative' / 'positive'.
if DATASET == 'yelp':
    df = pd.read_csv("/media/can/datasets/yelp/df.csv")
    # Only 1-star and 5-star reviews receive a sentiment label. Any other
    # rating would be left as a raw number in the label column, so such rows
    # are dropped before labelling (a no-op if the file holds only 1/5 stars).
    star_to_sentiment = {1: 'negative', 5: 'positive'}
    df = df[df['stars'].isin(list(star_to_sentiment))]
    if df.empty:
        raise ValueError("No 1-star or 5-star reviews found in the yelp data")
    df = df.assign(sentiment=df['stars'].map(star_to_sentiment))
    df = df.rename(columns={'text': 'review'})
    # Never request more rows than exist; sample() raises in that case.
    df = df.sample(n=min(N_SAMPLE_YELP, len(df)), random_state=RANDOM_SEED)
elif DATASET == 'imdb':
    # NOTE(review): assumes this CSV already has 'review' and 'sentiment'
    # columns, since no renaming is applied here - confirm against the file.
    df = pd.read_csv("/media/can/datasets/imdb-50k-movie-review/IMDB Dataset.csv")
else:
    raise ValueError("Invalid data: {}".format(str(DATASET)))

In [4]:
# train - test split
# Pin random_state so the split does not depend on how much of the global
# NumPy random stream was consumed before this cell ran.
df_train, df_test = train_test_split(df, test_size=TEST_SIZE, random_state=RANDOM_SEED)
# The full frame is no longer needed once both partitions exist.
del df
print("len(df_train):", len(df_train))
print("len(df_test):", len(df_test))

len(df_train): 40000
len(df_test): 10000


In [6]:
# Keyword arguments shared by the train and test TextDataset instances.
text_dataset_params = {
    'num_words': NUM_WORDS,
    'text_column': TEXT_COLUMN,
    'label_column': LABEL_COLUMN,
    'label_mapping': LABEL_MAPPING,
    'max_seq_len': MAX_SEQ_LEN,
}

# datasets: one per split, built from the same settings
# NOTE(review): the two datasets are constructed independently; confirm that
# TextDataset encodes text identically for both splits.
datagen_train = local_util.dataset_generator.TextDataset(df=df_train, **text_dataset_params)
datagen_test = local_util.dataset_generator.TextDataset(df=df_test, **text_dataset_params)

In [None]:
# Settings shared by both loaders; one worker process per CPU core.
dataloader_params = {
    'batch_size': BATCH_SIZE,
    'num_workers': multiprocessing.cpu_count(),
    'shuffle': SHUFFLE,
}

# dataloaders
dataloader_train = DataLoader(dataset=datagen_train, **dataloader_params)
# The test split is iterated in a fixed order so evaluation batches are
# deterministic; only the shuffle setting differs from the training loader.
dataloader_test = DataLoader(dataset=datagen_test, **{**dataloader_params, 'shuffle': False})

# Train

In [None]:
# get model: DIM is used both as the model width and as the embedding size,
# and the embedding table has NUM_WORDS entries.
model = BasicTransformer(
    dim=DIM,
    num_embeddings=NUM_WORDS,
    embedding_dim=DIM,
)

In [None]:
# Move the model to the GPU when one is available; otherwise keep it on the
# CPU instead of failing on machines without CUDA.
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

In [None]:
# loss and optimizer
# Binary cross-entropy on the model output, optimised with Adam at its
# default settings. (An SGD variant with lr=0.001, momentum=0.9 was the
# alternative considered here.)
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters())

In [None]:
# initialize comet experiment
# The helper returns two values: the experiment handle and a link to it.
COMET_ML_EXPERIMENT, COMET_ML_EXPERIMENT_LINK = initialize_comet_ml_experiment(
    model_name=local_util.config.PROJECT_NAME,
    model_version=MODEL_VERSION,
)

In [None]:
# Training loop: one optimizer step per mini-batch of the training loader,
# with per-batch loss/accuracy and a periodic running-average loss sent to
# the Comet experiment.
losses = list()      # loss of every processed batch, in order
running_loss = 0.0   # sum of batch losses since the last running-average log
every = 200          # number of batches between running-average logs

# Put the batches wherever the model lives instead of hard-coding CUDA.
device = next(model.parameters()).device
model.train()

with COMET_ML_EXPERIMENT.train():
    for epoch in range(N_EPOCHS):
        # Start each epoch with a fresh running sum.
        running_loss = 0.0
        for i, x in tqdm_notebook(enumerate(dataloader_train), total=len(dataloader_train)):
            inputs = x['seq'].to(device)
            labels = x['label'].float().to(device)

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            # Forward pass, loss, backward pass, parameter update.
            outputs = model(inputs)
            # BCELoss requires prediction and target to have the same shape.
            # NOTE(review): assumes the model emits one probability per
            # sample (e.g. shape (batch, 1) or (batch,)) - confirm.
            loss = criterion(outputs.reshape(labels.shape), labels)
            loss.backward()
            optimizer.step()

            # acc: threshold the outputs at 0.5 and compare with the labels.
            # Built-in int replaces the np.int alias removed from NumPy, and
            # both arrays are flattened so a batch of one still compares
            # element-wise.
            y_true = labels.detach().cpu().numpy().astype(int).reshape(-1)
            y_pred = (outputs.detach() > 0.5).cpu().numpy().astype(int).reshape(-1)
            acc = float(np.mean(y_true == y_pred))

            # Keep a plain float, not the loss tensor, for logging and history.
            batch_loss = loss.item()
            losses.append(batch_loss)
            running_loss += batch_loss

            # log
            COMET_ML_EXPERIMENT.log_metric("batch_loss", batch_loss)
            COMET_ML_EXPERIMENT.log_metric("batch_acc", acc)
            if (i + 1) % every == 0:
                COMET_ML_EXPERIMENT.log_metric("running_loss", running_loss / every)
                running_loss = 0.0

In [None]:
# Close the Comet experiment once training is done.
COMET_ML_EXPERIMENT.end()