In [1]:
USE_FAST = True

In [2]:
import time
import multiprocessing
import numpy as np
import random

random.seed(0)
np.random.seed(0)
if USE_FAST:
    from lib.fast.linear_algebra import Matrix
    from lib.fast.nn import Linear, ReLU, NN, Sigmoid, Embedding, Flatten, MaxPooling
else:
    from lib.original.linear_algebra import Matrix
    from lib.original.nn import Linear, ReLU, NN, Sigmoid, Embedding, Flatten, MaxPooling
from lib.tokenization import BPETokenizer
from lib.io import load_tokenizer, save_tokenizer
from lib.metrics.losses import binary_cross_entropy
from lib.metrics.evaluations import accuracy
from lib.gd_data_loaders import BatchDataLoader, StochasticDataLoader, MiniBatchDataLoader
from lib.optimizers import SgdOptimizer, SgdWithMomentumOptimizer, AdaGradOptimizer, RmsPropOptimizer, AdamOptimizer

In [3]:
# IMDB dataset having 50K movie reviews for natural language processing or Text analytics.
# This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing.
tokenizer_data = []
read_header = False
assert len("negative") == len("positive")
label_len = len("negative") + 1
with open("data/imdb_dataset.csv", "rt") as f:
    for line in f.readlines():
        if not read_header:
            read_header = True
            continue
        tokenizer_data.append([line[:-label_len - 1], line[-label_len:-1]])


In [4]:
# tokenizer = BPETokenizer(3000)
# tokenizer.fit([d[0] for d in tokenizer_data])
# save_tokenizer(tokenizer, "imdb_tokenizer_3K")

tokenizer = load_tokenizer("imdb_tokenizer_3K")

In [5]:
# Flipkart Customer Review and Rating
# This Dataset consists of reviews by customers on boAt Rockerz 400

data = []
read_header = False
with open("data/flipkart_reviews.csv", "rt") as f:
    for line in f.readlines():
        if not read_header:
            read_header = True
            continue
        data.append([line[:-12], int(int(line[-2])>=4)])

In [6]:
data = [d for d in data if d[1]==1][:500] + [d for d in data if d[1]==0][:500]

In [7]:
with multiprocessing.Pool() as pool:
    tokens_list = pool.map(tokenizer.encode, [d[0] for d in data])

In [8]:
labels = [d[1] for d in data]

In [9]:
max_len = 32
for tokens in tokens_list:
    del tokens[max_len:]

In [10]:
ill_formatted_indices = [] 
for i, tokens in enumerate(tokens_list):
    if any(isinstance(t, str) for t in tokens):
        ill_formatted_indices.append(i)

tokens_list = [t for i, t in enumerate(tokens_list) if i not in ill_formatted_indices]
labels = [l for i, l in enumerate(labels) if i not in ill_formatted_indices]

In [11]:
max_len = max([len(tokens) for tokens in tokens_list])
for tokens in tokens_list:
    tokens.extend([0] * (max_len - len(tokens)))

In [12]:
indices = random.sample(range(len(tokens_list)), int(len(tokens_list) * 0.8))
X_train, y_train = Matrix([t for i, t in enumerate(tokens_list) if i in indices]), Matrix([[l] for i, l in enumerate(labels) if i in indices])
X_test, y_test = Matrix([t for i, t in enumerate(tokens_list) if i not in indices]), Matrix([[l] for i, l in enumerate(labels) if i not in indices])
X_train.dims(), y_train.dims(), X_test.dims(), y_test.dims()

((771, 32), (771, 1), (193, 32), (193, 1))

In [13]:
vocab_size = len(tokenizer.vocab)
emb_size = 8

size_dim, seq_len = X_train.dims()

def init_nn():
    return NN(
        [
            Embedding(vocab_size, emb_size),
            Flatten(),
            Linear(seq_len * emb_size, 16, "uniform_glorot"),
            ReLU(),
            Linear(16, 16, "uniform_glorot"),
            ReLU(),
            Linear(16, 1, "uniform_glorot"),
            Sigmoid(),
        ]
    )

In [14]:
EPSILON = 1e-5
time_point = time.time()

data_loaders = [
    # BatchDataLoader(X_train, y_train),
    # StochasticDataLoader(X_train, y_train),
    MiniBatchDataLoader(X_train, y_train, 16)
]
optimizer_creators = [
    # lambda nn: SgdOptimizer(nn, 0.001),
    # lambda nn: SgdWithMomentumOptimizer(nn, 0.001, 0.9),
    # lambda nn: AdaGradOptimizer(nn, 0.001),
    lambda nn: RmsPropOptimizer(nn, 0.001, 0.95),
    lambda nn: AdamOptimizer(nn, 0.005, 0.95, 0.99),
]

for data_loader in data_loaders:
    for optimizer_creator in optimizer_creators:
        nn = init_nn()
        optimizer = optimizer_creator(nn)
        print(f"gradient descent: {data_loader.__class__} | optimizer: {optimizer.__class__}")
        for i in range(10):
            X_b, y_b = data_loader.get_batch()
            out = nn(X_b)
            loss = binary_cross_entropy(y_b, out)
        
            elapsed_time = int(time.time() - time_point)
            print(f"{i} | {loss.data:.2f} | {elapsed_time}s")
            time_point = time.time()

            optimizer.step(loss)
                
        train_out = nn(X_train) 
        train_loss = binary_cross_entropy(y_train, train_out)
        test_out = nn(X_test) 
        test_loss = binary_cross_entropy(y_test, test_out)
        test_acc = accuracy(y_test, test_out)
        print(f"train loss: {train_loss.data:.2f}   test loss: {test_loss.data:.2f}   test accuracy: {test_acc:.2f}") 
                

gradient descent: <class 'lib.gd_data_loaders.MiniBatchDataLoader'> | optimizer: <class 'lib.optimizers.RmsPropOptimizer'>
0 | 0.78 | 1s
1 | 0.73 | 3s
2 | 0.67 | 3s
3 | 0.78 | 3s
4 | 0.74 | 4s
5 | 0.62 | 3s
6 | 0.64 | 3s
7 | 0.66 | 3s
8 | 0.61 | 4s
9 | 0.73 | 3s
train loss: 0.63   test loss: 0.66   test accuracy: 0.58
gradient descent: <class 'lib.gd_data_loaders.MiniBatchDataLoader'> | optimizer: <class 'lib.optimizers.AdamOptimizer'>
0 | 0.86 | 190s
1 | 0.89 | 1s
2 | 0.84 | 1s
3 | 0.92 | 1s
4 | 0.76 | 1s
5 | 0.74 | 1s
6 | 0.89 | 1s
7 | 0.74 | 1s
8 | 0.73 | 1s
9 | 0.74 | 1s
train loss: 0.64   test loss: 0.69   test accuracy: 0.54


In [16]:
examples = [(tokenizer.decode([int(t.data) for t in xt]), round(o[0].data, 2), yt[0].data) for o, yt, xt in zip(out, y_test, X_test)]
examples[:5] + examples[-5:]


[('great looks n very light weight sound is very cool but design is ok ok. t',
  0.62,
  1.0),
 ('light weight and easy to carry anywhere and attractive model awesome service from flip',
  0.71,
  1.0),
 ('"The boat product is good excellent headphone to buy with Bluetooth without ',
  0.29,
  1.0),
 ('Product is good battery backup is also good. I recommend you should bu',
  0.48,
  1.0),
 ('"Firstly, Thanks to Flipkart...I got the headphones 4 days earlier...',
  0.8,
  1.0),
 ('"Got this headphone at just rs. 899 in offer i can say it\'s just a',
  0.46,
  1.0),
 ('"Battery Backup Is ""Ashvatthama"" Which Just Denies To Die , St',
  0.27,
  1.0),
 ('sound quality good enough considering that I got it for 1000. some might find ',
  0.46,
  1.0),
 ('Excellent jst awesome product... Love it at 999 it is best<SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR>',
  0.33,
  1.0),
 ('awesome product crystal clear sound with superb bass. I buy these for RS 999 ',
  0.51,
  1.0