In [1]:
import time
import multiprocessing
import numpy as np
import random

# Fix the seeds of both random number generators so the notebook is reproducible.
# `random` drives the train/test split further down; NumPy's global RNG is
# presumably used by the `lib` weight initialisers -- TODO confirm.
random.seed(0)
np.random.seed(0)

from lib.linear_algebra import Matrix
from lib.nn import Linear, ReLU, NN, Sigmoid, Embedding, Flatten, MaxPooling
from lib.tokenization import BPETokenizer
from lib.io import load_tokenizer, save_tokenizer

In [2]:
# IMDB dataset of 50K movie reviews for natural language processing / text analytics.
# It is a binary sentiment classification dataset: 25,000 highly polar movie reviews
# for training and 25,000 for testing.
#
# Each data row is parsed positionally as "<review text>,<label>", where <label> is
# "positive" or "negative" (both have the same length, which the assert guards).
# Result: `tokenizer_data` is a list of [review_text, label] pairs.
tokenizer_data = []
assert len("negative") == len("positive")
label_len = len("negative") + 1  # label plus the comma that precedes it
# NOTE(review): the file is opened with the platform default encoding, as before.
with open("data/imdb_dataset.csv", "rt") as f:
    next(f, None)  # skip the header row (no-op on an empty file)
    for line in f:  # stream the file instead of loading it all with readlines()
        # Strip the line terminator explicitly so that a final row without a
        # trailing newline is sliced the same way as every other row.
        line = line.rstrip("\n")
        if not line:
            continue  # ignore blank lines
        tokenizer_data.append([line[:-label_len], line[-(label_len - 1):]])


In [3]:
# One-off tokenizer training, kept for reference only: uncomment these three lines
# to refit a BPE tokenizer (constructed with 3000) on the IMDB review texts in
# `tokenizer_data` and save it under the name "imdb_tokenizer_3K".
# tokenizer = BPETokenizer(3000)
# tokenizer.fit([d[0] for d in tokenizer_data])
# save_tokenizer(tokenizer, "imdb_tokenizer_3K")

# Normal path: load the previously saved tokenizer instead of refitting it.
tokenizer = load_tokenizer("imdb_tokenizer_3K")

In [4]:
# Flipkart Customer Review and Rating
# This dataset consists of customer reviews of the boAt Rockerz 400.
#
# Rows are parsed positionally: the last character of a row is taken as the rating
# digit and the final 11 characters are dropped from the review text.
# NOTE(review): the 11-character tail is assumed to be a fixed-width marker plus the
# ",<rating>" field -- confirm against the CSV.
# Result: `data` is a list of [review_text, label] pairs with label 1 for ratings
# >= 4 and 0 otherwise.
data = []
with open("data/flipkart_reviews.csv", "rt") as f:
    next(f, None)  # skip the header row (no-op on an empty file)
    for line in f:  # stream the file instead of loading it all with readlines()
        # Strip the line terminator explicitly so that a final row without a
        # trailing newline is parsed the same way as every other row.
        line = line.rstrip("\n")
        if not line:
            continue  # blank lines carry no rating digit and used to raise IndexError
        data.append([line[:-11], int(int(line[-1]) >= 4)])

In [5]:
# Balance the classes: keep at most the first 500 positive reviews followed by
# at most the first 500 negative ones (original order preserved within each class).
data = [
    *[row for row in data if row[1] == 1][:500],
    *[row for row in data if row[1] == 0][:500],
]

In [6]:
# Encode every review text in parallel worker processes; `map` returns the
# token sequences in the same order as `data`.
with multiprocessing.Pool() as workers:
    tokens_list = workers.map(tokenizer.encode, [text for text, _ in data])

In [7]:
# Binary targets, aligned index-for-index with `tokens_list`.
labels = [label for _, label in data]

In [8]:
# Cap every token sequence at `max_len` entries, modifying the lists in place.
max_len = 32
for tokens in tokens_list:
    tokens[max_len:] = []

In [9]:
# Drop samples whose encoded sequence still contains a raw string,
# together with their labels, so both lists stay aligned.
ill_formatted_indices = [
    i
    for i, tokens in enumerate(tokens_list)
    if any(isinstance(t, str) for t in tokens)
]

# Set lookups keep the two filters below linear; the list above is kept as-is
# for anyone who wants to inspect which samples were removed.
_ill_formatted = set(ill_formatted_indices)
tokens_list = [t for i, t in enumerate(tokens_list) if i not in _ill_formatted]
labels = [l for i, l in enumerate(labels) if i not in _ill_formatted]

# Sanity check: features and targets must still line up.
assert len(tokens_list) == len(labels)

In [10]:
# Right-pad every sequence with token id 0 up to the longest remaining sequence,
# so that all rows end up with the same length. Lists are extended in place.
max_len = max(map(len, tokens_list))
for tokens in tokens_list:
    tokens.extend(0 for _ in range(max_len - len(tokens)))

In [11]:
# Random 80/20 train/test split. `indices` holds the sampled training positions;
# every other position goes to the test set. The sampling call is unchanged, so
# the split is identical for a given seed.
indices = random.sample(range(len(tokens_list)), int(len(tokens_list) * 0.8))
# Membership tests against a list are linear; a set keeps the four filters below
# at O(n) overall instead of O(n^2).
train_indices = set(indices)

X_train = Matrix([t for i, t in enumerate(tokens_list) if i in train_indices])
y_train = Matrix([[l] for i, l in enumerate(labels) if i in train_indices])
X_test = Matrix([t for i, t in enumerate(tokens_list) if i not in train_indices])
y_test = Matrix([[l] for i, l in enumerate(labels) if i not in train_indices])
X_train.dims(), y_train.dims(), X_test.dims(), y_test.dims()

((771, 32), (771, 1), (193, 32), (193, 1))

In [12]:
# Model: token embedding -> flatten -> two hidden layers of 16 units with ReLU
# -> single sigmoid output for binary classification.
vocab_size = len(tokenizer.vocab)
emb_size = 8

# dims() yields (number of rows, number of columns); only the column count,
# i.e. the padded sequence length, is needed to size the first Linear layer.
size_dim, seq_len = X_train.dims()

# Layers are constructed in the same order as they are applied.
layers = [
    Embedding(vocab_size, emb_size),
    Flatten(),
    Linear(seq_len * emb_size, 16, "uniform_glorot"),
    ReLU(),
    Linear(16, 16, "uniform_glorot"),
    ReLU(),
    Linear(16, 1, "uniform_glorot"),
    Sigmoid(),
]
nn = NN(layers)

In [13]:
# Training: 10 steps of full-batch gradient descent (step size 0.1) on the
# mean binary cross-entropy between the network output and `y_train`.
# NOTE(review): EPSILON is defined but never used; the logarithms below are
# taken without clamping -- confirm whether it was meant to guard ln(0).
EPSILON = 1e-5
time_point = time.time()

for i in range(10):
    # Forward pass over the whole training set.
    out = nn(X_train)
    log_likelihood = y_train * out.ln() + (1 - y_train) * (1 - out).ln()
    loss = -log_likelihood.row_sum().col_sum()[0] / y_train.dims()[0]

    # The reported time spans from the previous report to the end of this forward
    # pass, so from step 1 on it also includes the previous backward pass and update.
    elapsed_time = int(time.time() - time_point)
    print(f"{i} | {loss.data:.2f} | {elapsed_time}s")
    time_point = time.time()

    # Backward pass: clear stale gradients, seed the loss gradient, propagate.
    for param in nn.params():
        param.zero_grad()
    loss.grad = 1
    loss.backward()

    # Plain gradient-descent update of every scalar value of every parameter.
    for param in nn.params():
        for value in param.all_values():
            value.data -= 0.1 * value.grad

0 | 0.76 | 138s
1 | 0.69 | 544s
2 | 0.67 | 790s
3 | 0.66 | 746s
4 | 0.65 | 841s
5 | 0.64 | 685s
6 | 0.64 | 734s
7 | 0.63 | 811s
8 | 0.63 | 1163s
9 | 0.62 | 986s


In [14]:
# Mean binary cross-entropy on the held-out test set, using the same formula
# as the training loss.
out = nn(X_test)
n_test = y_test.dims()[0]
log_likelihood = y_test * out.ln() + (1 - y_test) * (1 - out).ln()
loss = -log_likelihood.row_sum().col_sum()[0] / n_test
loss

{aab0614a, 0.64, 0}

In [15]:
# Test accuracy: a prediction counts as correct when the sigmoid output,
# rounded to the nearest integer, equals the target label.
n_correct = sum(
    round(pred[0].data) == target[0].data
    for pred, target in zip(out, y_test)
)
accuracy = n_correct / y_test.dims()[0]
accuracy

0.5906735751295337

In [36]:
# Decoded test reviews as (text, predicted probability rounded to 2 decimals,
# true label); show the first five and the last five.
examples = [
    (
        tokenizer.decode([token.data for token in row]),
        round(pred[0].data, 2),
        target[0].data,
    )
    for pred, target, row in zip(out, y_test, X_test)
]
examples[:5] + examples[-5:]


[('great looks n very light weight sound is very cool but design is ok ok. t',
  np.float64(0.34),
  1),
 ('light weight and easy to carry anywhere and attractive model awesome service from flip',
  np.float64(0.57),
  1),
 ('"The boat product is good excellent headphone to buy with Bluetooth without ',
  np.float64(0.54),
  1),
 ('Product is good battery backup is also good. I recommend you should bu',
  np.float64(0.63),
  1),
 ('"Firstly, Thanks to Flipkart...I got the headphones 4 days earlier...',
  np.float64(0.6),
  1),
 ('Best<SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR>',
  np.float64(0.15),
  0),
 ('Its too painfull on ears<SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARATOR><SEPARAT