In [102]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import *

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,KFold

from gensim.models import Word2Vec
from tqdm.cli import tqdm
from concurrent.futures import ThreadPoolExecutor

%matplotlib inline

In [2]:
# Read the review dataset from the notebook's working directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="./IMDB Dataset.csv")
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Encode the label as 1 (positive) / 0 (anything else).
# `isin` also accepts an already-encoded 1, so re-running this cell is a no-op.
# The previous `x == 'positive'` comparison turned every label into 0 when the
# cell was executed a second time on the already-encoded column.
df['sentiment'] = df['sentiment'].isin(['positive', 1]).astype('int64')

In [4]:
# Raw strings keep the regex escapes (`\s`, `\-`, `\/`) out of Python's own
# string-escape processing, which warns about them in non-raw literals.

# Punctuation that is simply deleted from the text.
SYMBOL_FILTER = re.compile(r"""[!,."':?()]""")
# Hyphens and slashes are deleted as well; the doubled-<br> alternative is kept
# so this pattern still matches exactly what it matched before.
SPECIAL_CHAR_FILTER = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
# Any run of HTML line breaks (single or repeated) acts as a word separator.
BR_TAG_FILTER = re.compile(r"(?:<br\s*/?>\s*)+", re.IGNORECASE)


def clean_review(text):
    """Normalise one raw review into lower-case, space-separated words.

    Steps, in order:
      1. every run of ``<br />`` tags becomes a single space, so the words on
         either side of the tag are not fused together and a lone ``<br />``
         does not survive as ``<br >`` debris;
      2. punctuation in ``SYMBOL_FILTER`` is removed;
      3. hyphens and slashes in ``SPECIAL_CHAR_FILTER`` are removed;
      4. the text is lower-cased and all whitespace runs collapse to one space
         (leading/trailing whitespace is dropped).

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned review.
    """
    text = BR_TAG_FILTER.sub(" ", text)
    text = SYMBOL_FILTER.sub("", text)
    text = SPECIAL_CHAR_FILTER.sub("", text)
    return " ".join(text.lower().split())


df['review'] = df.review.apply(clean_review)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [5]:
# Tokenise each review on whitespace.
# `str.split()` with no argument treats any whitespace run as one separator and
# never produces empty-string tokens, whereas `split(" ")` emitted '' for every
# repeated, leading or trailing space and did not split on tabs/newlines.
sentences = [review.split() for review in df.review]

In [20]:
# Collect the distinct tokens across all sentences.
# Accumulating straight into a set avoids first building one list holding every
# token occurrence in the corpus; sorting makes the resulting list order
# deterministic (iteration order of a set of strings is not).
unique_tokens = set()
for sent in tqdm(sentences):
    unique_tokens.update(sent)

vocab = sorted(unique_tokens)


  0%|          | 0/50000 [00:00<?, ?it/s][A
 45%|████▍     | 22367/50000 [00:00<00:00, 223613.70it/s][A
100%|██████████| 50000/50000 [00:00<00:00, 242443.92it/s][A


In [24]:
# Fit Word2Vec on the tokenised reviews with 92-dimensional vectors and keep
# only the keyed word vectors (`.wv`) for lookups.
# NOTE(review): the `size` keyword is version-dependent in gensim — confirm it
# matches the installed release.
w2v = Word2Vec(sentences=sentences, size=92).wv

In [91]:
# All-zero 92-d vector, returned by `vector` when a word lookup fails.
dummy = np.zeros(92)
# Token count of the longest sentence; `pad` pads every sentence to this length.
max_len = max(map(len, sentences))

def vector(w):
    """Look up the embedding for word `w`.

    Returns ``w2v[w]`` when the lookup succeeds and the module-level ``dummy``
    vector when the word is missing (``KeyError``). Only ``KeyError`` is
    handled: any other failure (including ``KeyboardInterrupt``) propagates
    instead of being silently turned into a zero vector.
    """
    try:
        return w2v[w]
    except KeyError:
        return dummy

def pad(sent):
    """Embed a token sequence and zero-pad it to ``(max_len, 92)``.

    Each token is mapped through ``vector``; rows after the last token stay
    zero. The result is always a float64 array of shape ``(max_len, 92)``:

    * an empty ``sent`` yields an all-zero matrix (previously this raised a
      shape-mismatch error inside ``np.concatenate``);
    * a ``sent`` longer than ``max_len`` is truncated to its first ``max_len``
      tokens (previously this raised because of a negative padding size).

    Parameters
    ----------
    sent : sequence of str
        Tokens of one sentence.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, 92)``.
    """
    padded = np.zeros((max_len, 92))
    tokens = sent[:max_len]
    # Guard the empty case: an empty list converts to a shape-(0,) array that
    # cannot be assigned into a (0, 92) slice.
    if len(tokens):
        padded[:len(tokens)] = np.array([vector(w) for w in tokens])
    return padded

In [121]:
class Dataset:
    """Endless mini-batch generator over a collection of samples.

    Parameters
    ----------
    x : sequence
        Samples; batches are drawn from it by index.
    y : sequence
        Labels. They are stored but not yielded by the generator.
        NOTE(review): the embedding step that would fill ``self.dummy`` via
        ``_set_vect`` is not wired into ``_flow`` — batches currently contain
        the raw ``x`` items only; confirm whether (features, labels) tuples
        are intended.
    max_len : int
        Second dimension of the scratch buffer ``self.dummy``.
    vector_size : int
        Third dimension of the scratch buffer ``self.dummy``.
    shuffle : bool
        When true, sample order is re-permuted at the start of every pass.
    """

    def __init__(self,x,y,max_len=max_len,vector_size=92,shuffle=True):
        self.x = x
        self.y = y

        self.shuffle = shuffle
        self.max_len = max_len
        self.vector_size = vector_size

    def __len__(self,):
        """Number of samples in ``x``."""
        return len(self.x)

    def _set_vect(self,args):
        """Write the padded embedding of one sentence into ``self.dummy``.

        ``args`` is an ``(index, sentence)`` pair; the buffer must already
        have been allocated by ``_flow``. Always returns 1.
        """
        idx,sent = args
        self.dummy[idx] = pad(sent)
        return 1

    def _take(self, batch):
        """Return the items of ``x`` at the integer positions in ``batch``."""
        # Plain Python sequences cannot be indexed with an index array, so
        # gather the items one by one; array-like containers keep their own
        # fancy-indexing behaviour.
        if isinstance(self.x, (list, tuple)):
            return [self.x[int(i)] for i in batch]
        return self.x[batch]

    def _flow(self, batch_size=None):
        """Yield batches of ``x`` forever, ``batch_size`` items at a time.

        Samples that do not fill a complete batch are dropped on each pass.
        ``batch_size`` defaults to ``self.batch_size`` (set by ``get_flow``).

        Raises
        ------
        ValueError
            If not even one full batch can be formed.
        """
        if batch_size is None:
            batch_size = self.batch_size
        n_batches = len(self) // batch_size
        if n_batches == 0:
            # Without this check the loop below would spin forever without
            # ever yielding.
            raise ValueError("batch_size is larger than the dataset")
        # Count the usable samples explicitly: slicing with `[:-remainder]`
        # returns an empty array when the remainder is 0.
        usable = n_batches * batch_size
        # Scratch buffer that `_set_vect` writes into.
        self.dummy = np.zeros((batch_size,self.max_len,self.vector_size))
        while True:
            idx = np.random.permutation(len(self)) if self.shuffle else np.arange(0,len(self))
            for batch in idx[:usable].reshape(n_batches,batch_size):
                yield self._take(batch)

    def get_flow(self,batch_size):
        """Create the batch generator.

        Returns
        -------
        tuple
            ``(generator, steps)`` where ``steps`` is the integer number of
            full batches per pass (``len(self) // batch_size``), matching what
            the generator actually produces.

        Raises
        ------
        ValueError
            If ``batch_size`` is not positive or exceeds the dataset size.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        if batch_size > len(self):
            raise ValueError("batch_size is larger than the dataset")
        self.batch_size = batch_size
        return self._flow(batch_size),len(self) // batch_size

In [122]:
# Wrap the tokenised reviews and their labels in a Dataset.
ds = Dataset(x=sentences, y=df.sentiment)

In [127]:
# Batch generator and its steps-per-epoch value for batches of 32.
fl, spe = ds.get_flow(batch_size=32)

In [133]:
# Preview: right-pad the first ten token lists with '<BLANK>' up to max_len.
[tokens + ['<BLANK>'] * (max_len - len(tokens)) for tokens in sentences[:10]]

array([['one', 'of', 'the', ..., '<BLANK>', '<BLANK>', '<BLANK>'],
       ['a', 'wonderful', 'little', ..., '<BLANK>', '<BLANK>', '<BLANK>'],
       ['i', 'thought', 'this', ..., '<BLANK>', '<BLANK>', '<BLANK>'],
       ...,
       ['this', 'show', 'was', ..., '<BLANK>', '<BLANK>', '<BLANK>'],
       ['encouraged', 'by', 'the', ..., '<BLANK>', '<BLANK>', '<BLANK>'],
       ['if', 'you', 'like', ..., '<BLANK>', '<BLANK>', '<BLANK>']],
      dtype='<U13')

In [54]:
# Hold-out split: X/Y are the training features/labels, x/y the test ones.
# A fixed random_state makes the split (and every metric computed from it)
# reproducible across kernel restarts.
# NOTE(review): `feat` is not defined in any visible cell — presumably a
# feature matrix built in a removed cell; confirm before Restart & Run All.
X,x,Y,y = train_test_split(feat,df.sentiment.values,random_state=42)

In [95]:
# Input width equals the vocabulary size of `cvt`.
# NOTE(review): `cvt` is not defined in any visible cell — presumably a fitted
# CountVectorizer; confirm.
in_tensor = Input(shape=(len(cvt.vocabulary_),))

# Hidden block: 8 units -> batch norm -> ReLU capped at 6 -> 30% dropout.
d1 = in_tensor
for layer in (
    Dense(units=8),
    BatchNormalization(),
    ReLU(max_value=6.),
    Dropout(rate=0.3),
):
    d1 = layer(d1)

# Output block: single unit -> batch norm -> sigmoid.
out = d1
for layer in (
    Dense(units=1),
    BatchNormalization(),
    Activation('sigmoid'),
):
    out = layer(out)

model = Model(inputs=in_tensor, outputs=out)

In [96]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 220805)]          0         
_________________________________________________________________
dense_11 (Dense)             (None, 8)                 1766448   
_________________________________________________________________
batch_normalization_11 (Batc (None, 8)                 32        
_________________________________________________________________
re_lu_6 (ReLU)               (None, 8)                 0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 8)                 0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 9         
_________________________________________________________________
batch_normalization_12 (Batc (None, 1)                 4   

In [97]:
# Binary cross-entropy loss, optimised with Adam.
loss = tf.keras.losses.BinaryCrossentropy()
opt = tf.keras.optimizers.Adam()

# The keyword must be spelled `optimizer`: with the previous `optmizer` typo
# the Adam instance never reached the `optimizer` parameter, so `opt` was not
# the optimizer actually used by the model.
model.compile(loss=loss,optimizer=opt,metrics=['accuracy'])

In [98]:
# Number of samples per batch, shared by both flows and by the step counts
# passed to training.
batch_size = 64

# One flow over the training split (X, Y) and one over the test split (x, y).
# NOTE(review): a free function `get_flow(features, labels, batch_size)` is not
# defined in any visible cell (only the `Dataset.get_flow` method is) —
# presumably it came from a removed cell; confirm before Restart & Run All.
train_flow = get_flow(X,Y,batch_size)
test_flow = get_flow(x,y,batch_size)

In [99]:
# Train for 3 epochs, validating on the test flow after each epoch.
# Step counts are cast to int: Keras documents `steps_per_epoch` and
# `validation_steps` as integers, while `np.round` returns a float.
# NOTE(review): `fit_generator` is deprecated in newer TensorFlow releases in
# favour of `Model.fit`, which accepts generators — confirm against the
# installed version before switching.
model.fit_generator(
    generator=train_flow,
    steps_per_epoch=int(np.round(len(X)/batch_size)),
    validation_data=test_flow,
    validation_steps=int(np.round(len(x)/batch_size)),
    epochs=3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f96501c9908>