In [None]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import *

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,KFold

from gensim.models import Word2Vec
from tqdm.cli import tqdm
from concurrent.futures import ThreadPoolExecutor

%matplotlib inline

In [None]:
# Read the IMDB reviews CSV from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer="./IMDB Dataset.csv")
df.head(n=5)

In [None]:
# Encode the label as an integer: 1 for 'positive', 0 for anything else.
# The guard makes the cell idempotent: once the column is numeric, running the
# comparison again would match nothing and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = (df['sentiment'] == 'positive').astype('int64')

In [None]:
# Punctuation characters that are deleted from every review.
SYMBOL_FILTER = re.compile("[!,.\"\':?()]")
# Doubled HTML line breaks, hyphens and slashes. Written as a raw string so the
# backslash sequences (\s, \-, \/) are handed to the regex engine verbatim
# instead of being invalid escape sequences in a normal string literal.
SPECIAL_CHAR_FILTER = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

# Strip punctuation first, then the special sequences, then lower-case.
# Matches are deleted outright (replaced by the empty string, not by a space).
df['review'] = df.review.apply(lambda x:SPECIAL_CHAR_FILTER.sub("",SYMBOL_FILTER.sub("",x)).lower())
df.head()

In [None]:
# Tokenise every cleaned review by splitting on single space characters.
sentences = [review.split(" ") for review in df["review"]]

In [None]:
# Distinct tokens that occur anywhere in the tokenised corpus.
vocab = list({token for sent in sentences for token in sent})

In [None]:
# Train 48-dimensional Word2Vec embeddings on the tokenised reviews and keep
# only the keyed vectors (`.wv`) for lookups.
# NOTE(review): `size` is the pre-4.0 gensim name of this parameter (gensim 4
# calls it `vector_size`) — confirm against the installed gensim version.
w2v = Word2Vec(sentences=sentences,size=48).wv

In [None]:
# Sanity check of the embeddings: nearest neighbours of the token "hour".
w2v.most_similar(positive=["hour"])

In [None]:
# 48-dimensional zero vector returned for tokens that have no embedding.
dummy = np.zeros(48)
# Token count of the longest review; embed_sent pads every review to this length.
max_len = max(len(tokens) for tokens in sentences)

def get_vector(w):
    """Return the embedding of the token stored in ``w``.

    Parameters
    ----------
    w : one-element sequence
        Holds the token as its first item; this is the form in which
        ``np.apply_along_axis`` hands over each row of a token column.

    Returns
    -------
    ``w2v[w[0]]`` when the lookup succeeds, otherwise the module-level
    ``dummy`` vector (also returned when ``w`` is falsy).
    """
    if not w:
        return dummy
    try:
        return w2v[w[0]]
    except KeyError:
        # Token unknown to the Word2Vec vocabulary. Only the missing-key case
        # is handled so that real errors and Ctrl-C are no longer swallowed.
        return dummy
    
def embed_sent(sent):
    """Embed one sentence and zero-pad it to ``max_len`` rows.

    ``sent`` is a one-element container whose single item is the sentence's
    token column; each row of that column is mapped through ``get_vector``.
    Returns an array with ``max_len`` rows and 48 columns: the token vectors
    followed by rows of zeros.
    """
    (tokens,) = sent
    vectors = np.apply_along_axis(get_vector,1,tokens)
    padding = np.zeros((max_len-len(tokens),48))
    return np.concatenate((vectors,padding))

In [None]:
class Dataset:
    """Endless batch generator over tokenised sentences and their labels.

    Sentences are stored as token columns and embedded lazily, one batch at a
    time, through the module-level ``embed_sent``.

    Parameters
    ----------
    x : sequence of token sequences
        One entry per sample.
    y : array-like
        One label per sample, aligned with ``x``.
    max_len, vector_size : int
        Stored on the instance only; the padding itself is done by
        ``embed_sent`` using the module-level ``max_len``.
    shuffle : bool
        Draw a new random sample order on every pass when true.
    """

    def __init__(self,x,y,max_len=max_len,vector_size=48,shuffle=True):
        # Fill an object array cell by cell. Letting np.array() infer the
        # layout from sentences of differing length raises on recent NumPy,
        # and collapses into one dense array when all lengths happen to match.
        self.x = np.empty((len(x),1),dtype=object)
        for row,tokens in enumerate(x):
            self.x[row,0] = np.array(tokens).reshape(-1,1)
        # Positional fancy indexing is used on the labels in _flow.
        self.y = np.asarray(y)

        self.shuffle = shuffle
        self.max_len = max_len
        self.vector_size = vector_size

    def __len__(self,):
        """Number of samples."""
        return len(self.x)

    def _flow(self,):
        """Yield ``(embedded_batch, labels)`` tuples forever.

        Every pass covers each sample exactly once: consecutive slices of
        ``self.batch_size`` indices, the last slice being shorter when the
        sample count is not a multiple of the batch size.
        """
        n_samples = len(self)
        if n_samples == 0:
            # Nothing to yield; stop instead of spinning in the loop below.
            return
        while True:
            idx = np.random.permutation(n_samples) if self.shuffle else np.arange(0,n_samples)
            # Plain slicing handles the exact-multiple case correctly, where a
            # remainder of 0 would turn idx[:-0] into an empty selection and
            # idx[-0:] into the entire dataset.
            for start in range(0,n_samples,self.batch_size):
                batch = idx[start:start+self.batch_size]
                yield np.apply_along_axis(embed_sent,1,self.x[batch]),self.y[batch]

    def get_flow(self,batch_size):
        """Return ``(generator, steps_per_pass)`` for the given batch size.

        ``steps_per_pass`` is the number of batches ``_flow`` yields per pass
        over the data, i.e. the ceiling of ``len(self) / batch_size``.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        self.batch_size = batch_size
        return self._flow(),np.ceil(len(self)/batch_size).astype(int)

In [None]:
# Hold the token lists in an explicitly built 1-D object array: calling
# np.array() on lists of differing length raises on recent NumPy versions.
sentence_arr = np.empty(len(sentences),dtype=object)
for pos,tokens in enumerate(sentences):
    sentence_arr[pos] = tokens

# X/Y are the training sentences/labels, x/y the held-out ones.
# A fixed random_state makes the split reproducible across kernel restarts.
X,x,Y,y = train_test_split(sentence_arr,df.sentiment.values,random_state=42)

train_ds = Dataset(X,Y)
test_ds = Dataset(x,y)

In [None]:
# One padded review per sample: max_len time steps of 48-d word vectors.
# The length comes from max_len (the value embed_sent pads to) rather than a
# literal, so the input shape cannot drift away from the generated batches.
in_tensor = Input(shape=(max_len,48))

# Three identical stages with growing width: halve the sequence with max
# pooling, apply a per-time-step Dense layer, then BatchNorm, LeakyReLU and
# Dropout.
d1 = in_tensor
for units in (48,96,192):
    d1 = MaxPool1D()(d1)
    d1 = TimeDistributed(Dense(units,))(d1)
    d1 = BatchNormalization()(d1)
    d1 = LeakyReLU(0.3)(d1)
    d1 = Dropout(0.3)(d1)

# Average over the remaining time steps to get one feature vector per review.
d1 = GlobalAveragePooling1D()(d1)

# Single sigmoid unit for the binary sentiment label.
out = Dense(1,)(d1)
out = BatchNormalization()(out)
out = Activation('sigmoid')(out)

model = Model(in_tensor,out)

In [None]:
# Print the layer-by-layer overview of the model built in the previous cell.
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output; Adam with defaults.
loss = tf.keras.losses.BinaryCrossentropy()
opt = tf.keras.optimizers.Adam()

# The keyword must be spelled `optimizer`; a misspelled keyword never reaches
# the optimizer argument, so the Adam instance above would not be used.
model.compile(loss=loss,optimizer=opt,metrics=['accuracy'])

In [None]:
# Batch generators plus their steps-per-epoch counts, 8 reviews per batch.
train_flow,train_spe = train_ds.get_flow(batch_size=8)
test_flow,test_spe = test_ds.get_flow(batch_size=8)

In [None]:
# Train for three epochs from the generators. Model.fit accepts Python
# generators directly; fit_generator is deprecated and absent from newer Keras.
# Both generators are endless, so the step counts define epoch/validation length.
model.fit(
    train_flow,
    steps_per_epoch=train_spe,
    validation_data=test_flow,
    validation_steps=test_spe,
    epochs=3
)