In [1]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import Model
from tensorflow.keras.layers import *

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,KFold

from gensim.models import Word2Vec
from tqdm.cli import tqdm
from concurrent.futures import ThreadPoolExecutor

%matplotlib inline

In [2]:
# Load the IMDB reviews from the CSV that sits next to this notebook and
# preview the first rows (columns: `review` text and `sentiment` label).
df = pd.read_csv("./IMDB Dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Encode the label as 1 (positive) / 0 (anything else).
# Matching both the raw string and the already-encoded value 1 keeps this cell
# idempotent: the plain `x == 'positive'` test turned every label into 0 when
# the cell was executed a second time on the same kernel.
df['sentiment'] = df['sentiment'].isin(['positive', 1]).astype('int64').values

In [4]:
# Punctuation that is deleted outright (e.g. "there's" -> "theres").
SYMBOL_FILTER = re.compile(r"""[!,."':?()]""")
# Doubled HTML line breaks, hyphens and slashes are deleted as well.
# Raw strings are used so that `\s`, `\-` and `\/` reach the regex engine
# verbatim instead of being parsed as (invalid) Python string escapes, which
# emits a DeprecationWarning / SyntaxWarning on current interpreters.
SPECIAL_CHAR_FILTER = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")


def clean_review(text):
    """Strip punctuation and markup from one review and lower-case it.

    The symbol filter runs first, then the special-character filter, then
    ``str.lower`` -- the same order as the original one-line lambda.
    """
    without_symbols = SYMBOL_FILTER.sub("", text)
    without_markup = SPECIAL_CHAR_FILTER.sub("", without_symbols)
    return without_markup.lower()


df['review'] = df.review.apply(clean_review)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production the filming tech...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [5]:
# Tokenise every cleaned review on single spaces.  Consecutive spaces therefore
# produce empty-string tokens, which are kept as-is.
sentences = [review.split(" ") for review in df["review"]]

In [6]:
# Collect the distinct tokens of the corpus.  Feeding each sentence straight
# into a set avoids first materialising one list that holds every single token
# of every review before de-duplicating it.
unique_tokens = set()
for sent in sentences:
    unique_tokens.update(sent)

vocab = list(unique_tokens)

In [7]:
# Train Word2Vec on the tokenised reviews with 92-dimensional vectors and keep
# only the `.wv` word -> vector lookup.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 calls it
# `vector_size`) -- confirm which gensim version this notebook is pinned to.
w2v = Word2Vec(sentences,size=92).wv

In [189]:
# All-zero stand-in vector handed out for tokens without an embedding.
dummy = np.zeros(92)
# Token count of the longest review; embedded reviews are padded to this length.
max_len = max(map(len, sentences))

def get_vector(w):
    """Return the embedding of a single token, or the zero vector if unknown.

    Parameters
    ----------
    w : length-1 sequence
        ``np.apply_along_axis`` hands over 1-D slices, so the token arrives
        wrapped in a one-element array and is read as ``w[0]``.

    Returns
    -------
    numpy.ndarray
        ``w2v[w[0]]`` when the token has an embedding; the shared ``dummy``
        zero vector for empty tokens and for tokens missing from ``w2v``.
    """
    if not w:
        # Empty token (e.g. produced by consecutive spaces).
        return dummy
    try:
        return w2v[w[0]]
    except KeyError:
        # Only "token not in the vocabulary" is an expected failure.  The
        # previous bare ``except:`` also swallowed KeyboardInterrupt and
        # genuine bugs, silently turning them into zero vectors.
        return dummy
    
def embed_sent(sent):
    """Embed one tokenised review and zero-pad it to ``max_len`` rows.

    Parameters
    ----------
    sent : length-1 sequence
        A single-element row (as passed by ``np.apply_along_axis``) whose only
        item is an ``(n_tokens, 1)`` array of words.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, vector_width)``: one embedding per token
        followed by all-zero padding rows.
    """
    tokens, = sent  # unwrap the single (n_tokens, 1) array
    # The per-token lookup defined in this notebook is `get_vector`; the
    # previously referenced name `vector` is not defined anywhere and raised
    # NameError on a fresh kernel.
    embedded = np.apply_along_axis(get_vector, 1, tokens)
    # Pad with as many zero rows as are missing, matching the embedding width.
    padding = np.zeros((max_len - len(tokens), embedded.shape[1]))
    return np.concatenate((embedded, padding))

In [190]:
# Wrap the first 32 tokenised reviews as an (N, 1) object column whose cells
# each hold one (n_tokens, 1) word array.
# The column is filled cell by cell: calling `np.array` on a list of arrays with
# different lengths raises ValueError on NumPy >= 1.24, and would silently
# build a dense array instead if every review happened to have the same length.
sample = sentences[:32]
padded = np.empty((len(sample), 1), dtype=object)
for row, tokens in enumerate(sample):
    padded[row, 0] = np.array(tokens).reshape(-1, 1)

In [201]:
%%time
# Embed the sample rows once to time `embed_sent`; the result is bound to `_`
# and not used afterwards.
_ = np.apply_along_axis(embed_sent,1,padded)

CPU times: user 63.7 ms, sys: 28.8 ms, total: 92.6 ms
Wall time: 90.1 ms


In [220]:
class Dataset:
    """Endless batch generator that embeds tokenised reviews on the fly.

    Every review is stored as an ``(n_tokens, 1)`` word array inside an
    ``(N, 1)`` object column, so that ``np.apply_along_axis(embed_sent, 1, rows)``
    turns a batch of rows into one ``(batch, max_len, vector_width)`` array.

    Parameters
    ----------
    x : iterable of token sequences
        One sequence of words per review.
    y : array-like
        Labels aligned with ``x``.
    max_len, vector_size : int
        Stored on the instance.  Embedding and padding are done by the
        notebook-level ``embed_sent``, which does not read these attributes.
    shuffle : bool
        Draw a fresh random permutation at the start of every pass.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` have different lengths.
    """

    def __init__(self,x,y,max_len=max_len,vector_size=92,shuffle=True):
        sents = list(x)
        # Fill the object column cell by cell.  `np.array` on a ragged list of
        # arrays raises on NumPy >= 1.24 and would build a dense array if all
        # reviews had the same length.
        self.x = np.empty((len(sents), 1), dtype=object)
        for row, sent in enumerate(sents):
            self.x[row, 0] = np.array(sent).reshape(-1, 1)

        # Plain ndarray so batches are always selected by integer position
        # (a pandas Series would be indexed by label instead).
        self.y = np.asarray(y)
        if len(self.y) != len(self.x):
            raise ValueError(
                "x and y must have the same length, got %d and %d"
                % (len(self.x), len(self.y))
            )

        self.shuffle = shuffle
        self.max_len = max_len
        self.vector_size = vector_size

    def __len__(self,):
        """Number of reviews held by the dataset."""
        return len(self.x)

    def _flow(self,):
        """Yield ``(embedded_batch, labels)`` tuples forever.

        Uses ``self.batch_size``, which is set by :meth:`get_flow`.
        """
        n_samples = len(self)
        while True:
            idx = np.random.permutation(n_samples) if self.shuffle else np.arange(n_samples)
            # Striding by batch_size yields every full batch followed by the
            # shorter remainder, if there is one.  The former
            # `idx[:-br]` / `idx[-br:]` split broke when the dataset size was
            # an exact multiple of the batch size (br == 0): `idx[:-0]` is
            # empty and `idx[-0:]` is the whole dataset, so each pass emitted
            # a single batch containing every sample.
            for start in range(0, n_samples, self.batch_size):
                batch = idx[start:start + self.batch_size]
                yield np.apply_along_axis(embed_sent,1,self.x[batch]),self.y[batch]

    def get_flow(self,batch_size):
        """Return ``(generator, steps_per_epoch)`` for the given batch size.

        Raises
        ------
        ValueError
            If ``batch_size`` is not positive or the dataset is empty.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if len(self) == 0:
            raise ValueError("cannot create batches from an empty dataset")

        self.batch_size = batch_size
        # One pass produces ceil(N / batch_size) batches (the last one may be
        # short); rounding to nearest under-counted it for small remainders.
        return self._flow(),np.ceil(len(self)/batch_size).astype(int)

In [255]:
# Hold out part of the data for validation.
# Upper case (X, Y) = training split, lower case (x, y) = held-out split.
# dtype=object keeps every review as one ragged token list; without it NumPy
# >= 1.24 refuses to build an array from lists of different lengths.
X,x,Y,y = train_test_split(np.array(sentences, dtype=object),df.sentiment.values)

train_ds = Dataset(X,Y)
train_flow,train_spe = train_ds.get_flow(32)

# Validate on the held-out split.  This used to be Dataset(X,Y), i.e. the
# training data again, which made the validation metrics meaningless.
test_ds = Dataset(x,y)
test_flow,test_spe = test_ds.get_flow(32)

In [256]:
# One review = `max_len` time steps x embedding width.  The shape is derived
# from the data and the Word2Vec vectors instead of the literals 2450 / 92, so
# the model keeps matching what `embed_sent` produces when the corpus changes.
in_tensor = Input(shape=(max_len, w2v.vector_size))

# Per-token projection -> batch norm -> ReLU capped at 6, then an average over
# the time axis gives a single 64-d vector per review.
# NOTE(review): no masking layer is used, so the zero-padded rows take part in
# the average as well.
d1 = TimeDistributed(Dense(64,))(in_tensor)
d1 = BatchNormalization()(d1)
d1 = ReLU(6.)(d1)
d1 = GlobalAveragePooling1D()(d1)
d1 = Dropout(0.3)(d1)

# Single-unit head; the sigmoid output is the score for label 1 (positive).
out = Dense(1,)(d1)
out = BatchNormalization()(out)
out = Activation('sigmoid')(out)

model = Model(in_tensor,out)

In [257]:
# Print the layer-by-layer output shapes and parameter counts of `model`.
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 2450, 92)]        0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 2450, 64)          5952      
_________________________________________________________________
batch_normalization_4 (Batch (None, 2450, 64)          256       
_________________________________________________________________
re_lu_2 (ReLU)               (None, 2450, 64)          0         
_________________________________________________________________
global_average_pooling1d_2 ( (None, 64)                0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65  

In [258]:
# Binary cross-entropy on the sigmoid output, optimised with Adam.
loss = tf.keras.losses.BinaryCrossentropy()
opt = tf.keras.optimizers.Adam()

# The keyword is `optimizer`.  It was misspelled `optmizer`, so the Adam
# instance above was never used as the optimizer (depending on the Keras
# version the unknown keyword is either rejected or ignored in favour of the
# default optimizer).
model.compile(loss=loss,optimizer=opt,metrics=['accuracy'])

In [None]:
# Train from the endless batch generators.  `Model.fit` consumes Python
# generators directly; `fit_generator` is deprecated and no longer available
# in current Keras releases.  Because the generators never stop, the number of
# batches per epoch / validation run is given explicitly.
model.fit(
    train_flow,
    steps_per_epoch=train_spe,
    validation_data=test_flow,
    validation_steps=test_spe,
    epochs=3
)

Epoch 1/3