## 1. Import Libraries/ Dataset

In [None]:
#from google.colab import drive

# mount google drive
#drive.mount('./drive')

In [None]:
import pandas as pd

# set filepath
filepath='./drive/MyDrive/Datasets/IMDB Dataset.csv'

# load csv file in dataframe
df = pd.read_csv(filepath)
df.sentiment.value_counts()
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## 2. Data Visualization

In [37]:
x_data = df['review']       # Reviews/Input
y_data = df['sentiment']    # Sentiment/Output

print('X shape: ', x_data.shape)
print('Y shape: ', y_data.shape)

X shape:  (50000,)
Y shape:  (50000,)


## 3. Data Pre-processing

In [38]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# pre-process review
english_stops = set(stopwords.words('english')) # declare stop words
x_data = x_data.replace({'<.*?>': ''}, regex = True) # remove html tag
x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True) # remove non alphabet
x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops]) # remove stop words
x_data = x_data.apply(lambda review: [w.lower() for w in review]) # lower case

# encode sentiment
y_data = y_data.replace('positive', 1)
y_data = y_data.replace('negative', 0)

# print dataset
print('\n', 'Reviews')
print(x_data)
print('\n', 'Sentiment')
print(y_data)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!





 Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object

 Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [39]:
from sklearn.model_selection import train_test_split

# split dataset
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.3)

# print training set
print('Train Set')
print(x_train, '\n')
print(y_train, '\n')

# print test set
print('Test Set')
print(x_test, '\n')
print(y_test)

Train Set
2061     [this, film, neither, funny, whole, even, wort...
45047    [the, original, demille, movie, made, frederic...
4314     [while, soundtrack, bit, dated, story, relevan...
1820     [this, story, dedicated, women, according, int...
48816    [the, author, nekromantik, j, rg, buttgereit, ...
                               ...                        
9723     [house, games, wonderful, movie, multiple, lev...
24619    [big, splashy, film, broadway, music, nathan, ...
13320    [i, american, meatballs, still, really, hits, ...
49499    [i, got, hold, film, dvd, title, evil, never, ...
45858    [completely, ridiculous, period, film, thin, e...
Name: review, Length: 35000, dtype: object 

2061     0
45047    1
4314     1
1820     0
48816    1
        ..
9723     1
24619    1
13320    1
49499    0
45858    0
Name: sentiment, Length: 35000, dtype: int64 

Test Set
24651    [the, accounts, seem, real, human, factor, add...
46443    [i, actually, like, asylum, movies, i, made, h...
2

In [40]:
import numpy as np # linear algebra
from tensorflow.keras.preprocessing.text import Tokenizer # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences # to do padding or truncating

# function for getting maximum review length
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

# encode review
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[    8     4  1017 ...     0     0     0]
 [    2   127  7163 ...     0     0     0]
 [  381   665   131 ...  3619    49 13855]
 ...
 [    1   180 14521 ...     0     0     0]
 [    1   101   947 ...     0     0     0]
 [  238   556   721 ...     0     0     0]] 

Encoded X Test
 [[    2  5462   217 ...     0     0     0]
 [    1    75     6 ...     0     0     0]
 [    8    70     3 ...     0     0     0]
 ...
 [    2   198     3 ...    20   185   321]
 [    2  3796   252 ...   353   432  2560]
 [  762  9642 16649 ...  2758   935   276]] 

Maximum review length:  130


## 4. Model Building

In [45]:
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras

# impot pre-trained embedding layer
hub_layer = hub.KerasLayer("https://tfhub.dev/google/nnlm-en-dim128/2",
                           input_shape=[], dtype=tf.string)

# build model
model = keras.Sequential()
model.add(hub_layer)
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.summary()









Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_6 (KerasLayer)   (None, 128)               124642688 
_________________________________________________________________
dense_8 (Dense)              (None, 16)                2064      
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 17        
Total params: 124,644,769
Trainable params: 2,081
Non-trainable params: 124,642,688
_________________________________________________________________


## 5. Model Compilation

In [42]:
# compile model

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

## 6. Model Training

In [43]:
from tensorflow.keras.callbacks import ModelCheckpoint   # save model

# checkpoint callback
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

# train model
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5




ValueError: ignored

In [None]:
import matplotlib.pyplot as plt

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

# Accuracy history graph for training and validation
plt.plot(epochs, acc, 'r', label='Training accuracy')
plt.plot(epochs, val_acc, 'b', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend(loc=0)
plt.figure()

# Loss history graph for training and validation
plt.plot(epochs, loss, 'r', label='Training Loss')
plt.plot(epochs, val_loss, 'b', label='Validation Loss')
plt.title('Training and validation loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(loc=0)
plt.figure()

# show plot
plt.show()

## 7. Model Evaluation