# IMDB Review Sentiment Analysis

In [29]:
import tensorflow as tf

In [30]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [31]:
# uncomment if using colab
# !unzip data.zip

In [32]:
# uncomment if using colab
# !wget https://nlp.stanford.edu/data/glove.6B.zip

In [33]:
# uncomment if using colab
# !unzip glove.6B.zip -d glove.6B
# !cp glove.6B/glove.6B.100d.txt ./glove.6B.100d.txt

## Dataset and Exploration

In [34]:
import pandas as pd

In [35]:
df = pd.read_csv('./data/IMDB_Dataset.csv')

In [36]:
df.shape

(50000, 2)

In [37]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [38]:
df.loc[df['sentiment'] == 'positive', 'sentiment'] = 1
df.loc[df['sentiment'] == 'negative', 'sentiment'] = 0

In [39]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


## Preprocessing

In [40]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [41]:
def clean_strings(df):
    def remove_tags(text):
        tag_pattern = re.compile(r'<[^>]+>')
        return tag_pattern.sub('', text)
    
    def remove_stop_words(text):
        stop_pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
        return stop_pattern.sub('', text)

    corpus = []
    text_data = ''
    for i in tqdm(range(len(df.index)), ascii=False, ncols=100):
        
        # remove html tags (since this is webscraped)
        text_data = remove_tags(df['review'][i])

        # remove any non-text characters
        text_data = re.sub('[^a-zA-Z]', ' ', text_data)

        # lowercase
        text_data = text_data.lower()

        # remove stopwords
        text_data = remove_stop_words(text_data)
        
        # lower case and make array
        text_data = text_data.split()

        # lemmatize each word
        wl = WordNetLemmatizer()
        text_data = [wl.lemmatize(word) for word in text_data]

        # append to corpus
        corpus.append(text_data)

    return corpus

In [42]:
cleaned_sentences = clean_strings(df)

100%|████████████████████████████████████████████████████████| 50000/50000 [00:52<00:00, 950.67it/s]


In [43]:
print(df['review'][0])
print(cleaned_sentences[0])

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

## Transfer Learning w/ GLoVe

In [44]:
from gensim.test.utils import get_tmpfile, datapath
from gensim.models import KeyedVectors, Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
import os
import numpy as np

In [45]:
# download 6 billion glove dataset from https://nlp.stanford.edu/data/glove.6B.zip

In [46]:
# get glove file and make temp glove word2vec file
glove_file = datapath(os.path.abspath('./glove.6B.100d.txt'))
glove2vec_file = get_tmpfile(os.path.abspath('./glove_word2vec_6B.100d.txt'))

# convert glove embeddings to word2vec format
_ = glove2word2vec(glove_file, glove2vec_file)
glove_vectors = KeyedVectors.load_word2vec_format(glove2vec_file)

  _ = glove2word2vec(glove_file, glove2vec_file)


In [47]:
# build initial embedding on our dataset
base_model = Word2Vec(vector_size=100, min_count=5)
base_model.build_vocab(cleaned_sentences)

# train on our dataset
base_model.train(cleaned_sentences, total_examples=base_model.corpus_count, epochs=base_model.epochs)
base_model_wv = base_model.wv

del base_model

In [48]:
# build initial embedding on our dataset
glove_model = Word2Vec(vector_size=100, min_count=5)
glove_model.build_vocab(cleaned_sentences)
total_examples = glove_model.corpus_count

# add GLoVe vocab and weights
glove_model.build_vocab(glove_vectors.index_to_key, update=True)

# # train on dataset
glove_model.train(cleaned_sentences, total_examples=total_examples, epochs=glove_model.epochs)
glove_model_wv = glove_model.wv

del glove_model



In [49]:
base_model_wv.most_similar('romance')

[('romantic', 0.8110664486885071),
 ('sappy', 0.6130224466323853),
 ('intrigue', 0.6038162708282471),
 ('drama', 0.6029313206672668),
 ('triangle', 0.5883470177650452),
 ('sentimental', 0.567913293838501),
 ('melodrama', 0.5677792429924011),
 ('friendship', 0.5551624298095703),
 ('fairytale', 0.5538564920425415),
 ('bittersweet', 0.5528250336647034)]

In [50]:
glove_model_wv.most_similar('romance')

[('romantic', 0.8211200833320618),
 ('triangle', 0.6186382174491882),
 ('bittersweet', 0.607478141784668),
 ('intrigue', 0.5946118831634521),
 ('drama', 0.5940388441085815),
 ('sentimental', 0.5851631760597229),
 ('adventure', 0.5840432643890381),
 ('sappy', 0.5820758938789368),
 ('screwball', 0.5686012506484985),
 ('melodrama', 0.5614430904388428)]

## Preparing Data

In [163]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

In [164]:
def prep_data(embedding_model_wv, random_state=69, maxlen=200):

    # cleaned strings
    X_raw = cleaned_sentences
    # 0 for negative, 1 for positive
    y_raw = df['sentiment']


    # split randomly
    X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2, random_state=19)


    # tokenize using embedding model
    word_tokenizer = Tokenizer()
    word_tokenizer.word_index = embedding_model_wv.key_to_index

    X_train = word_tokenizer.texts_to_sequences(X_train)
    X_test = word_tokenizer.texts_to_sequences(X_test)

    # pad all sequences and convert to numpy arrays
    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen, value=len(embedding_model_wv.key_to_index))
    X_test = pad_sequences(X_test, padding='post', maxlen=maxlen, value=len(embedding_model_wv.key_to_index))

    X_train, y_train = np.array(X_train, dtype=np.int64), np.array(y_train, dtype=np.int64)
    X_test, y_test = np.array(X_test, dtype=np.int64), np.array(y_test, dtype=np.int64)

    
    # create embedding matrix with extra row for padding
    glove_embedding = embedding_model_wv.vectors
    pad_embedding = np.zeros((1, glove_embedding.shape[1]))

    embedding_matrix = np.vstack((glove_embedding, pad_embedding))


    return X_train, X_test, y_train, y_test, embedding_matrix

In [165]:
base_X_train, base_X_test, base_y_train, base_y_test, base_embedding = prep_data(base_model_wv)
glove_X_train, glove_X_test, glove_y_train, glove_y_test, glove_embedding = prep_data(glove_model_wv)

## Build and Train Models

In [166]:
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM

In [167]:
def train_and_test_model(model, X_train, X_test, y_train, y_test, epochs=5):

    print('Training Model...')
    model_hist = model.fit(X_train, y_train, batch_size=64, epochs=epochs, verbose=1, validation_split=0.2)

    print('Testing Model...')
    score = model.evaluate(X_test, y_test, verbose=1)

    return model_hist, score

### CNN

In [168]:
def build_cnn(embedding, X_train):
    cnn = Sequential(
        layers=(
            # use our GLoVe-based transfer-learned embedding
            Embedding(*embedding.shape, weights=[embedding], input_length=X_train.shape[1], trainable=False),
            # convolution to 128-dim space
            Conv1D(128, 5, activation='relu'),
            GlobalMaxPooling1D(),
            Dense(1, activation='sigmoid')
        )
    )

    cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    print(cnn.summary())

    return cnn

In [169]:
base_cnn = build_cnn(base_embedding, base_X_train)

base_cnn_hist, base_cnn_score = train_and_test_model(base_cnn, base_X_train, base_X_test, base_y_train, base_y_test)

Model: "sequential_28"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_28 (Embedding)    (None, 200, 100)          3474600   
                                                                 
 conv1d_7 (Conv1D)           (None, 196, 128)          64128     
                                                                 
 global_max_pooling1d_7 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_28 (Dense)            (None, 1)                 129       
                                                                 
Total params: 3,538,857
Trainable params: 64,257
Non-trainable params: 3,474,600
_________________________________________________________________
None
Training Model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Testing Model...


In [170]:
glove_cnn = build_cnn(glove_embedding, glove_X_train)

glove_cnn_hist, glove_cnn_score = train_and_test_model(glove_cnn, glove_X_train, glove_X_test, glove_y_train, glove_y_test)

Model: "sequential_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_29 (Embedding)    (None, 200, 100)          3491400   
                                                                 
 conv1d_8 (Conv1D)           (None, 196, 128)          64128     
                                                                 
 global_max_pooling1d_8 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_29 (Dense)            (None, 1)                 129       
                                                                 
Total params: 3,555,657
Trainable params: 64,257
Non-trainable params: 3,491,400
_________________________________________________________________
None
Training Model...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Testing Model...


### LSTM

In [171]:
def build_lstm(embedding, X_train):
    lstm = Sequential(
        layers=(
            # use our GLoVe-based transfer-learned embedding
            Embedding(*embedding.shape, weights=[embedding], input_length=X_train.shape[1], trainable=False),
            # lstm outputs to 128-dim space
            LSTM(128),
            Dense(1, activation='sigmoid')
        )
    )

    lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    print(lstm.summary())

    return lstm

In [172]:
LSTM_EPOCS=15

In [173]:
base_lstm = build_lstm(base_embedding, base_X_train)

base_lstm_hist, base_lstm_score = train_and_test_model(base_lstm, base_X_train, base_X_test, base_y_train, base_y_test, epochs=LSTM_EPOCS)

Model: "sequential_30"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_30 (Embedding)    (None, 200, 100)          3474600   
                                                                 
 lstm_21 (LSTM)              (None, 128)               117248    
                                                                 
 dense_30 (Dense)            (None, 1)                 129       
                                                                 
Total params: 3,591,977
Trainable params: 117,377
Non-trainable params: 3,474,600
_________________________________________________________________
None
Training Model...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Testing Model...


In [174]:
glove_lstm = build_lstm(glove_embedding, glove_X_train)

glove_lstm_hist, glove_lstm_score = train_and_test_model(glove_lstm, glove_X_train, glove_X_test, glove_y_train, glove_y_test, epochs=LSTM_EPOCS)

Model: "sequential_31"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_31 (Embedding)    (None, 200, 100)          3491400   
                                                                 
 lstm_22 (LSTM)              (None, 128)               117248    
                                                                 
 dense_31 (Dense)            (None, 1)                 129       
                                                                 
Total params: 3,608,777
Trainable params: 117,377
Non-trainable params: 3,491,400
_________________________________________________________________
None
Training Model...
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Testing Model...


## Save to WandB

In [175]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [176]:
import wandb

wandb.login()



True

In [177]:
run = wandb.init(
  project='IMDB Sentiment Analysis',
)

wandb.config = {
  'batch_size': 64
}

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Base CNN Train Acc,▁▄▆▇█
Base CNN Train Loss,█▅▃▂▁
Base CNN Val Acc,▁█▆▅█
Base CNN Val Loss,▃▁▂▆█
Base LSTM Train Acc,▁▂▂▂▆▆▆▇▇▇▇▇███
Base LSTM Train Loss,██▇█▄▃▃▃▃▂▂▂▂▁▁
Base LSTM Val Acc,▅▁▅▇███████████
Base LSTM Val Loss,▆█▆▄▂▁▁▁▁▁▁▁▁▂▂
GLoVe CNN Train Acc,▁▄▆▇█
GLoVe CNN Train Loss,█▅▃▂▁

0,1
Base CNN Train Acc,0.97853
Base CNN Train Loss,0.08612
Base CNN Val Acc,0.86937
Base CNN Val Loss,0.35009
Base LSTM Train Acc,0.95172
Base LSTM Train Loss,0.13524
Base LSTM Val Acc,0.869
Base LSTM Val Loss,0.35258
GLoVe CNN Train Acc,0.97528
GLoVe CNN Train Loss,0.09679


In [178]:
for i in range (20):
    log = dict()
    for title, hist in zip(
        ['Base CNN', 'Base LSTM', 'GLoVe CNN', 'GLoVe LSTM'],
        [base_cnn_hist, base_lstm_hist, glove_cnn_hist, glove_lstm_hist]
    ):
        for data_type in ['Train', 'Val']:
            for label in ['Loss', 'Acc']:
                try:
                    pref = 'val_' if data_type == 'Val' else ''
                    log[f'{title} {data_type} {label}'] = hist.history[f'{pref}{label.lower()}'][i]
                except:
                    continue
    wandb.log(log)