In [8]:
import numpy as np
import pandas as pd
import regex as re

from string import digits, punctuation
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.preprocessing import LabelBinarizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Embedding, Dense, Flatten, Conv1D, Dropout, MaxPooling1D, Activation, GlobalMaxPooling1D, Input, Conv2D, Reshape, MaxPooling2D, concatenate
from tensorflow.keras.optimizers import Adam, SGD

from keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import to_categorical

### Utils

In [2]:
# Split training data by sentiment
def split_sentiment(df):

    df_pos = df[df["sentiment"] == 1]
    df_neg = df[df["sentiment"] == 0]

    return df_pos, df_neg


# Split training and testing data
def split_train_test(df_pos, df_neg, random_state=None):

    df_train_pos, df_test_pos = train_test_split(df_pos, test_size=0.2, random_state=random_state)
    df_train_neg, df_test_neg = train_test_split(df_neg, test_size=0.2, random_state=random_state)

    df_train = pd.concat([df_train_pos, df_train_neg])
    df_test = pd.concat([df_test_pos, df_test_neg])

    return df_train, df_test

### Data Preprocessing

In [3]:
stopwords_set = set(stopwords.words("english"))

def preprocess(text):
    text = text.lower()
    
    url = re.compile(r"https?://\S+|www\.\S+")
    text = url.sub(r"", text)
    
    rem_digits = str.maketrans(digits, " "*len(digits))
    text = text.translate(rem_digits)
    
    rem_punctuation = str.maketrans(punctuation, " "*len(punctuation))
    text = text.translate(rem_punctuation)
    text = re.sub(r'[^\w\s]',' ', text)
    
    tokens = word_tokenize(text)
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word, pos="v") for word in tokens]
    
    tokens = [word for word in tokens if not word in stopwords_set]
    
    return tokens

In [4]:
df = pd.read_csv("IMDB Dataset.csv")
df["token"] = df["review"].apply(lambda x: preprocess(x))
df["sentiment"] = df["sentiment"].apply(lambda x: 1 if x == "positive" else 0)
df["clean"] = df["token"].apply(lambda x: " ".join(x))
df.head(5)

Unnamed: 0,review,sentiment,token,clean
0,One of the other reviewers has mentioned that ...,1,"[one, reviewers, mention, watch, oz, episode, ...",one reviewers mention watch oz episode hook ri...
1,A wonderful little production. <br /><br />The...,1,"[wonderful, little, production, br, br, film, ...",wonderful little production br br film techniq...
2,I thought this was a wonderful way to spend ti...,1,"[think, wonderful, way, spend, time, hot, summ...",think wonderful way spend time hot summer week...
3,Basically there's a family where a little boy ...,0,"[basically, family, little, boy, jake, think, ...",basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"[petter, mattei, love, time, money, visually, ...",petter mattei love time money visually stun fi...


### Naive Bayes

In [6]:
# Create x and y for training and testing
def create_xy(df_train, df_test):
    cv = CountVectorizer().fit(df_train["clean"])

    x_train = cv.transform(df_train["clean"]).toarray()
    x_test = cv.transform(df_test["clean"]).toarray()

    y_train = np.array(df_train["sentiment"])
    y_test = np.array(df_test["sentiment"])

    return x_train, x_test, y_train, y_test


# Feature counts and log probabilities
def feature_log_probs(x_train, y_train):
    y = LabelBinarizer().fit_transform(y_train)

    if y.shape[1] == 1:
        y = np.concatenate((1 - y, y), axis=1)

    fc = np.matmul(y.T, x_train)

    smoothed_fc = fc + 1
    smoothed_cc = smoothed_fc.sum(axis=1)
    log_probs = np.log(smoothed_fc) - np.log(smoothed_cc.reshape(-1, 1))

    return log_probs


# Predict using posterior probabilities
def predict(x_test, log_probs):
    posterior = np.matmul(x_test, log_probs.T)
    prediction = np.argmax(posterior, axis=1)

    return prediction

In [7]:
df_pos, df_neg = split_sentiment(df)
df_train, df_test = split_train_test(df_pos, df_neg, 222)

x_train, x_test, y_train, y_test = create_xy(df_train, df_test)
log_probs = feature_log_probs(x_train, y_train)

y_predict = predict(x_test, log_probs)
accuracy = metrics.accuracy_score(y_test, y_predict)

print(f"Accuracy of Naive Bayes Model: {accuracy}")

Accuracy of Naive Bayes Model: 0.8524


### CNN

In [18]:
# Training and testing data

print(f"Size of training data: {len(df_train)}")
print(f"Size of testing data: {len(df_test)}")

Size of training data: 40000
Size of testing data: 10000


In [9]:
# Create vocab and its length

train_words = [word for text in df_train["token"] for word in text]
train_text_length = [len(text) for text in df_train["token"]]

vocab = list(set(train_words))
vocab_size = len(vocab)

print(f"There are {len(train_words)} train words in total and the vocabulary size is {vocab_size}.")

There are 4879067 train words in total and the vocabulary size is 78508.


In [10]:
# Create word vectors

train_data = df_train["clean"].tolist()
test_data = df_test["clean"].tolist()

tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(train_data)

word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_data)
test_sequences = tokenizer.texts_to_sequences(test_data)

# max_length = max(max([len(x) for x in train_sequences]), max([len(x) for x in test_sequences]))
max_length = 200

x_train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post")
x_test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post")

ytrain, ytest = np.asarray(df_train["sentiment"]), np.asarray(df_test["sentiment"])

print(max_length)
print(x_train_padded.shape)
print(x_test_padded.shape)
print(ytrain.shape)
print(ytest.shape)
print(f"Found {len(word_index)} unique tokens")

200
(40000, 200)
(10000, 200)
(40000,)
(10000,)
Found 78508 unique tokens


In [19]:
# Build model

model = Sequential()
model.add(Embedding(230000, 256, input_length=max_length))
model.add(Conv1D(filters=128, kernel_size=5))
model.add(Activation("relu"))
model.add(MaxPooling1D(5))
model.add(Conv1D(filters=128, kernel_size=5))
model.add(Activation("relu"))
model.add(MaxPooling1D(5))
model.add(Conv1D(filters=64, kernel_size=5))
model.add(Activation("relu"))
model.add(GlobalMaxPooling1D())
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

print(model.summary())

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 200, 256)          58880000  
                                                                 
 conv1d_9 (Conv1D)           (None, 196, 128)          163968    
                                                                 
 activation_9 (Activation)   (None, 196, 128)          0         
                                                                 
 max_pooling1d_8 (MaxPooling  (None, 39, 128)          0         
 1D)                                                             
                                                                 
 conv1d_10 (Conv1D)          (None, 35, 128)           82048     
                                                                 
 activation_10 (Activation)  (None, 35, 128)           0         
                                                      

In [20]:
# Compile model

EPOCHS = 10
LEARNING_RATE = 0.001
DECAY_RATE = LEARNING_RATE / EPOCHS
MOMENTUM = 0.1

optimizer = Adam(learning_rate=LEARNING_RATE)
# optimizer = SGD(learning_rate=LEARNING_RATE, momentum=MOMENTUM, decay=DECAY_RATE)

model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Fit model

BATCH_SIZE = 64

history = model.fit(x_train_padded, ytrain, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1, validation_split=0.2)