**Feedforward neural network for Classification**

In [1]:
# ignoring warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

TEXT PREPROCESSING

In [3]:
df = pd.read_csv("../naive_bayes/movie_data.csv", encoding="utf-8")
df.shape

(50000, 2)

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [5]:
# clean the text data
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def preprocessor(text):
    """Normalize a raw review string while preserving emoticons.

    Steps:
      1. Remove HTML tags.
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` before they are
         destroyed by the punctuation stripping.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` nose removed),
         separated by spaces, to the end of the cleaned text.

    Args:
        text: the raw review as a string.

    Returns:
        The cleaned, lowercased string followed by the emoticons found.
    """
    # Raw strings keep the regex escapes (\W, \(, \)) from being interpreted
    # as (invalid) Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if not emoticons:
        return cleaned
    suffix = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would be fused to the last word
    # whenever the cleaned text does not already end in a space.
    separator = '' if cleaned.endswith(' ') else ' '
    return cleaned + separator + suffix

def tokenizer(text):
    """Split ``text`` into a list of tokens via NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

stop = stopwords.words("english")
# Frozen snapshot of `stop` for constant-time membership tests; the list is
# kept as-is for any code that reads it, but later changes to `stop` are not
# reflected in this set.
_stop_set = frozenset(stop)

def remove_stopwords(text):
    """Return the tokens of ``text`` that are not English stop words.

    Args:
        text: iterable of tokens (strings).

    Returns:
        A new list with the tokens that do not appear in the stop-word list,
        in their original order.
    """
    return [w for w in text if w not in _stop_set]

lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
    """Lemmatize every token of ``text`` with the module-level ``lemmatizer``.

    Args:
        text: iterable of tokens.

    Returns:
        A list with one lemmatized token per input token, in the same order.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [6]:
# Full text pipeline, one stage per line: clean -> tokenize -> drop stop words -> lemmatize.
reviews = (
    df["review"]
    .apply(preprocessor)
    .apply(tokenizer)
    .apply(remove_stopwords)
    .apply(word_lemmatizer)
)

In [7]:
reviews

0        [1974, teenager, martha, moxley, maggie, grace...
1        [ok, really, like, kris, kristofferson, usual,...
2        [spoiler, read, think, watching, movie, althou...
3        [hi, people, seen, wonderful, movie, im, sure,...
4        [recently, bought, dvd, forgetting, much, hate...
                               ...                        
49995    [ok, let, start, best, building, although, har...
49996    [british, heritage, film, industry, control, n...
49997    [even, know, begin, one, family, worst, line, ...
49998    [richard, tyler, little, boy, scared, everythi...
49999    [waited, long, watch, movie, also, like, bruce...
Name: review, Length: 50000, dtype: object

BUILDING OUR EMBEDDING MATRIX

In [8]:
from gensim.models import Word2Vec

# Word2Vec over the tokenized reviews: 128-dimensional vectors, context window
# of 5, 4 worker threads; sg=1 and negative=7 select the training variant
# (skip-gram with negative sampling, per the gensim docs).
# NOTE(review): per the gensim docs, passing `sentences` to the constructor
# already builds the vocabulary and trains the model, so the explicit train()
# call in the next cell adds further epochs on top of that - confirm this is
# intended.
word2vec_model = Word2Vec(sentences=reviews, vector_size=128, window=5, min_count=1, workers=4, sg=1, negative=7)

In [9]:
word2vec_model.train(reviews, total_examples=word2vec_model.corpus_count, epochs=10)

(56937517, 59792650)

In [10]:
word2vec_model.wv["movie"].shape

(128,)

In [11]:
word2vec_model.wv.most_similar("gandalf")

[('aragorn', 0.7307092547416687),
 ('hobbit', 0.6770522594451904),
 ('bilbo', 0.673820972442627),
 ('saurmon', 0.6453695893287659),
 ('thun', 0.6447776556015015),
 ('orcs', 0.6405193209648132),
 ('almghandi', 0.6402015686035156),
 ('galadriel', 0.6353176832199097),
 ('balrog', 0.6341960430145264),
 ('gollum', 0.6246077418327332)]

In [12]:
# saving the model
import os

# exist_ok=True creates the directory only when it is missing, without the
# separate (and race-prone) existence check.
os.makedirs("./models", exist_ok=True)

word2vec_model.save("./models/word2vec_model_IMDB.model")

In [13]:
# loading the model
word2vec_model = Word2Vec.load("./models/word2vec_model_IMDB.model")
word2vec_model.wv.most_similar("gandalf")

[('aragorn', 0.7307092547416687),
 ('hobbit', 0.6770522594451904),
 ('bilbo', 0.673820972442627),
 ('saurmon', 0.6453695893287659),
 ('thun', 0.6447776556015015),
 ('orcs', 0.6405193209648132),
 ('almghandi', 0.6402015686035156),
 ('galadriel', 0.6353176832199097),
 ('balrog', 0.6341960430145264),
 ('gollum', 0.6246077418327332)]

In [14]:
labels = df["sentiment"].values
print(len(labels))
print(labels[34], reviews[34])

50000
1 ['guess', 'one', 'sided', 'relationship', 'sort', 'able', 'identify', 'lead', 'character', 'minako', 'yuko', 'tanaka', '50', 'year', 'old', 'woman', 'still', 'pink', 'good', 'health', 'demonstrated', 'daily', 'grinding', 'routine', 'waking', 'extremely', 'early', 'morning', 'prepare', 'milk', 'delivery', 'work', 'lug', 'bottle', 'megmilk', 'bag', 'route', 'around', 'town', 'like', 'clockwork', 'exchange', 'empty', 'bottle', 'full', 'one', 'collect', 'payment', 'issue', 'receipt', 'always', 'one', 'delivery', 'stop', 'right', 'top', 'needing', 'scale', 'long', 'flight', 'stair', 'order', 'achieve', 'customer', 'satisfaction', 'peculiar', 'enough', 'stop', 'happened', 'stop', 'delivering', 'man', 'love', 'almost', 'teenage', 'adult', 'life', 'product', 'appreciated', 'poured', 'sink', 'gone', 'school', 'see', 'talking', 'daily', 'life', 'always', 'seem', 'close', 'physically', 'yet', 'far', 'away', 'eye', 'contact', 'save', 'cursory', 'glance', 'chance', 'little', 'acknowledgemen

In [15]:
# NOTE(review): repeats the `labels` assignment already made in the previous cell.
labels = df["sentiment"].values

![ffn_classification](./images/ffn_class.png)

DEFINING THE FEATURE VECTORS (mean of embeddings in a text sequence)

In [16]:
# building the feature vectors, each being the mean of all the word vectors in a review

def build_feature_vectors(text, model, embed_size):
    """Average the embeddings of the known words of one tokenized text.

    Args:
        text: iterable of tokens.
        model: object whose ``wv`` attribute supports ``token in wv`` and
            ``wv[token]`` lookups (here a trained Word2Vec model).
        embed_size: length of the returned vector; it has to match the length
            of the vectors stored in ``model.wv``.

    Returns:
        A 1-D array of length ``embed_size`` holding the element-wise mean of
        the vectors of all tokens found in ``model.wv``. When no token is
        found, the array of zeros is returned unchanged.
    """
    known_vectors = [model.wv[token] for token in text if token in model.wv]

    # Accumulate starting from float32 zeros, one vector at a time.
    total = np.zeros((embed_size,), dtype="float32")
    for vector in known_vectors:
        total = total + vector

    # Skip the division when nothing matched to avoid dividing by zero.
    if known_vectors:
        total = total / len(known_vectors)
    return total

In [17]:
# applying the function to all the reviews
# Take the embedding width from the trained model instead of repeating the
# literal 128, so the features stay consistent if the model is retrained with
# a different vector_size.
feature_vectors = [
    build_feature_vectors(text, word2vec_model, word2vec_model.vector_size)
    for text in reviews
]

In [18]:
feature_vectors = np.array(feature_vectors)
feature_vectors.shape

(50000, 128)

In [19]:
from sklearn.model_selection import train_test_split
# Fixed random_state makes the split (and the metrics reported below)
# reproducible across kernel restarts; stratify keeps the positive/negative
# ratio identical in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    feature_vectors,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [20]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((40000, 128), (10000, 128), (40000,), (10000,))

BUILD THE MODEL

In [21]:
import tensorflow as tf

tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [22]:
# Feedforward binary classifier built layer by layer:
# 128 inputs -> 300 -> 64 -> 32 (ReLU) -> dropout 0.3 -> 1 sigmoid unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(300, input_dim=128, activation="relu"))
model.add(tf.keras.layers.Dense(64, activation="relu"))
model.add(tf.keras.layers.Dense(32, activation="relu"))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 300)               38700     
                                                                 
 dense_1 (Dense)             (None, 64)                19264     
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 60,077
Trainable params: 60,077
Non-trainable params: 0
__________________________________________

In [23]:
# Train for 10 epochs with mini-batches of 32 samples.
# NOTE(review): the test split is passed as validation data here and is also
# the split scored by model.evaluate() in the next cell, so no separate
# held-out set remains - confirm this is acceptable.
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10


2023-06-11 17:57:01.023991: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2cb469270>

In [24]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is accuracy.
results = model.evaluate(X_test, y_test)
test_accuracy = results[1]
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.8866999745368958


In [25]:
# saving the model
# exist_ok=True creates the directory only when it is missing, without the
# separate (and race-prone) existence check.
os.makedirs("./models", exist_ok=True)

model.save("./models/ffn_classification_IMDB.h5")

In [26]:
model = tf.keras.models.load_model("./models/ffn_classification_IMDB.h5")

# Example review to classify; it keeps its own name so the raw text stays
# available after the intermediate stages below.
review = "such an awesome movie"

# Same preprocessing chain that was applied to the training reviews.
review_tokens = preprocessor(review)
review_tokens = tokenizer(review_tokens)
review_tokens = remove_stopwords(review_tokens)
review_tokens = word_lemmatizer(review_tokens)

# One averaged embedding, reshaped to a batch containing a single sample.
review_vector = build_feature_vectors(review_tokens, word2vec_model, 128)
review_vector = np.array(review_vector).reshape(1, -1)

pred = model.predict(review_vector)

# `pred` holds one row per sample and one column (the sigmoid output); pull
# out the scalar instead of relying on the truth value of an array.
positive_probability = float(pred[0, 0])

if positive_probability > 0.5:
    print("Positive review")
else:
    print("Negative review")

Positive review
