## Importing data and testing the actual model

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

Read in the data and shuffle it using `df.sample`.

In [2]:
# REMEMBER TO MOUNT THE CSV WHICH CAN BE FOUND IN THE GD
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/IT1244 Project/Movie Review/data.csv'

Mounted at /content/drive


## Preprocessing Data

Now that we have taken in the dataset, we ought to preprocess the reviews and perform feature selection to prepare it for our model.

Following [this link](https://spotintelligence.com/2022/12/21/nltk-preprocessing-pipeline/), we have a rough idea of what we need to do for preprocessing.

1. remove HTML tags
2. convert everything to lowercase
3. tokenize sentences (to make them easier to vectorize & lemmatize)
4. lemmatize the words (reduce words to their base form)




In [3]:
import re
from nltk.stem import WordNetLemmatizer

def clean_text(sentence):
  # to preprocess it, step by step
  # first, remove HTML tags
  pattern = r"<[^>]+>"
  cleaned_text = re.sub(pattern, "", sentence)

  # next, convert all to lowercase
  cleaned_text = cleaned_text.lower()
  return cleaned_text


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline

# Load data
df = pd.read_csv(file_path)
df['Type'] = np.where(df['Type'] == 'pos', 1, 0)


In [5]:
df

Unnamed: 0,Type,Number,Rating,Content
0,1,20935,9,"I just watched ""return from lonesome dove"" and..."
1,1,12390,8,This movie looked like a classic in the cheesy...
2,1,9820,8,Jay Chou plays an orphan raised in a kung fu s...
3,1,883,7,"Ooverall, the movie was fairly good, a good ac..."
4,1,9063,8,"This movie is fun to watch. If you liked ""Dave..."
...,...,...,...,...
49995,0,16046,1,"Anyone remember the first CKY, CKY2K etc..? Ba..."
49996,0,13620,1,John Madden's cinematic interpretation of Edit...
49997,0,16805,1,Lazy movie made by a lazy director. The charac...
49998,0,11556,1,I made the big mistake of actually watching th...


In [6]:
# Data preprocessing
df['Content'] = df['Content'].apply(clean_text)

# Use a text embedding layer to create the neural network

In [7]:
import keras
keras.backend.clear_session()

In [8]:
import tensorflow as tf
import tensorflow_hub as hub
embedding = "https://tfhub.dev/google/nnlm-en-dim50/2"
hub_layer = hub.KerasLayer(embedding, input_shape = [],
                           dtype = tf.string, trainable = True)

In [9]:
sentences_train, sentences_test, y_train, y_test = train_test_split(df['Content'], df['Type'], test_size=0.5)

In [10]:
from keras.models import Sequential
from keras import layers

model = Sequential()
model.add(hub_layer)
model.add(layers.Dense(16, activation = 'relu'))
model.add(layers.Dense(1, activation = 'sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer (KerasLayer)    (None, 50)                48190600  
                                                                 
 dense (Dense)               (None, 16)                816       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 48191433 (183.84 MB)
Trainable params: 48191433 (183.84 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
model.compile(optimizer = 'adam',
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])

In [12]:
history = model.fit(sentences_train,
                    y_train,
                    epochs = 10,
                    batch_size = 512,
                    validation_split = 0.4)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
loss, accuracy = model.evaluate(sentences_train, y_train,
                                batch_size = 256)
print('Training loss: {:.4f}'.format(loss))
print('Training accuracy: {:.4f}'.format(accuracy))

loss, accuracy = model.evaluate(sentences_test, y_test,
                                batch_size = 256)
print('Testing loss: {:.4f}'.format(loss))
print('Testing accuracy: {:.4f}'.format(accuracy))

Training loss: 0.1599
Training accuracy: 0.9447
Testing loss: 0.3496
Testing accuracy: 0.8669


# Lemmatize & Tokenize our Sentences

In [14]:
def token_lemma(cleaned_text):
  # next, tokenize the sentence
  tokens = nltk.word_tokenize(cleaned_text)
  # next, lemmatize the sentence
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(token) for token in tokens]
  return tokens

# Neural Network using Keras: LSTM

In [15]:
keras.backend.clear_session()

In [16]:
# Perform train, test, validation split again as we want to sample new dataset
# from sklearn.model_selection import train_test_split
# train, test = train_test_split(df['Content'], df['Type'], test_size=0.5)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.4)

In [17]:
# even after the text is lemmatized & tokenized by NLTK package, we still need to pass it into a Keras tokenizer layer

from keras.preprocessing.text import Tokenizer

# tokenizer text and create the vocabulary
tokenizer = Tokenizer(num_words = 10000, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(sentences_train)
X_train_model = tokenizer.texts_to_sequences(sentences_train)


In [18]:
from keras.preprocessing.sequence import pad_sequences
X_test_model = tokenizer.texts_to_sequences(sentences_test)
# Pad the sequences so that they all have the same length
X_train_model = pad_sequences(X_train_model, maxlen = 100)
X_test_model = pad_sequences(X_test_model, maxlen = 100)

In [19]:
model = Sequential()
model.add(layers.Embedding(input_dim = 10000, output_dim = 64, input_length = 100))
model.add(layers.LSTM(units = 64))
model.add(layers.Dense(128, activation = 'relu'))
model.add(layers.Dense(1, activation = 'sigmoid')) # don't use softmax for binary_crossentropy, worst mistake of my life!!!
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 64)           640000    
                                                                 
 lstm (LSTM)                 (None, 64)                33024     
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 681473 (2.60 MB)
Trainable params: 681473 (2.60 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

In [21]:
model.fit(X_train_model,
          y_train,
          batch_size = 512,
          epochs = 10,
          validation_split = 0.4)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7bcfc1c3b430>

In [22]:
loss, accuracy = model.evaluate(X_train_model, y_train)
print('Training loss: {:.4f}'.format(loss))
print('Training accuracy: {:.4f}'.format(accuracy))
loss, accuracy = model.evaluate(X_test_model, y_test)
print('Testing loss: {:.4f}'.format(loss))
print('Testing accuracy: {:.4f}'.format(accuracy))

Training loss: 0.3615
Training accuracy: 0.9272
Testing loss: 0.8914
Testing accuracy: 0.8206


# Using Naive Bayes Classifier

Let's try doing this, but with a less computationally expensive classifier: Naive Bayes.

In [23]:
# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features = 5000)

In [None]:
# If you want to pass lemmatized sentences instead, run this block and not the previous.

# sentences_train = sentences_train.apply(token_lemma)
# sentences_test = sentences_test.apply(token_lemma)
# vectorizer = TfidfVectorizer(max_features = 5000, tokenizer = lambda x: x, lowercase = False)

In [None]:
vectorizer.fit(sentences_train)
X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)

In [24]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

nb_model = MultinomialNB()
nb_model.fit(X_train, y_train) # no need to pick countvectorizer or tfidf as the pipeline will already convert them

# nb_count = make_pipeline(count, MultinomialNB())
# nb_count.fit(train['Content'], train['Type'])

print("Train Test Accuracy (TF-IDF):", accuracy_score(nb_model.predict(X_test),y_test))
# print("Train Test Accuracy (CountVec)", accuracy_score(nb_count.predict(test['Content']), test['Type']))

Train Test Accuracy (TF-IDF): 0.85664


## Comments (Naive Bayes):




### Before tokenizing & lemmatizing

If I use a Bag-of-Words vectorizer, the accuracy is as follows:
```
Validation Set : 83.57%
Test Set       : 84.144%
```

If I use TF-IDF Vectorizer, the accuracy is as follows:
```
Validation Set : 85.315%
Test Set       : 86.012%
```

### After tokenizing & lemmatizing

If I use a Bag-of-Words vectorizer, the accuracy is as follows:
```
Validation Set : 82.36%
Test Set       : 82.312%
```

If I use TF-IDF Vectorizer, the accuracy is as follows:
```
Validation Set : 84.77%
Test Set       : 84.952%
```

# Using Logistic Regression Classifier

In [25]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(max_iter = 1000)
classifier.fit(X_train, y_train)

print("Test accuracy:", classifier.score(X_test, y_test))

Test accuracy: 0.889


In [26]:
# classifier = make_pipeline(count, LogisticRegression(max_iter = 1000))
# classifier.fit(train['Content'], train['Type'])

# print("Test accuracy:", classifier.score(test['Content'], test['Type']))

## Comments (Logistic Regression)

### Before tokenizing & lemmatizing

If I use a Bag-of-Words vectorizer, the accuracy is as follows:
```
Validation Set : 86.645%
Test Set       : 91.884%
```

If I use TF-IDF Vectorizer, the accuracy is as follows:
```
Validation Set : 88.68%
Test Set       : 90.188%
```

### After tokenizing & lemmatizing

If I use a Bag-of-Words vectorizer, the accuracy is as follows:
```
Validation Set : 85.5%
Test Set       : 85.952%
```

If I use TF-IDF Vectorizer, the accuracy is as follows:
```
Validation Set : 87.74%
Test Set       : 87.548%
```

# Neural Network using Keras: Regular Feed Forward using TFIDF Vectorizer

The general direction of this section comes from this article on practical text classification from the website Real Python.

https://realpython.com/python-keras-text-classification/#your-first-keras-model

In [45]:
keras.backend.clear_session()

In [46]:
input_dim = X_train.shape[1]
model = Sequential()
model.add(layers.Dense(10, input_dim = input_dim, activation = 'relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [47]:
model.compile(loss='binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10)                50010     
                                                                 
 dense_1 (Dense)             (None, 1)                 11        
                                                                 
Total params: 50021 (195.39 KB)
Trainable params: 50021 (195.39 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


Now, we test our neural network model to see whether it performs well.

In [30]:
X_train.shape

(25000, 5000)

In [48]:
history = model.fit(X_train.toarray(),  #keras throws a tantrum if you pass a csr matrix to it
                    y_train,
                    epochs = 10,
                    validation_split = 0.4) # it runs pretty fast if you set batch_size to 128, each epoch takes around 5 seconds

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Ok so running the model failed, maybe it's because my dataset is too big. Just leave the other stuff here first.

Update 26th March 2024: Change pos/neg to 1/0.

In [49]:
loss, accuracy = model.evaluate(X_train.toarray(), y_train)
print('Training loss: {:.4f}'.format(loss))
print('Training accuracy: {:.4f}'.format(accuracy))
loss, accuracy = model.evaluate(X_test.toarray(), y_test)
print('Testing loss: {:.4f}'.format(loss))
print('Testing accuracy: {:.4f}'.format(accuracy))

Training loss: 0.1962
Training accuracy: 0.9316
Testing loss: 0.3363
Testing accuracy: 0.8725


TF-IDF Vectorizer Neural Network (with Tokenization & Lemmatization) Results!

```
Training loss       : 0.4187
Training accuracy   : 0.9175

Validation loss     : 0.4497
Validation accuracy : 0.8775

Testing loss        : 0.4498
Testing accuracy    : 0.8751
```

CountVectorizer Neural Network (with Tokenization & Lemmatization) Results


```
Training loss       : 0.4218
Training accuracy   : 0.9579

Validation loss     : 0.4734
Validation accuracy : 0.8776

Testing loss        : 0.4730
Testing accuracy    : 0.8788
```


# Using Word2Vec

The source is at https://spotintelligence.com/2023/02/15/word2vec-for-text-classification/#Text_classification_using_Word2Vec_Python

First, we preprocess the text data.

In [33]:
# Train the Word2Vec model
from gensim.models import Word2Vec
sentences = [sentence.split() for sentence in sentences_train]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)

In [34]:
# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

X_train_w2v = tokenizer.texts_to_sequences(sentences_train)
X_test_w2v = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index) + 1

In [35]:
# Pad the sequences to a fixed length
max_length = 100
X_train_w2v = pad_sequences(X_train_w2v, maxlen=max_length, padding='post')
# X_val_w2v = pad_sequences(X_val_w2v, maxlen=max_length, padding='post')
X_test_w2v = pad_sequences(X_test_w2v, maxlen=max_length, padding='post')

In [36]:
# Create a weight matrix for the embedding layer
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

In [37]:
keras.backend.clear_session()

In [38]:
# Define the CNN model
model = Sequential()
model.add(layers.Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(128, 5, activation='relu'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [39]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_w2v, y_train, epochs=10, batch_size=512,
          validation_split = 0.4)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7bcfc1d8b460>

In [40]:
loss, accuracy = model.evaluate(X_train_w2v, y_train)
print('Training loss: {:.4f}'.format(loss))
print('Training accuracy: {:.4f}'.format(accuracy))
# loss, accuracy = model.evaluate(X_val_w2v, y_val)
# print('Validation loss: {:.4f}'.format(loss))
# print('Validation accuracy: {:.4f}'.format(accuracy))
loss, accuracy = model.evaluate(X_test_w2v, y_test)
print('Testing loss: {:.4f}'.format(loss))
print('Testing accuracy: {:.4f}'.format(accuracy))

Training loss: 0.2957
Training accuracy: 0.8924
Testing loss: 0.5691
Testing accuracy: 0.7637


# Final comments

A neural network where the first layer is a text embedding model seems to take the longest to fit. This could be blamed on the fact that the text embedding model takes in many parameters (48 million!) compared to other neural network models taking in vectors.