# Initial Setup
Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import os

Read in the data and shuffle it using `df.sample`.

In [2]:
# REMEMBER TO MOUNT THE CSV WHICH CAN BE FOUND IN THE GD
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/IT1244 Project/Movie Review/data.csv'
df = pd.read_csv(file_path)
df['Type'] = np.where(df['Type'] == 'pos', 1, 0)
# df.head(10)

Mounted at /content/drive


In [3]:
df_new = df.sample(frac = 0.2)
df_new.head(10)

Unnamed: 0,Type,Number,Rating,Content
21671,1,23914,8,I love this movie. I just saw it for the first...
21622,1,14927,10,I have to say this movie is absolutely amazing...
20014,1,24770,10,Well...I like this movie first of all because ...
9391,1,6112,8,"Todd Rohal is a mad genius. ""Knuckleface Jones..."
41493,0,19627,4,I saw this movie today on the big screen and i...
14699,1,11097,8,A Bugs Life is a great film that is not just f...
35155,0,14846,1,The director of this movie is a famous french ...
45662,0,12523,4,I'm a huge fan of both Emily Watson (Breaking ...
34750,0,24352,2,Solo is a poor film - that cannot be ignored. ...
42396,0,13646,4,"James Stewart plays Johnny Mason, lawyer. Caro..."


# Data Pre-processing

Now that we have taken in the dataset, we ought to preprocess the reviews to prepare it for our model.

Following [this link](https://spotintelligence.com/2022/12/21/nltk-preprocessing-pipeline/), we have a rough idea of what we need to do for preprocessing.

1. remove HTML tags





The line of code below removes HTML tags, a sample execution is provided.

The goal now will be to do this for all entries.

In [4]:
# import stopwords nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))
# def remove_stop_words(sentence):
#   words = sentence.split()
#   filtered_words = [word for word in words if word not in stop_words]
#   return ''.join(filtered_words)

In [5]:
df_new.head(10)

Unnamed: 0,Type,Number,Rating,Content
21671,1,23914,8,I love this movie. I just saw it for the first...
21622,1,14927,10,I have to say this movie is absolutely amazing...
20014,1,24770,10,Well...I like this movie first of all because ...
9391,1,6112,8,"Todd Rohal is a mad genius. ""Knuckleface Jones..."
41493,0,19627,4,I saw this movie today on the big screen and i...
14699,1,11097,8,A Bugs Life is a great film that is not just f...
35155,0,14846,1,The director of this movie is a famous french ...
45662,0,12523,4,I'm a huge fan of both Emily Watson (Breaking ...
34750,0,24352,2,Solo is a poor film - that cannot be ignored. ...
42396,0,13646,4,"James Stewart plays Johnny Mason, lawyer. Caro..."


Now, we perform train-test split.

In [6]:
from sklearn.model_selection import train_test_split
y = df_new['Type'].values
content = df_new['Content'].values

content_train, content_test, y_train, y_test = train_test_split(
    content, y, test_size = 0.1, random_state = 1000
)

Next, we tokenise the content.

In [7]:
# using bag of words

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(content_train)
X_train = vectorizer.transform(content_train)
X_test = vectorizer.transform(content_test)
X_train


<9000x49924 sparse matrix of type '<class 'numpy.int64'>'
	with 1228822 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter = 1000)
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print('Accuracy:', score)

Accuracy: 0.859


# NTLK Bag-of-Words
This section will will use the NTLK Bag of Words.

NLTK Bag-of-Words

In [9]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [10]:
import re
from nltk.stem import WordNetLemmatizer

def preprocess_text(sentence):
  # to preprocess it, step by step
  # first, remove HTML tags
  pattern = r"<[^>]+>"
  cleaned_text = re.sub(pattern, "", sentence)

  # next, convert all to lowercase
  cleaned_text = cleaned_text.lower()

  # next, tokenize the sentence
  tokens = nltk.word_tokenize(cleaned_text)
  # next, lemmatize the sentence
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(token) for token in tokens]
  return tokens


In [11]:
df

Unnamed: 0,Type,Number,Rating,Content
0,1,20935,9,"I just watched ""return from lonesome dove"" and..."
1,1,12390,8,This movie looked like a classic in the cheesy...
2,1,9820,8,Jay Chou plays an orphan raised in a kung fu s...
3,1,883,7,"Ooverall, the movie was fairly good, a good ac..."
4,1,9063,8,"This movie is fun to watch. If you liked ""Dave..."
...,...,...,...,...
49995,0,16046,1,"Anyone remember the first CKY, CKY2K etc..? Ba..."
49996,0,13620,1,John Madden's cinematic interpretation of Edit...
49997,0,16805,1,Lazy movie made by a lazy director. The charac...
49998,0,11556,1,I made the big mistake of actually watching th...


In [12]:
# import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Load data
# data = pd.read_csv('your_dataset.csv')

# Data preprocessing
df['Content'] = df['Content'].apply(preprocess_text)

# Split data into training and testing sets
X = df['Content']
y = df['Type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, tokenizer = lambda x: x, lowercase = False)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)



Using `sklearn`

In [14]:
from sklearn import svm
from sklearn.metrics import accuracy_score

SVM = svm.SVC(C = 1.0, kernel = 'linear', degree = 3, gamma = 'auto')
SVM.fit(X_train, y_train)
predictions_SVM = SVM.predict(X_test)
print("SVM Accuracy Score:", accuracy_score(predictions_SVM, y_test))

SVM Accuracy Score: 0.8888


In [19]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [20]:
from thundersvm import SVC

my_svm = SVC(c = 1.0, kernel = 'linear', degree = 3, gamma = 'auto')
my_svm.fit(X_train, y_train)
predict_svm1 = my_svm.predict(X_test)
print("SVM Accuracy Score:", accuracy_score(predictions_SVM, y_test))

OSError: libcusparse.so.9.0: cannot open shared object file: No such file or directory

Now, we use Keras to build our neural network.

In [None]:
from keras.models import Sequential
from keras import layers

In [None]:
import keras
keras.backend.clear_session()

In [None]:
input_dim = X_train.shape[1]
model = Sequential()
model.add(layers.Dense(10, input_dim = input_dim, activation = 'relu'))
model.add(layers.Dense(10, activation = 'softmax'))
model.add(layers.Dense(1, activation='sigmoid'))

In [None]:
model.compile(loss='binary_crossentropy',
              optimizer = 'sgd',
              metrics = ['accuracy'])
model.summary()

Now, we test our neural network model to see whether it performs well.

In [None]:
history = model.fit(X_train, y_train,
                    epochs = 10,
                    validation_data = (X_test, y_test),
                    batch_size = 10)

Ok so running the model failed, maybe it's because my dataset is too big. Just leave the other stuff here first.

Update 26th March 2024: Change pos/neg to 1/0.

In [None]:
loss, accuracy = model.evaluate(X_train, y_train)
print('Training loss: {:.4f}'.format(loss))
print('Training accuracy: {:.4f}'.format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test)
print('Testing loss: {:.4f}'.format(loss))
print('Testing accuracy: {:.4f}'.format(accuracy))