In [2]:
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers, models
import nltk
from sklearn.metrics import classification_report, accuracy_score
import keras

In [9]:
data = pd.read_csv('reviews.csv')


In [11]:
def remove_tags(string):
    removelist = ""
    result = re.sub(r'<.*?>', '', string)  # remove HTML tags
    result = re.sub(r'https?://\S+', '', result)  # remove URLs
    result = re.sub(r'[^a-zA-Z\s]', ' ', result)  # remove non-alphanumeric characters
    result = result.lower()  # convert to lowercase
    return result

In [10]:
data.head()

Unnamed: 0,Title,Review,Date,Rating
0,Never seen the anime and still enjoyed it,I have to admit that I've never watched any of...,4 September 2023,8.0
1,Not My Kind of Show. But This I Loved.,I'm a 60yo man & old school like the The Marin...,4 September 2023,9.0
2,I'm Shocked By How Good This Is...,Every live action anime that I've briefly seen...,1 September 2023,9.0
3,See Netflix?? You CAN do it...,This is bar none one of the best live-action a...,4 September 2023,10.0
4,WE WANT SEASON 2,"Being a one piece fan myself, I was a bit inse...",31 August 2023,10.0


In [13]:
data['Review'] = data['Review'].apply(remove_tags)


In [15]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

data['Review'] = data['Review'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\btc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\btc\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\btc\AppData\Roaming\nltk_data...


True

In [17]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


In [19]:
def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)])

data['Review'] = data['Review'].apply(lemmatize_text)

# Calculate average length of reviews
avg_length = np.mean([len(review.split()) for review in data['Review']])
print("Average length of each review:", avg_length)


Average length of each review: 99.3371298405467


In [21]:
positive_reviews = data[data['Rating'] >= 5.0].shape[0]
negative_reviews = data.shape[0] - positive_reviews

print(f"Percentage of reviews with positive sentiment is {positive_reviews / data.shape[0] * 100:.2f}%")
print(f"Percentage of reviews with negative sentiment is {negative_reviews / data.shape[0] * 100:.2f}%")

Percentage of reviews with positive sentiment is 91.23%
Percentage of reviews with negative sentiment is 8.77%


In [27]:
labels = data['Rating'].values
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)



In [28]:
train_sentences, test_sentences, train_labels, test_labels = train_test_split(data['Review'], encoded_labels, stratify=encoded_labels)

In [29]:
vocab_size = 3000
embedding_dim = 100
max_length = int(avg_length)  # can adjust based on actual data
padding_type = 'post'
trunc_type = 'post'


In [30]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)


In [32]:
import keras
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])



In [33]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [34]:
model.summary()

In [35]:
num_epochs = 10  # Increased epochs for better training
history = model.fit(train_padded, train_labels, epochs=num_epochs, validation_split=0.1, verbose=1)


Epoch 1/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 127ms/step - accuracy: 0.0244 - loss: -1.7958 - val_accuracy: 0.0000e+00 - val_loss: -33.0909
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 78ms/step - accuracy: 0.0079 - loss: -50.1055 - val_accuracy: 0.0000e+00 - val_loss: -106.7176
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 77ms/step - accuracy: 0.0139 - loss: -124.1162 - val_accuracy: 0.0000e+00 - val_loss: -191.0455
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 82ms/step - accuracy: 0.0109 - loss: -212.7804 - val_accuracy: 0.0000e+00 - val_loss: -280.0308
Epoch 5/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 80ms/step - accuracy: 0.0179 - loss: -300.8469 - val_accuracy: 0.0000e+00 - val_loss: -377.1138
Epoch 6/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 80ms/step - accuracy: 0.0235 - loss: -400.5407 - val_accuracy: 0.0000e+00

In [41]:
predictions = model.predict(test_padded)
pred_labels = [1 if pred >= 0.5 else 0 for pred in predictions]



[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step


In [37]:
print("Accuracy of prediction on test set:", accuracy_score(test_labels, pred_labels))


Accuracy of prediction on test set: 0.013636363636363636


In [38]:
example_sentences = [
    "The movie was very touching and heartwarming",
    "I have never seen a terrible movie like this",
    "The movie plot is terrible but it had good acting"
]

example_sequences = tokenizer.texts_to_sequences(example_sentences)
example_padded = pad_sequences(example_sequences, padding=padding_type, maxlen=max_length)

predictions = model.predict(example_padded)
pred_labels = ['Positive' if pred >= 0.5 else 'Negative' for pred in predictions]

for i, sentence in enumerate(example_sentences):
    print(f"{sentence}\nPredicted sentiment: {pred_labels[i]}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
The movie was very touching and heartwarming
Predicted sentiment: Positive
I have never seen a terrible movie like this
Predicted sentiment: Positive
The movie plot is terrible but it had good acting
Predicted sentiment: Positive
