# Import Libraries

In [None]:
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import utils
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from there.
drive.mount('/content/drive')

Mounted at /content/drive


# Read Data

In [None]:
# Load the IMDB review dataset from the mounted Drive folder.
csv_path = '/content/drive/MyDrive/DSM COURSE/NOTEBOOK/NLP/Sentiment Anslysist/IMDB Dataset.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows to check that the columns loaded as expected.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Count missing values per column.
data.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [None]:
# NOTE(review): same preview as the earlier data.head() cell; the frame has
# not been modified in between.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# (rows, columns) of the loaded frame.
data.shape

(50000, 2)

# Preprocessing

In [None]:
# Download the NLTK stop-word corpus, then build the two module-level helpers
# that clean() relies on: an English Snowball stemmer and the English
# stop-word set (a set, so membership checks are cheap).
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
stopword=set(stopwords.words('english'))

def clean(text, stop_words=None, stem=None):
    """Normalize one review into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, drop ``[...]`` spans, drop URLs, replace HTML
    tags with a space, strip punctuation, drop any token containing a digit,
    remove stop words, and stem what remains.

    Args:
        text: The raw review. Any value is accepted; it is passed through
            ``str()`` first.
        stop_words: Optional collection of words to discard. Defaults to the
            module-level ``stopword`` set.
        stem: Optional callable mapping a word to its stem. Defaults to
            ``stemmer.stem`` from the module-level stemmer.

    Returns:
        The cleaned text, with tokens joined by single spaces.
    """
    # Resolve the defaults at call time so the module-level helpers only need
    # to exist when the caller does not supply their own.
    if stop_words is None:
        stop_words = stopword
    if stem is None:
        stem = stemmer.stem

    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Tags become a space, not '', so "end.<br /><br />Start" does not collapse
    # into a single token once punctuation is stripped.
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() with no argument treats newlines and runs of spaces as a single
    # separator, so no empty tokens are produced and lines are not glued.
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(stem(word) for word in tokens)
# Replace every raw review with its cleaned form.
data["review"] = data["review"].map(clean)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Pull the sentiment strings and the cleaned reviews out of the frame as lists.
labels = data["sentiment"].to_list()
texts = data["review"].to_list()

# Build the word index over all reviews.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

Now we need to pad the sequences to the same length to feed them into a neural network. Here’s how we can pad the sequences of the texts to have the same length:

In [None]:
# Map each review to its list of word indices.
sequences = tokenizer.texts_to_sequences(texts)
# Pad every sequence to the length of the longest review.
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, maxlen=max_length)

In [None]:
# Length of the longest tokenized review; every sequence is padded to this.
max_length

1416

In [None]:
# Expect (number of reviews, max_length).
padded_sequences.shape

(50000, 1416)

In [None]:
# Learn the string -> integer mapping, then apply it to the labels.
label_encoder = LabelEncoder().fit(labels)
labels = label_encoder.transform(labels)

In [None]:
# One integer label per review.
labels.shape

(50000,)

In [None]:
# Peek at the encoded labels.
labels

array([1, 1, 1, ..., 0, 0, 0])

We are now going to one-hot encode the labels. One-hot encoding transforms each categorical label into a binary vector that is all zeros except for a single 1 at the position of its class. The labels are already integers at this point; the one-hot form is needed because the network below ends in a softmax layer trained with categorical cross-entropy, which expects one target value per class. Here is how we can one-hot encode the labels:

In [None]:
# One-hot encode the integer labels so each row is a per-class indicator vector
one_hot_labels = utils.to_categorical(labels)

In [None]:
# Expect (number of reviews, number of classes).
one_hot_labels.shape

(50000, 2)

# Modeling

In [None]:
# Hold out 20% of the reviews for evaluation. The split is seeded so that a
# fresh run reproduces the same partition, and stratified on the integer
# labels so both parts keep the same class balance.
xtrain, xtest, ytrain, ytest = train_test_split(padded_sequences,
                                                one_hot_labels,
                                                test_size=0.2,
                                                random_state=42,
                                                stratify=labels)

## LSTM Model

In [None]:
# Sentiment classifier: embedding -> LSTM -> dense head -> softmax over classes.
# `input_length` is deliberately not passed to Embedding: the argument is
# deprecated in Keras 3 and the sequence length is taken from the input batch.
model = tf.keras.Sequential([
    # Vocabulary size is len(word_index) + 1 because index 0 is used for padding.
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1,
                              output_dim=128),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # One output unit per one-hot class column.
    tf.keras.layers.Dense(units=len(one_hot_labels[0]), activation="softmax")
])

# Adjust the optimizer to use a smaller learning rate
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

# categorical_crossentropy matches the one-hot targets and softmax output.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])



In [None]:
# Stop once val_loss has failed to improve for 5 epochs and roll the model
# back to the weights from its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Train for up to 10 epochs in batches of 64.
# NOTE(review): the held-out split (xtest, ytest) doubles as the validation
# set here, so early stopping is tuned on the same data used for evaluation.
history = model.fit(
    xtrain, ytrain,
    validation_data=(xtest, ytest),
    batch_size=64,
    epochs=10,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1594s[0m 3s/step - accuracy: 0.6503 - loss: 0.5995 - val_accuracy: 0.8882 - val_loss: 0.2787
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1571s[0m 2s/step - accuracy: 0.9134 - loss: 0.2295 - val_accuracy: 0.8960 - val_loss: 0.2597
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1515s[0m 2s/step - accuracy: 0.9446 - loss: 0.1551 - val_accuracy: 0.8979 - val_loss: 0.2844
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1513s[0m 2s/step - accuracy: 0.9641 - loss: 0.1086 - val_accuracy: 0.8881 - val_loss: 0.3212
Epoch 5/10
[1m435/625[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m7:12[0m 2s/step - accuracy: 0.9763 - loss: 0.0744

In [None]:
# Accuracy per epoch on the training data and on the held-out split that was
# passed to fit() as validation_data.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train')
ax.plot(history.history['val_accuracy'], label='test')
ax.set(title='Model Accuracy', xlabel='epoch', ylabel='accuracy')
ax.legend(loc='upper left')
plt.show()

In [None]:
# Loss per epoch: training curve first, then the held-out curve.
fig, ax = plt.subplots()
for series in ('loss', 'val_loss'):
    ax.plot(history.history[series])
ax.set_title('Loss Model')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
new_sinopsis = "i didn't like the moview"

# Run the text through the same clean() step applied to the training reviews.
# The tokenizer was fitted on cleaned (stop-word-free, stemmed) text, so raw
# words would often miss its vocabulary and be dropped.
cleaned_input = clean(new_sinopsis)
input_sequence = tokenizer.texts_to_sequences([cleaned_input])
# Pad to the same length the model was trained on.
padded_input_sequence = pad_sequences(input_sequence, maxlen=max_length)
prediction = model.predict(padded_input_sequence)
# Highest-probability class index -> original sentiment string.
predicted_label = label_encoder.inverse_transform([np.argmax(prediction[0])])
print(predicted_label)