In [2]:
# Library Import
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import string
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns


2023-09-07 22:55:13.100724: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-07 22:55:13.129468: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-07 22:55:13.337573: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-07 22:55:13.339682: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Download the required NLTK data
nltk.download('punkt')  # Punkt Tokenizer Model
nltk.download('stopwords')  # Stopwords
nltk.download('wordnet')  # WordNet Lemmatizer

[nltk_data] Downloading package punkt to /home/prathiba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/prathiba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/prathiba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
df = pd.read_csv('EcoPreprocessed.csv')

In [5]:
df = df.drop_duplicates(subset='review')
df.head()

Unnamed: 0.1,Unnamed: 0,review,polarity,division
0,3870,able play youtube alexa,0.5,positive
1,62,able recognize indian accent really well drop ...,0.2794,positive
2,487,absolute smart device amazon connect external ...,0.1827,positive
3,3204,absolutely amaze new member family control hom...,0.3682,positive
4,1265,absolutely amaze previously sceptical invest m...,0.2333,positive


In [6]:
# Lowercase the reviews
df['review'] = df['review'].str.lower()

# Remove punctuation
df['review'] = df['review'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

# Tokenize the reviews
df['review'] = df['review'].apply(word_tokenize)

# Remove stop words
stop_words = set(stopwords.words('english'))
df['review'] = df['review'].apply(lambda x: [word for word in x if word not in stop_words])

# Lemmatize the words
lemmatizer = WordNetLemmatizer()
df['review'] = df['review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x]))

In [7]:
# The number of most frequent words to consider
MAX_NB_WORDS = 50000
# Max number of words in each complaint
MAX_SEQUENCE_LENGTH = 250
# This is fixed
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['review'].values)
word_index = tokenizer.word_index

X = tokenizer.texts_to_sequences(df['review'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

In [8]:
# Convert sentiments to one-hot vectors
encoder = LabelEncoder()
encoder.fit(df['division'])
Y = encoder.transform(df['division'])
Y = to_categorical(Y)

In [9]:
# Split into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42)

In [18]:
# Train model
epochs = 5
batch_size = 64

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='sigmoid'))  # 3 is the number of sentiment classes
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,
                    validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
# Evaluate the model
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.275
  Accuracy: 0.820


In [14]:
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters
max_features = 50000
maxlen = 200

def build_model(hp):
    model = Sequential()
    model.add(Embedding(max_features, hp.Int('embedding_dim', min_value=32, max_value=128, step=32), input_length=X.shape[1]))
    model.add(LSTM(units=hp.Int('lstm_units', min_value=32, max_value=128, step=32), dropout=hp.Float('dropout', min_value=0.2, max_value=0.5, step=0.1)))
    model.add(Dropout(hp.Float('Dropout_rate',min_value=0,max_value=0.5,step=0.1)))
    model.add(Dense(3, activation=hp.Choice('dense_activation',values=['relu','sigmoid','softmax']),kernel_initializer='he_normal'))
    model.compile(optimizer='adam', loss=hp.Choice('loss_fn',values=['binary_crossentropy','categorical_crossentropy']), metrics=['accuracy'])
    return model

# Initialize Keras Tuner RandomSearch
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    directory='tuner_dir',
    project_name='lstm_sentiment'
)

# Perform hyperparameter search
tuner.search(X_train, Y_train, validation_split=0.2, epochs=3)

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("Best Hyperparameters:")
print(best_hps)

# Build the final model with the best hyperparameters
model = tuner.hypermodel.build(best_hps)
model.fit(X_train, Y_train, validation_split=0.2, epochs=10, callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
model.summary()

Trial 5 Complete [00h 00m 19s]
val_accuracy: 0.7873873710632324

Best val_accuracy So Far: 0.8306306600570679
Total elapsed time: 00h 01m 28s
INFO:tensorflow:Oracle triggered exit
Best Hyperparameters:
<keras_tuner.engine.hyperparameters.hyperparameters.HyperParameters object at 0x7f32e52b9c10>
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 250, 96)           4800000   
                                                                 
 lstm_1 (LSTM)               (None, 96)                74112     
                                                                 
 dropout_1 (Dropout)         (None, 96)                0         
                                                                 
 dense_1 (Dense)             (None, 3)                 291       
           

In [15]:
# Print the chosen activation function and loss function
best_activation = best_hps.get('dense_activation')
best_loss_function = best_hps.get('loss_fn')
best_em_dim = best_hps.get('embedding_dim')
best_units = best_hps.get('lstm_units')
best_dropout = best_hps.get('Dropout_rate')
print("Chosen embedding dimension:", best_em_dim)
print("Chosen number of LSTM units:", best_units)
print("Chosen dropout rate:", best_dropout)
print("Chosen Activation Function:", best_activation)
print("Chosen Loss Function:", best_loss_function)

Chosen embedding dimension: 96
Chosen number of LSTM units: 96
Chosen dropout rate: 0.2
Chosen Activation Function: softmax
Chosen Loss Function: categorical_crossentropy


In [16]:
# Evaluate the model
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.491
  Accuracy: 0.835


In [21]:
# Define your labels based on the order of your sentiment classes
labels = ['negative', 'neutral', 'positive']

# Generate a classification report
report = classification_report(Y_test.argmax(axis=1), Y_pred.argmax(axis=1), target_names=labels)
print(report)

              precision    recall  f1-score   support

    negative       0.65      0.29      0.40        89
     neutral       0.64      0.59      0.62        95
    positive       0.86      0.95      0.90       509

    accuracy                           0.82       693
   macro avg       0.72      0.61      0.64       693
weighted avg       0.80      0.82      0.80       693



In [22]:
# Assuming that 'text_to_predict' is the text for which you want to predict the sentiment
text_to_predict = ["The product is bad"]
sequences = tokenizer.texts_to_sequences(text_to_predict)
padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = ['negative', 'neutral', 'positive']
print("Predicted sentiment: ", labels[np.argmax(pred)])

Predicted sentiment:  negative
