In [1]:
import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Flatten, Dense, LSTM


In [2]:
# Reading new data

new_df = pd.read_csv(r'C:\Users\asbpi\Desktop\Nit_DS & AI\MY Projects\project_sentiment analysis\new_data.csv')

reviews = new_df['reviews']


In [3]:
# Convert any non-string elements to strings
reviews = [str(review) for review in reviews]


In [4]:
# Remove null or NaN values
reviews = [review for review in reviews if not pd.isnull(review)]


In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(reviews)
padded_sequences = pad_sequences(sequences)


In [6]:
X = padded_sequences
y = new_df['target']


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# RNN

In [9]:
# Build the RNN model
model = Sequential()
model.add(Embedding(len(word_index)+1, 100, input_length=X.shape[1]))
model.add(LSTM(units=128, return_sequences=True))
model.add(LSTM(units=64, return_sequences=True))
model.add(LSTM(units=32))
model.add(Dense(units=1, activation='sigmoid'))


In [10]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [11]:
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2235199db40>

In [12]:
# Evaluate on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)
print('Loss', loss)

Accuracy: 0.9317460060119629
Loss 0.314641535282135


In [13]:
y_pred = model.predict(X_test)



In [15]:
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)  # Convert probabilities to class labels if needed




In [16]:
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')


  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
print('precision_score = ', precision)
print('recall_score = ', recall)
print('f1_score = ', f1)

precision_score =  0.008475686570924667
recall_score =  0.09206349206349207
f1_score =  0.01552233296419343


In [18]:
# Checking Prediction

In [19]:
predictions = model.predict(X_test)

for i in range(len(predictions)):
    text = tokenizer.sequences_to_texts([X_test[i]])[0]
    sentiment = "positive" if predictions[i] > 0.5 else "negative"
    print(f"Text: {text}")
    print(f"Predicted sentiment: {sentiment}")
    print("-----------------------------")



Text: love still learn capability
Predicted sentiment: positive
-----------------------------
Text: easy setup
Predicted sentiment: positive
-----------------------------
Text: nan
Predicted sentiment: positive
-----------------------------
Text: purchase prime mostly present find echo plus kitchen counter overwhelming cooking cutting clean especially counter wide begin go back forth show decide feel feature compact honestly regret probably watch movie die great listen music watch quick youtube video even attempt watch jaw comfortable thing eye would think would definitely recommend
Predicted sentiment: positive
-----------------------------
Text: information dislike like convenience
Predicted sentiment: positive
-----------------------------
Text: please fire stick
Predicted sentiment: positive
-----------------------------
Text: love great bathroom listen news shower listen live serius
Predicted sentiment: positive
-----------------------------
Text: speaker loud google home
Predicte

In [20]:
# Top Positive phrases

In [21]:

embedding_weights = model.layers[0].get_weights()[0]
word_index = tokenizer.word_index
reverse_word_index = {index: word for word, index in word_index.items()}


phrase_sentiment_scores = {}


for sequence in sequences:
    phrase = ' '.join([reverse_word_index.get(word_index, '') for word_index in sequence])
    sentiment_score = sum([embedding_weights[word_index] for word_index in sequence])
    phrase_sentiment_scores[phrase] = sentiment_score

# Sort the phrases based on maximum sentiment score within each phrase
sorted_scores = sorted(phrase_sentiment_scores.items(), key=lambda x: max(x[1]), reverse=True)


top_positive_phrases = []
for phrase, score in sorted_scores:
    words = phrase.split()
    if len(words) >= 2 and len(words) <= 3:
        top_positive_phrases.append(phrase)
        if len(top_positive_phrases) >= 20:
            break

# Print the top phrases associated with positive reviews
print("Top Positive phrases :")
for phrase in top_positive_phrases:
    print(f"{phrase}")




Top Positive phrases :
love love love
easy setup love
love easy well
kid love love
easy easy
love echo easy
easy great sound
great sound easy
easy affordable love
love easy
easy family love
great product easy
gift husband great
easy really enjoy
love living room
sound great love
awesome love alexa
great easy
love many option
easy amazing


In [22]:
# Top Negative phrases

In [23]:

embedding_weights = model.layers[0].get_weights()[0]
word_index = tokenizer.word_index
reverse_word_index = {index: word for word, index in word_index.items()}


phrase_sentiment_scores = {}


for sequence in sequences:
    phrase = ' '.join([reverse_word_index.get(word_index, '') for word_index in sequence])
    sentiment_score = sum([embedding_weights[word_index] for word_index in sequence])
    phrase_sentiment_scores[phrase] = sentiment_score

# Sort the phrases based on maximum sentiment score within each phrase
sorted_scores = sorted(phrase_sentiment_scores.items(), key=lambda x: max(x[1]), reverse=False)


top_negative_phrases = []
for phrase, score in sorted_scores:
    words = phrase.split()
    if len(words) >= 2 and len(words) <= 3:
        top_negative_phrases.append(phrase)
        if len(top_negative_phrases) >= 20:
            break

# Print the top phrases associated with negative reviews
print("Top negative phrases :")
for phrase in top_negative_phrases:
    print(f"{phrase}")


Top negative phrases :
participate echo
habla espanol
five need
prime video
fairly useless
use enough
alexa else
work time
work like
use alarm clock
echo work
firestick everything
work really well
buy prime
video call
alexa rock
work wonderfully
always work
work fine
work advertise


# Thank You