In [96]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import ADASYN

In [98]:
df = pd.read_csv("amazon_alexa.tsv", sep="\t")


In [100]:
print("Class Distribution Before Balancing:\n", df['feedback'].value_counts())

Class Distribution Before Balancing:
 feedback
1    2893
0     257
Name: count, dtype: int64


In [102]:
X = df['verified_reviews'].astype(str)  # Convert reviews to string
y = df['feedback']

In [104]:
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")  # Increased vocabulary size
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)

In [106]:
max_length = 120  # Increased input length
X_padded = pad_sequences(X_sequences, maxlen=max_length, padding="post", truncating="post")


In [108]:
adasyn = ADASYN(sampling_strategy='auto', random_state=42, n_neighbors=5)
X_resampled, y_resampled = adasyn.fit_resample(X_padded, y)


In [110]:
print("Class Distribution After Balancing:\n", pd.Series(y_resampled).value_counts())

Class Distribution After Balancing:
 feedback
0    2916
1    2893
Name: count, dtype: int64


In [112]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

In [114]:
model = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=max_length),  # Increased embedding size
    Bidirectional(LSTM(64, return_sequences=True)),  # Bidirectional LSTM for better feature extraction
    GlobalMaxPooling1D(),  # Reduces overfitting by selecting the most relevant features
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])



In [116]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [118]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001)

In [120]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test),
                    callbacks=[early_stopping, reduce_lr])

Epoch 1/10
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 95ms/step - accuracy: 0.6146 - loss: 0.6287 - val_accuracy: 0.8064 - val_loss: 0.3960 - learning_rate: 0.0010
Epoch 2/10
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 111ms/step - accuracy: 0.8606 - loss: 0.3194 - val_accuracy: 0.8442 - val_loss: 0.3396 - learning_rate: 0.0010
Epoch 3/10
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 121ms/step - accuracy: 0.9096 - loss: 0.2290 - val_accuracy: 0.8451 - val_loss: 0.3758 - learning_rate: 0.0010
Epoch 4/10
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 137ms/step - accuracy: 0.9458 - loss: 0.1367 - val_accuracy: 0.8520 - val_loss: 0.4615 - learning_rate: 0.0010
Epoch 5/10
[1m146/146[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 136ms/step - accuracy: 0.9645 - loss: 0.0915 - val_accuracy: 0.8563 - val_loss: 0.4892 - learning_rate: 2.0000e-04


In [122]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {test_acc * 100:.2f}%")

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.8327 - loss: 0.3540

Test Accuracy: 84.42%


In [124]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")

[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step


In [126]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.77      0.83       583
           1       0.80      0.92      0.85       579

    accuracy                           0.84      1162
   macro avg       0.85      0.84      0.84      1162
weighted avg       0.85      0.84      0.84      1162



In [130]:
def predict_sentiment(review, model, tokenizer, max_length=120):
    # Convert review to lowercase (optional)
    review = review.lower()
    
    # Tokenize and convert text to sequence
    sequence = tokenizer.texts_to_sequences([review])
    
    # Pad sequence to match model input size
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding="post", truncating="post")
    
    # Get prediction
    prediction = model.predict(padded_sequence)[0][0]
    
    # Interpret result
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    confidence = prediction if prediction > 0.5 else 1 - prediction
    
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment} (Confidence: {confidence:.2f})")
    return sentiment

In [132]:
predict_sentiment("This product is absolutely fantastic! I love it.", model, tokenizer)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 671ms/step
Review: this product is absolutely fantastic! i love it.
Predicted Sentiment: Positive (Confidence: 0.96)


'Positive'

In [134]:
predict_sentiment("Terrible product. Doesn't work at all.", model, tokenizer)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 106ms/step
Review: terrible product. doesn't work at all.
Predicted Sentiment: Negative (Confidence: 0.96)


'Negative'

In [136]:
predict_sentiment("Absolutely love this! The quality is amazing.", model, tokenizer)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
Review: absolutely love this! the quality is amazing.
Predicted Sentiment: Positive (Confidence: 0.96)


'Positive'

In [138]:
predict_sentiment("Doesn't work as expected. Waste of money.", model, tokenizer)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
Review: doesn't work as expected. waste of money.
Predicted Sentiment: Negative (Confidence: 0.75)


'Negative'