The objective of this project is to perform sentiment analysis (only positive and negative) on the 515k hotel review dataset and build an API that can classify review text.
This project covers:
- TF-IDF
- count features
- logistic regression
- naive bayes
- svm
- xgboost
- grid search
- word vectors (Universal Sentence Encoder model from TensorFlow Hub)
- LSTM
The final LSTM model achieved an accuracy of ~81% on the test dataset (75:25 train/test split).
# use the latest `tensorflow_text` version
# !pip install tensorflow_text
import numpy as np
from numpy import newaxis
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
# Load the Universal Sentence Encoder Multilingual (large, v3) module from
# TensorFlow Hub; the download is ~300 MB.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'
# `use` is the callable encoder that predict_sentiment applies to the input text.
use = hub.load(module_url)
# Load the saved Keras model (HDF5 file) that scores the embeddings.
# NOTE(review): this path is hard-coded to /content/ and the file name differs
# from model/lstm_sentiment_model.h5 in the project layout listed further
# down -- confirm which one is current.
imported_model = tf.keras.models.load_model('/content/lstm_final_model.h5')
def predict_sentiment(txt):
    """Classify a review text as "Positive" or "Negative".

    The text is embedded with the module-level sentence encoder ``use``,
    given a length-1 middle axis, and scored by the module-level Keras
    model ``imported_model``. The score is printed and thresholded at 0.5.

    Args:
        txt: The review text, passed unchanged to ``use``.

    Returns:
        ``"Positive"`` when the first predicted score is greater than 0.5,
        otherwise ``"Negative"``.

    Note:
        Only the first value of the flattened prediction is used, so this
        assumes a single input text and a single-unit (binary-label) model
        output. For a model trained on one-hot labels, take ``np.argmax``
        of the prediction instead of thresholding.
    """
    # generate embedding
    emb_txt = use(txt)
    # insert a length-1 middle axis so the embedding matches the model input
    emb_test_reshaped = emb_txt[:, newaxis, :]
    # run the model once and reuse the result for both the printed score and
    # the label (the original ran predict() three times on the same input)
    score = imported_model.predict(emb_test_reshaped).flatten()[0]
    # sentiment score
    print("Score:", score)
    # return sentiment value based on score
    return "Positive" if score > 0.5 else "Negative"
# Example: classify a single review sentence.
sample_text = "I like the room service"
pred_sentiment = predict_sentiment(sample_text)
print(f"The sentiment of this sentence is : {pred_sentiment}")
# Expected output (predict_sentiment also prints a "Score: ..." line first):
# The sentiment of this sentence is : Positive
|-- model
|   |-- lstm_sentiment_model.h5
|-- tfhub
|   |-- universal-sentence-encoder-multilingual-large-v3
|       |-- assets
|       |-- variables
|       |   |-- variables.data-00000-of-00001
|       |   |-- variables.index
|       |-- saved_model.pb
|-- main.py
tensorflow==2.7.0
tensorflow_text==0.12.0
tensorflow-hub==0.12.0
keras==2.7.0
keras-vis==0.4.1
scikit-learn==0.22.2.post1
fastapi==0.70.1
uvicorn==0.16.0