In [None]:
# Ten short restaurant reviews used as the whole corpus for the experiments below.
# Despite the name, later cells split this list into train and test portions.
# Each review is wrapped across two literals; adjacent string literals are
# concatenated by Python, so the runtime values are single unbroken strings.
test_set = [
    (
        "Oh, what a delightful experience! I can't contain my excitement, "
        "don't know what I enjoyed more, the rude service or the shocking food."
    ),
    (
        "The worst best restaurant I've ever been to. "
        "Terrible food and great service."
    ),
    (
        "Compared to a prison cafeteria, this restaurant is amazing! "
        "A true culinary wonderland."
    ),
    (
        "If you're into food that defies the laws of physics, "
        "this is the place to be."
    ),
    (
        "I can't say I didn't not dislike the food. "
        "It's not terrible, maybe."
    ),
    (
        "Not bad, but not great either. "
        "It's not like I'll never visit again, but there's no reason to rush back."
    ),
    (
        "Wow, just wow! "
        "This place is an absolute dream, if your dream is a nightmare."
    ),
    (
        "The chef here must have a Michelin-starred grandmother. "
        "The food is out of this world, literally."
    ),
    (
        "The ambience, if you can even call it that, "
        "adds a certain... charm to the dining experience."
    ),
    (
        "I'll give them points for trying. "
        "It's definitely a unique dining experience."
    ),
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# One label per review in `test_set`, in the same order.
# NOTE(review): the notebook never states what 0 and 1 stand for — presumably
# two sentiment classes; confirm before interpreting the accuracy.
labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
assert len(test_set) == len(labels), "each review needs exactly one label"

# Split the raw texts BEFORE vectorizing. Fitting the vectorizer on the full
# corpus would let the held-out reviews shape the vocabulary (data leakage).
texts_train, texts_test, y_train, y_test = train_test_split(
    test_set, labels, test_size=0.2, random_state=42
)

# Learn the bag-of-words vocabulary from the training reviews only, then
# project the held-out reviews onto that fixed vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

logistic_classifier = LogisticRegression()
logistic_classifier.fit(X_train, y_train)

logistic_predictions = logistic_classifier.predict(X_test)

# With ten reviews and test_size=0.2 this is measured on two reviews only.
logistic_accuracy = accuracy_score(y_test, logistic_predictions)

print(f"Logistic Regression Accuracy: {logistic_accuracy}")

Logistic Regression Accuracy: 0.5


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out the test reviews before any fitting so the TF-IDF vocabulary and IDF
# weights come from the training reviews only (no leakage into the score).
# Same test_size and random_state as the CountVectorizer cell.
texts_train, texts_test, y_train, y_test = train_test_split(
    test_set, labels, test_size=0.2, random_state=42
)

# min_df is left at its default (1). The previous min_df=5 dropped every term
# found in fewer than five reviews — i.e. fewer than half of this ten-review
# corpus — so almost no features survived.
# `encoding` is omitted: it only matters for bytes/file input, and the reviews
# are already `str`.
tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, norm='l2', ngram_range=(1, 2))
X_train = tfidf_vectorizer.fit_transform(texts_train)
X_test = tfidf_vectorizer.transform(texts_test)

# Full-corpus matrix expressed in the train-fitted feature space. Kept under
# the original name so any later cell that builds on `X` keeps working.
X = tfidf_vectorizer.transform(test_set)

logistic_classifier = LogisticRegression()
logistic_classifier.fit(X_train, y_train)

logistic_predictions = logistic_classifier.predict(X_test)

logistic_accuracy = accuracy_score(y_test, logistic_predictions)

print(f"Logistic Regression Accuracy with TF-IDF: {logistic_accuracy}")


Logistic Regression Accuracy with TF-IDF: 0.5


In [None]:
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# The reviews and labels are repeated here so this cell can be read and re-run
# on its own; they must stay identical to the lists used by the earlier cells.
test_set = [
    "Oh, what a delightful experience! I can't contain my excitement, don't know what I enjoyed more, the rude service or the shocking food.",
    "The worst best restaurant I've ever been to. Terrible food and great service.",
    "Compared to a prison cafeteria, this restaurant is amazing! A true culinary wonderland.",
    "If you're into food that defies the laws of physics, this is the place to be.",
    "I can't say I didn't not dislike the food. It's not terrible, maybe.",
    "Not bad, but not great either. It's not like I'll never visit again, but there's no reason to rush back.",
    "Wow, just wow! This place is an absolute dream, if your dream is a nightmare.",
    "The chef here must have a Michelin-starred grandmother. The food is out of this world, literally.",
    "The ambience, if you can even call it that, adds a certain... charm to the dining experience.",
    "I'll give them points for trying. It's definitely a unique dining experience."
]

labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
assert len(test_set) == len(labels), "each review needs exactly one label"

# Split row indices (not a pre-built feature matrix) so the TF-IDF vocabulary
# can be learned from the training reviews alone. Previously this cell reused
# the matrix `X` left behind by the TF-IDF cell, which was fit on every review
# and made this cell depend on hidden kernel state.
# Same test_size and random_state as the other cells.
row_indices = np.arange(len(test_set))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_indices, labels, test_size=0.2, random_state=42
)

# Fit TF-IDF on the training reviews only, then express every review in that
# fixed feature space.
vader_tfidf_vectorizer = TfidfVectorizer(sublinear_tf=True, norm='l2', ngram_range=(1, 2))
vader_tfidf_vectorizer.fit([test_set[i] for i in train_idx])
X_tfidf = vader_tfidf_vectorizer.transform(test_set)

# VADER is lexicon-based (nothing is fit to this data), so scoring all reviews
# up front does not leak label information. One compound score per review,
# shaped as a single column so it can be stacked next to the TF-IDF columns.
analyzer = SentimentIntensityAnalyzer()
vader_scores = np.array(
    [analyzer.polarity_scores(review)['compound'] for review in test_set]
).reshape(-1, 1)

# Dense matrix: TF-IDF columns followed by the VADER compound score column.
X_with_vader = np.hstack((X_tfidf.toarray(), vader_scores))

X_train, X_test = X_with_vader[train_idx], X_with_vader[test_idx]

logistic_classifier = LogisticRegression()
logistic_classifier.fit(X_train, y_train)

logistic_predictions = logistic_classifier.predict(X_test)

logistic_accuracy = accuracy_score(y_test, logistic_predictions)

print(f"Logistic Regression Accuracy with TF-IDF and VADER: {logistic_accuracy}")


Logistic Regression Accuracy with TF-IDF and VADER: 0.5


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


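I was unable to improve the accuracy with either of the two methods below. Note that with ten reviews and a 20% hold-out, accuracy is measured on only two reviews, so it can only take the values 0, 0.5, or 1: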

1. TF-IDF Vectorization:
I replaced CountVectorizer with TF-IDF vectorization, adjusted the vectorizer's hyperparameters, and again performed sentiment classification with logistic regression. Although this method did not improve the accuracy here, TF-IDF can give more weight to informative words and reduce the impact of common words, which can lead to better accuracy, especially in sentiment analysis.

2. VADER Sentiment Lexicon:
I then decided to experiment with feature engineering and sentiment lexicons, using VADER to create an additional feature for my model. My idea was to extract VADER's compound sentiment score for each review and append it as an extra feature alongside the TF-IDF features. This combination of text-based and sentiment-based features could help improve accuracy, especially when dealing with mixed sentiment in reviews.