In [0]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Downloading NLTK data
nltk.download('stopwords')
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Importing the dataset
data = pd.read_csv('/dbfs/FileStore/shared_uploads/n118841@icf.com/Restaurant_Reviews.tsv', sep='\t', quoting=3)

# Function to clean text
def clean_text(review):
    review = re.sub('<.*?>', ' ', review)  # remove HTML tags
    review = re.sub('[^a-zA-Z]', ' ', review)  # keep only letters
    review = review.lower()  # convert to lowercase
    review = review.split()  # split into words
    ps = PorterStemmer()  # stemming
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]  # remove stopwords
    review = ' '.join(review)  # join words back into a single string
    return review

# Cleaning all reviews
corpus = [clean_text(review) for review in data['Review']]

# Using TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=1500)
x = tfidf.fit_transform(corpus).toarray()

# Dependent variable
y = data.iloc[:, 1].values

# Splitting the dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

# Model: RandomForestClassifier
classifier = RandomForestClassifier()

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 50, 100, None],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(x_train, y_train)
best_classifier = grid_search.best_estimator_

# Predicting the Test set results
y_pred = best_classifier.predict(x_test)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

# Classification Report
cr = classification_report(y_test, y_pred)
print("Classification Report:")
print(cr)

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Function to predict review sentiment
def predict_review(review):
    cleaned_review = clean_text(review)
    review_vector = tfidf.transform([cleaned_review]).toarray()
    prediction = best_classifier.predict(review_vector)
    return "Positive" if prediction == 1 else "Negative"

# Example prediction
print(predict_review("The food was excellent and the service was great!"))
print(predict_review("The food was horrible and the service was terrible."))

# Function to suggest areas of improvement
def suggest_improvements(review):
    sia = SentimentIntensityAnalyzer()
    aspects = {
        'food': ['food', 'meal', 'dish', 'taste'],
        'service': ['service', 'staff', 'waiter', 'waitress'],
        'ambiance': ['ambiance', 'atmosphere', 'environment'],
        'price': ['price', 'cost', 'value']
    }
    suggestions = []
    cleaned_review = clean_text(review)
    for aspect, keywords in aspects.items():
        if any(word in cleaned_review for word in keywords):
            sentiment_score = sia.polarity_scores(review)
            if sentiment_score['neg'] > sentiment_score['pos']:
                suggestions.append(f"Improve {aspect}")
    return suggestions if suggestions else ["No specific improvements needed"]

# Example suggestion
print(suggest_improvements("The food was horrible and the service was terrible."))

# Full prediction and suggestion function
def full_review_analysis(review):
    sentiment = predict_review(review)
    suggestions = suggest_improvements(review) if sentiment == "Negative" else ["No improvements needed"]
    return sentiment, suggestions

# Example full analysis
review1 = "The food was horrible and the service was terrible."
sentiment, suggestions = full_review_analysis(review1)
print("Review 1: ", review1)
print(f"Review Sentiment: {sentiment}")
print("Suggestions for Improvement:")
for suggestion in suggestions:
    print(suggestion)
review2 = "The food was good and the service was nice."
print("Review 2: ",review2)
sentiment, suggestions = full_review_analysis(review2)
print(f"Review Sentiment: {sentiment}")
print("Suggestions for Improvement:")
for suggestion in suggestions:
    print(suggestion)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=100; total time=   0.5s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=100; total time=   0.6s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, n_estimators=200; total time=   1.2s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=50; total time=   0.2s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=100; total time=   0.5s
[CV] END criterion=gini, max_depth=None, max_features=log2, n_estimators=100; total time=   0.5s
[CV] END criterion=entropy, max_depth=10, max_features=auto, n_estimators=50; total time=   0.0s
[CV] END criterion=entropy, max_depth=10, max_features=auto, n_estimators=50; total time=   0.0s
[CV] END criterion=entropy, max_depth=10, max_features=auto, n_estimators=50; total time=   0.0s
[CV] END criterion=entropy, max_depth=10, max_features=auto, n_est

120 fits failed out of a total of 360.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/databricks/python/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/databricks/python/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/databricks/python/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/databricks/python/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
   

Uploading artifacts:   0%|          | 0/3 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Confusion Matrix:
[[85 12]
 [37 66]]
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.88      0.78        97
           1       0.85      0.64      0.73       103

    accuracy                           0.76       200
   macro avg       0.77      0.76      0.75       200
weighted avg       0.77      0.76      0.75       200

Accuracy: 75.50%
Positive
Negative
['Improve food']
Review 1:  The food was horrible and the service was terrible.
Review Sentiment: Negative
Suggestions for Improvement:
Improve food
Review 2:  The food was good and the service was nice.
Review Sentiment: Positive
Suggestions for Improvement:
No improvements needed
