In [14]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import joblib
import json
import itertools
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from analyze_words import get_df_idf_stops


def evaluateModel(prediction, y_test):
    '''
    Calculate the accuracy of a model based on the proportion of
    accurate predictions using the testing data. Accuracy is
    weighted by the deviance from the actual rating.

    Inputs:
      - prediction (arr): predicted y values
      - y_test (arr): actual y values

    Returns: float
    '''
    # Convert into DataFrame for easier handling
    pred_test_df = pd.DataFrame({"predict": prediction,
                                 "actual": y_test}).astype("int")
    pred_test_df["difference"] = (pred_test_df.predict
                                  - pred_test_df.actual).abs()

    num_tests = len(pred_test_df)
    total_deviance = pred_test_df["difference"].sum()

    # Maximum deviance is 4 (5-star rating vs. 1-star rating)
    weighted_accuracy = 1 - (total_deviance / (4 * num_tests))

    return weighted_accuracy


def get_weighted_accuracy(x_train, x_test, y_train, y_test, alpha):
    '''
    Calculate weighted accuracy of a model.

    Inputs:
      - x_train (DataFrame): x training data
      - x_test (DataFrame): x testing data
      - y_train (arr): y training data
      - y_test (arr): y testing data
      - alpha (float): constant that multiplies regularization term

    Returns: float
    '''
    model = linear_model.SGDClassifier(alpha=alpha)
    trained_model = model.fit(x_train, y_train)
    prediction = trained_model.predict(x_test)
    weighted_accuracy = evaluateModel(prediction, y_test)

    return weighted_accuracy


def feature_selection(model, x_train, y_train, x_test):
    '''
    Performs feature selection to minimize overfitting

    Inputs:
        - model (Model): model being applied
        - x_train (DataFrame): x training data
        - y_train (arr): y training data
        - x_test (DataFrame): x testing data

    Returns: 2 arrs (filtered x_train, filtered x_test), feature selector obj
    '''
    trained_model = model.fit(x_train, y_train)
    feature_selection_model = SelectFromModel(trained_model)
    trained_feature_selection_model = feature_selection_model.fit(x_train,
                                                                  y_train)
    x_train = trained_feature_selection_model.transform(x_train)
    x_test = trained_feature_selection_model.transform(x_test)

    return x_train, x_test, trained_feature_selection_model


def optimize_model(csv_file, testing_fraction):
    '''
    Find the optimal combination of parameters (maximum n-gram length,
    whether to lemmatize, number of stop words, and alpha) for the
    suggested star rating model, as well as the corresponding DataFrame, 
    idf dictionary, and list of stop words.

    Inputs:
      - csv_file (string): CSV file name
      - testing_fraction (float): proportion of data reserved for testing

    Returns: DataFrame, dict (parameters), dict (idf), list of str
    '''
    # Combinations
    ngrams = [1, 2, 3]
    num_stop_words = [0, 10, 20]
    alphas = [0.0001, 0.001, 0.01, 0.1, 1]

    all_combi = list(itertools.product(ngrams, num_stop_words, alphas))

    max_accuracy = -1
    best_df = None
    best_alpha = None
    best_vectorizer = None

    print("Completed initializing.")

    for combi in all_combi:
        ngram, num_stop_words, alpha = combi
        X, y_values, vectorizer = get_df_idf_stops(csv_file, n=ngram,
                                                   num_stop_words=num_stop_words)
        x_train, x_test, y_train, y_test = \
            train_test_split(X, y_values,
                             test_size=testing_fraction, random_state=33)
        weighted_accuracy = get_weighted_accuracy(x_train, x_test,
                                                  y_train, y_test, alpha)

        print(combi, "Finished testing. | Accuracy: ", weighted_accuracy)

        if weighted_accuracy > max_accuracy:
            max_accuracy = weighted_accuracy
            best_x = X
            best_y = y_values
            best_alpha = alpha
            best_vectorizer = vectorizer

    return best_x, best_y, best_alpha, best_vectorizer


def main_modelling(csv_file='smaller_dataset.csv', testing_fraction=0.2):
    '''
    Generate the optimal model for predicting Yelp review ratings by
    cycling through combinations of parameters and save it as a PKL file.
    Also saves other parameters for user input processing in JSON. 

    Inputs:
      - csv_file (string): CSV file name
      - testing_fraction (float): proportion of data reserved for testing

    Returns: None, writes PKL and JSON files
    '''
    # Input and Model Tuning
    X, y_values, alpha, vectorizer = optimize_model(csv_file,
                                                    testing_fraction)

    x_train, x_test, y_train, y_test = \
        train_test_split(X, y_values,
                         test_size=testing_fraction, random_state=33)

    # Feature Selection
    model = linear_model.SGDClassifier(alpha=alpha)
    x_train, x_test, feature_selector = feature_selection(model, x_train,
                                                          y_train, x_test)
    final_model = model.fit(x_train, y_train)
    prediction = final_model.predict(x_test)

    print("Final Model Classification Report")
    print(classification_report(prediction, y_test))
    print("Accuracy Score")
    print(evaluateModel(prediction, y_test))

    # Save best Model, Vectorizer, and Selector
    joblib.dump(final_model, "new_optimal_args/final_model.pkl")
    joblib.dump(vectorizer, 'new_optimal_args/vectorizer.pkl')
    joblib.dump(feature_selector, 'new_optimal_args/selector.pkl')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rhedintzaaudryna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
n = 2
num_stop_words = 10
alpha = 0.01

In [16]:
%%time
X, y_values, vectorizer = get_df_idf_stops('smaller_dataset.csv', n, num_stop_words)

CPU times: user 22.9 s, sys: 541 ms, total: 23.5 s
Wall time: 23.8 s


In [17]:
testing_fraction = 0.2

In [18]:
x_train, x_test, y_train, y_test = \
        train_test_split(X, y_values,
                         test_size=testing_fraction, random_state=33)

# Feature Selection
model = linear_model.SGDClassifier(alpha=alpha)
x_train, x_test, feature_selector = feature_selection(model, x_train,
                                                      y_train, x_test)
final_model = model.fit(x_train, y_train)
prediction = final_model.predict(x_test)

print("Final Model Classification Report")
print(classification_report(prediction, y_test))
print("Accuracy Score")
print(evaluateModel(prediction, y_test))

# Save best Model, Vectorizer, and Selector
joblib.dump(final_model, "new_optimal_args/final_model.pkl")
joblib.dump(vectorizer, 'new_optimal_args/vectorizer.pkl')
joblib.dump(feature_selector, 'new_optimal_args/selector.pkl')

Final Model Classification Report
              precision    recall  f1-score   support

           1       0.25      0.62      0.36        42
           2       0.02      0.40      0.04         5
           3       0.05      0.33      0.09        40
           4       0.40      0.56      0.47       556
           5       0.90      0.52      0.65      1357

    accuracy                           0.53      2000
   macro avg       0.33      0.48      0.32      2000
weighted avg       0.73      0.53      0.58      2000

Accuracy Score
0.82775


['new_optimal_args/selector.pkl']

In [19]:
main_modelling()

Completed initializing.
(1, 0, 0.0001) Finished testing. | Accuracy:  0.87225
(1, 0, 0.001) Finished testing. | Accuracy:  0.845125
(1, 0, 0.01) Finished testing. | Accuracy:  0.812
(1, 0, 0.1) Finished testing. | Accuracy:  0.8166249999999999
(1, 0, 1) Finished testing. | Accuracy:  0.762
(1, 10, 0.0001) Finished testing. | Accuracy:  0.87225
(1, 10, 0.001) Finished testing. | Accuracy:  0.845625
(1, 10, 0.01) Finished testing. | Accuracy:  0.81575
(1, 10, 0.1) Finished testing. | Accuracy:  0.841125
(1, 10, 1) Finished testing. | Accuracy:  0.753875
(1, 20, 0.0001) Finished testing. | Accuracy:  0.87175
(1, 20, 0.001) Finished testing. | Accuracy:  0.847
(1, 20, 0.01) Finished testing. | Accuracy:  0.8215
(1, 20, 0.1) Finished testing. | Accuracy:  0.75425
(1, 20, 1) Finished testing. | Accuracy:  0.809125
(2, 0, 0.0001) Finished testing. | Accuracy:  0.875875
(2, 0, 0.001) Finished testing. | Accuracy:  0.83475
(2, 0, 0.01) Finished testing. | Accuracy:  0.805625
(2, 0, 0.1) Finishe

In [22]:
import sys
import json
import joblib
import pandas as pd
import numpy as np
from sklearn import linear_model
from textblob import TextBlob


def user_interface():
    '''Prompt user to input a review, and suggest a star rating.'''
    print("==================================================")
    print("   Welcome to the Suggested Star Rating System!")
    print()
    print("            Copy and paste your review.")
    print()
    print("       Type Control-D to exit the program.")
    print("==================================================")
    print()
    try:
        while True:
            review = input("Enter review here: ")
            review = str(review)
            if len(review) >= 50:
                break
            else:
                print("Please input a longer review.")

        x_array = process_input(review)

        final_model = joblib.load("new_optimal_args/final_model.pkl")
        prediction = final_model.predict(x_array)
        star_rating = int(prediction)

        print("Your suggested star rating is: {}".format(star_rating))
        print("Thank you for using our Suggested Star Rating System!")
    except EOFError:
        sys.exit()


def process_input(review):
    '''
    Convert a review input by the user into an array of zeros,
    where each item corresponding to a valid n-gram in the input
    is replaced by the n-gram's tfidf. This allows a review to be
    evaluated by a model.

    Inputs:
      - review (str): review input by user

    Returns: arr
    '''
    vectorizer = joblib.load("new_optimal_args/vectorizer.pkl")
    selector = joblib.load("new_optimal_args/selector.pkl")

    # Fix spelling errors before prediction
    textBlb = TextBlob(review)
    corrected_review = textBlb.correct()

    x_array = selector.transform(vectorizer.transform([corrected_review]))

    return x_array

In [23]:
review = """I know a 5-star rating is potentially controversial for Tian Tian which has been a victim of its own success.  Before all the hullabaloo, before No Reservations, it was just a solid choice for chicken rice at Maxwell Food Centre.  And you know what? It continues to be a solid choice, delivering a solid consistently good, wholesome, flavorful, and traditionally authentic product.  So take away everything that's been written or said about Tian Tian in recent history and it'll stand on its own as a solid choice for chicken rice if you're in the neighborhood.  Is there a better chicken rice out there? Almost certainly, but that misses the entire point of a chicken rice stall in a food centre in Singapore.  At the end of the day, it's all about accessibility.  That these hawker centres were purpose-built to make affordable nutrition accessible.  That hawkers, over the years,  have devoted time and passion into making their offerings actually good is a wondrous unintended consequence of this most Singaporean of experiments.  So yes, Tian Tian is still good, pandemic or no, and hopefully will continue to be a solid go to option when wandering around Maxwell.  Do I think it's the best chicken rice in Singapore...for that answer you'll have to get to know me much better first.
"""

In [24]:
review

"I know a 5-star rating is potentially controversial for Tian Tian which has been a victim of its own success.  Before all the hullabaloo, before No Reservations, it was just a solid choice for chicken rice at Maxwell Food Centre.  And you know what? It continues to be a solid choice, delivering a solid consistently good, wholesome, flavorful, and traditionally authentic product.  So take away everything that's been written or said about Tian Tian in recent history and it'll stand on its own as a solid choice for chicken rice if you're in the neighborhood.  Is there a better chicken rice out there? Almost certainly, but that misses the entire point of a chicken rice stall in a food centre in Singapore.  At the end of the day, it's all about accessibility.  That these hawker centres were purpose-built to make affordable nutrition accessible.  That hawkers, over the years,  have devoted time and passion into making their offerings actually good is a wondrous unintended consequence of thi

In [27]:
vectorizer = joblib.load("new_optimal_args/vectorizer.pkl")
selector = joblib.load("new_optimal_args/selector.pkl")

In [31]:
x_user = selector.transform(vectorizer.transform([review]))

In [32]:
final_model = joblib.load("new_optimal_args/final_model.pkl")

In [34]:
prediction = final_model.predict(x_user)

In [35]:
prediction

array([4])

In [46]:
run main.py

   Welcome to the Suggested Star Rating System!

            Copy and paste your review.

       Type Control-D to exit the program.

Enter review here: Imagine the smallest American town you can. Maybe a place where "Make America Great" signs are fading in front of every other mobile home.  Next, visualize the worst American-style Mexican you ever ate there. Maybe it was a place called Panchos. Maybe it was Senor Pedros. Who knows, it was bad. Maybe you went in, got some refried beans from a can with some instant rice, some tasteless generic salsa, a burrito that was 95% filler. And you remember being annoyed at wasting ten bucks.  Now, think of a restaurant that tastes literally two times worse, and costs $120 for two appetizers, three drinks, and one burrito. That would be the amazing restaurant that is Vatos in Singapore
Your suggested star rating is: 1
Thank you for using our Suggested Star Rating System!


In [1]:
from model import *

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rhedintzaaudryna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
optimize_model(csv_file = "test_data/merged_data.csv", testing_fraction = 0.95)

Completed initializing.
(1, 0, 0.0001) Finished testing. | Accuracy:  0.8361052631578947
(1, 0, 0.001) Finished testing. | Accuracy:  0.8345263157894737
(1, 0, 0.01) Finished testing. | Accuracy:  0.7983947368421053
(1, 0, 0.1) Finished testing. | Accuracy:  0.7546842105263158
(1, 0, 1) Finished testing. | Accuracy:  0.7548157894736842
(1, 10, 0.0001) Finished testing. | Accuracy:  0.8277894736842105
(1, 10, 0.001) Finished testing. | Accuracy:  0.8330526315789474
(1, 10, 0.01) Finished testing. | Accuracy:  0.7993421052631579
(1, 10, 0.1) Finished testing. | Accuracy:  0.7546842105263158
(1, 10, 1) Finished testing. | Accuracy:  0.7546842105263158
(1, 20, 0.0001) Finished testing. | Accuracy:  0.8307105263157895
(1, 20, 0.001) Finished testing. | Accuracy:  0.8326315789473684
(1, 20, 0.01) Finished testing. | Accuracy:  0.7941578947368422
(1, 20, 0.1) Finished testing. | Accuracy:  0.7546842105263158
(1, 20, 1) Finished testing. | Accuracy:  0.7549736842105264
(2, 0, 0.0001) Finished 