In [10]:
# analyze.py

import math
import sys
import unicodedata
import pandas as pd
import re
import csv
import itertools
from textblob import TextBlob
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
nltk.download('wordnet')


# Generating global variables
STOP_PREFIXES = ("@", "#", "http", "&amp")


def keep_chr(ch):
    '''
    Find all characters that are classifed as punctuation
    in Unicode and combine them into a single string.
    
    Inputs:
      - ch (str): Unicode character
        
    Returns: Boolean
    '''
    return unicodedata.category(ch).startswith('P')


PUNCTUATION = " ".join([chr(i) for i in range(sys.maxunicode)
                        if keep_chr(chr(i))])


# Pre-processing stage
def processing(text, lemmatized):
    '''
    Convert a text of a review into a list of strings.

    Inputs:
      - text (str): text representing one review

    Returns: list of words
    '''
    lemmatizer = WordNetLemmatizer()
    split_text = text.split()
    new_text = []

    for word in split_text:
        # Handle trailing and internal punctuation
        word = word.strip(PUNCTUATION)
        word = word.replace("&apos;", "'")
        word = word.replace("quot;", '"')
        word = word.replace("&quot", '"')

        word = word.lower()

        if lemmatized:
            lemmatizer.lemmatize(word)

        # Split word if "/" present
        if "/" in word:
            words = word.split("/")
            new_text += [word for word in words if not
                         bool(re.search(r"\d", word)) and
                         not word.startswith(STOP_PREFIXES)]
        elif (word and not bool(re.search(r"\d", word))
              and not word.startswith(STOP_PREFIXES)):
            new_text.append(word)

    return new_text


def get_stop_words(all_tokens, num_stop_words=20):
    '''
    Obtain the particular stop words (most frequently occurring
    words) in the sample, which may differ from those in a list
    of generic stop words.

    Inputs:
      - all_tokens (list of lists of str): all tokens
      - num_stop_words (int): number of stop words to remove

    Returns: list of most common tokens
    '''
    all_words = list(itertools.chain.from_iterable(all_tokens))
    freq_dist = FreqDist(all_words)
    stop_words = freq_dist.most_common(num_stop_words)

    return [word[0] for word in stop_words]


def make_ngrams(tokens, n):
    '''
    Take the list of words from a single review and create n-grams.

    Input:
      - text (list of str): list of processed words in a review
      - n (int): maximum number of words per n-gram

    Returns: list of 1- to n-word strings for a single review
    '''
    ngrams = []

    for i in range(1, n+1):
        ngrams += [' '.join(tuple(tokens[j:j+i]))
                   for j in range(len(tokens) - i + 1)]

    return ngrams


# Processing Stage
def count_tokens(tokens):
    '''
    Count each distinct token (entity) in a list of tokens.

    Inputs:
      - tokens (list of str): list of tokens

    Returns: dict mapping tokens to counts
    '''
    rv = {}

    for tok in tokens:
        rv[tok] = rv.get(tok, 0) + 1

    return rv


def compute_idf(docs):
    '''
    Calculate the inverse document frequency (idf) for each
    token in a collection of documents (D), where
        idf(t, D) = log(total number of documents in D / 
                        number of documents containing t).

    Inputs:
      - docs (list of list of str): list of lists of tokens

    Returns: dict mapping terms to idf values
    '''
    num_docs = len(docs)

    idf_dict = {}
    docs_set = [set(doc) for doc in docs]
    tokens = set.union(*docs_set)

    for token in tokens:
        docs_with_token = sum([1 for doc in docs_set
                               if (token in doc)])
        idf_dict[token] = math.log(num_docs / docs_with_token)

    return idf_dict


# Vectorizing Stage
def tfidf_vectorize(revs):
    '''
    Calculate the tf_idf for each term per document in a collection
    of documents. By definition,
        tf = 0.5 + 0.5 * (freq_of_term_in_doc / max_freq_in_doc)
    and
        tf_idf = tf * idf.

    Inputs:
      - list of lists of strings (list of str): collection of reviews

    Returns: DataFrame (tf_idf) and dict (idf)
    '''
    token_to_freq_by_rev = [count_tokens(rev) for rev in revs]
    idf = compute_idf(revs)

    for rev in token_to_freq_by_rev:
        max_freq = max(rev.values())
        for token in rev:
            tf = 0.5 + 0.5 * (rev[token] / max_freq)
            rev[token] = tf * idf[token]

    return pd.DataFrame(token_to_freq_by_rev).fillna(0), idf


def get_df_idf_stops(csv_file, n=2, lemmatized=True,
                     num_stop_words=20):
    '''
    Given a dataframe with two columns, rating and text, generate a
    dataframe that vectorizes the text, and join it back with the
    rating column.

    Inputs: 
        csv_file (str): CSV file name
        n (int): range of n-grams to use
        lemmatized (bool): whether or not to lemmatize words
        num_stop_words (int): number of stop words to remove

    Returns: DataFrame, dict (idf), and list (stop words)
    '''

    df = pd.read_csv(csv_file, usecols=[0, 1],
                     names=["Rating", "Text"], header=None)
    all_tokens = [processing(text, lemmatized) for text in df.Text]

    if num_stop_words > 0:
        stop_words = get_stop_words(all_tokens, num_stop_words)
        all_tokens = [[token for token in tokens if token not in stop_words]
                      for tokens in all_tokens]

    ngrams = [make_ngrams(tokens, n) for tokens in all_tokens]

    final_df, idf = tfidf_vectorize(ngrams)
    y_values = df.Rating.astype("category")

    final_df["Rating"] = y_values

    if num_stop_words > 0:
        return final_df, idf, stop_words
    else:
        return final_df, idf, None  # Should we return an empty list here?

[nltk_data] Downloading package wordnet to /Users/jameshu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# crawl_and_scrape.py

import urllib.parse
import requests
import os
import bs4
import urllib3
import certifi
import json
import csv
import re
import time
import random


# Utility functions
MAIN_URL = "https://www.yelp.com"


def read_url(my_url):
    '''
    Load HTML from URL. Return result or empty string if the
    read fails.

    Inputs:
      - my_url (str): URL

    Returns: str
    '''

    pm = urllib3.PoolManager(
        cert_reqs='CERT_REQUIRED',
        ca_certs=certifi.where())

    return pm.urlopen(url=my_url, method="GET").data


def is_absolute_url(url):
    '''
    Determine if a URL is an absolute URL.

    Inputs:
      - url (str): URL

    Returns: Bool
    '''

    if url == "":
        return False
    return urllib.parse.urlparse(url).netloc != ""


def convert_if_relative_url(new_url, main_url=MAIN_URL):
    '''
    Attempt to determine whether new_url is a relative URL and if so,
    use current_url to determine the path and create a new absolute
    URL. Add the protocol, if that is all that is missing.

    Inputs:
      - new_url (str): the path to the restaurants
      - main_url (str): absolute URL

    Returns: str or None

    Examples:
        convert_if_relative_url("/biz/girl-and-the-goat-chicago",
                                "https://www.yelp.com")
        yields "https://www.yelp.com/biz/girl-and-the-goat-chicago"
    '''
    if new_url == "" or not is_absolute_url(main_url):
        return None

    if is_absolute_url(new_url):
        return new_url

    parsed_url = urllib.parse.urlparse(new_url)
    path_parts = parsed_url.path.split("/")

    if len(path_parts) == 0:
        return None

    ext = path_parts[0][-4:]
    if ext in [".edu", ".org", ".com", ".net"]:
        return "http://" + new_url
    else:
        return urllib.parse.urljoin(main_url, new_url)


# Crawling and scraping functions
def get_total_reviews(soup, counter):
    '''
    Given a soup object representing a page, obtain the total
    number of reviews to help the program determine how many
    pages of reviews to scrape.

    Inputs:
      - soup (bs4 object): soup object
      - counter (int): if the program gets blocked by Yelp,
                       how many times should it try again
                       before giving up and skipping (higher
                       number corresponds to longer run-time
                       but fewer pages skipped)

    Returns: (int) total number reviews for a restaurant
    '''
    tag = soup.find("script", type="application/ld+json")

    # Try again if tag cannot be found;
    # number of tries depends on counter
    if not tag:
        for _ in range(counter):
            tag = soup.find("script", type="application/ld+json")
            time.sleep(random.randint(1, 3))
            if tag:
                break
        if not tag:
            return None

    json_object = json.loads(tag.contents[0])
    total_reviews = json_object["aggregateRating"]['reviewCount']

    return total_reviews


def get_reviews_from_page(url, writer, counter):
    '''
    Given a URL and CSV writer object, write all the reviews
    from a given page to the CSV file.

    Inputs: 
      - url (str): URL
      - writer (CSV writer object): CSV writer
      - counter (int): if the program gets blocked by Yelp,
                       how many times should it try again
                       before giving up and skipping (higher
                       number corresponds to longer run-time
                       but fewer pages skipped)
                         
    Returns: None, modifies the CSV file in place
    '''

    html = read_url(url)
    soup = bs4.BeautifulSoup(html, "lxml")
    tag = soup.find("script", type="application/ld+json")

    # Tries again if tag cannot be found, number of tries depend on counter
    if not tag:
        for _ in range(counter):
            tag = soup.find("script", type="application/ld+json")
            time.sleep(random.randint(1, 3))
            if tag:
                break
        if not tag:
            print("Failure at page " + str(url))
            return None

    print("Success at page " + str(url))

    json_object = json.loads(tag.contents[0])
    reviews = json_object["review"]

    for review in reviews:
        rating = review['reviewRating']["ratingValue"]
        text = review["description"]
        row = [rating, text]
        writer.writerow(row)


def crawl_resto(url, writer, counter):
    '''
    Crawl the restaurant and get all reviews from the restaurant

    Inputs:
      - url (str): URL
      - writer (csv writer): writer object
      - counter (int): if the program gets blocked by Yelp,
                       how many times should it try again
                       before giving up and skipping (higher
                       number corresponds to longer run-time
                       but fewer pages skipped)

    Returns: None, modifies the CSV file in place
    '''
    html = read_url(url)
    soup = bs4.BeautifulSoup(html, "lxml")
    total_reviews = get_total_reviews(soup, counter)

    if not total_reviews:
        print('failure at this restaurant ' + str(url))
        return None

    print('sucess at this restaurant' + str(url))

    review_pages = []

    # Each page has 20 reviews, so we increment by 20
    for i in range(0, total_reviews, 20):
        review_pages.append(url + "?start=" + str(i))

    for review_page in review_pages:
        get_reviews_from_page(review_page, writer, counter)

        # Random sleep to avoid being banned by Yelp
        time.sleep(random.randint(1, 3))


def get_links_from_page(url):
    '''
    Given a URL, scrape all other URLs that refer to restaurant
    home pages, and convert it to an absolute URL.

    Inputs: 
      - url (str): URL

    Returns: set of restaurant links from the page
    '''
    html = read_url(url)
    soup = bs4.BeautifulSoup(html, "lxml")
    all_tags = soup.find_all("a", href=True)
    all_links = [tag.get("href") for tag in all_tags]
    good_links = {convert_if_relative_url(link) for link
                  in all_links if link.startswith('/biz')
                  and "?" not in link}

    return good_links


def crawl_city(city_url):
    '''
    Crawl a city and get all the URLs of restaurants within
    the city.

    Inputs:
      - city_url (str): URL of the city's page on Yelp

    Returns: list of restaurant links in city
    '''
    html = read_url(url)
    soup = bs4.BeautifulSoup(html, "lxml")
    
    # Yelp displays 24 pages of reviews for each location
    total_restos = 24
    resto_pages = []

    # Each page has 10 restaurants, so we increment by 10
    for i in range(0, total_restos, 10):
        resto_pages.append(city_url + "&start=" + str(i))

    city_restos = []
    for resto_page in resto_pages:
        city_restos += get_links_from_page(resto_page)
        time.sleep(random.randint(1, 3))  # Random sleep to avoid
                                          # being banned by Yelp

    return city_restos


def crawl_and_scrape(counter=10,
                     city_url=("https://www.yelp.com/""
                               "search?find_desc=&"
                               "find_loc=Chicago%2C%20IL"),
                     csv_repo="scraped_data/"):
    '''
    Crawl the city of Chicago, unless another city url is given,
    and export all reviews from restaurants in that city to a CSV
    file. CSV file does not contain headers.

    Inputs:
      - counter (int): if the program gets blocked by Yelp,
                       how many times should it try again
                       before giving up and skipping (higher
                       number corresponds to longer run-time
                       but fewer pages skipped)
      - city_url (str): Yelp URL of the city
      - csv_repo (str): name of repository in which to store
                        scraped data

    Returns: None, writes a CSV file
    '''
    city_restos = crawl_city(city_url)

    for i, resto in enumerate(city_restos):
        filename = csv_repo + str(i) + ".csv"
        with open(filename, "w") as f:
            csvwriter = csv.writer(f)
            crawl_resto(resto, csvwriter, counter)
            # Random sleep to avoid being banned by Yelp
            time.sleep(random.randint(1, 3))

In [None]:
# merge_data.py


import os
import csv
import pandas as pd
import numpy as np
import glob


# Goal: Collect 10,000 reviews, with 2,000 reviews in each category


def merge_data(scraped_data_dir='scraped_data/', num_samples=10000,
               random_state=1234, write_to_csv=True):
    '''
    First, combine all restaurant reviews into one dataframe.
    Then, group each review by rating and randomly sample 2,000
    reviews from each group. Header argument is determined if
    there is header column in each CSV file from scraped data.

    Return: DataFrame with equal distribution of ratings
    '''
    # Get a list of all CSV filenames in scraped data directory
    all_rest_csv = [scraped_data_dir + file_name for file_name
                    in os.listdir(scraped_data_dir)
                    if file_name.endswith('.csv')]

    # Concatenate all DataFrames together
    df_from_each_file = (pd.read_csv(f) for f in all_rest_csv)
    concatenated_df = pd.concat(df_from_each_file, ignore_index=True)

    # Select (num_samples / 5) from each rating group
    num_samples_per_rating = round(num_samples/5)
    concat_data = (concatenated_df.groupby("Rating")
                   .sample(n=num_samples_per_rating,
                           random_state=random_state)
                   .reset_index(drop=True))
    
    # Write concat_data to CSV
    concat_data.to_csv('merged_data.csv', header=False, index=False)

In [3]:
# model.py

import pandas as pd
import numpy as np
import time
import joblib
import json
from sklearn.svm import LinearSVC, SVC
from sklearn import linear_model, tree, neighbors
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score, accuracy_score
from sklearn import preprocessing
from nltk import pos_tag
# from analyze_words import *
import itertools


def applyModels(model, x_train, y_train):
    '''
    Fit a model to a pair of x and y training data.
    
    Inputs:
      - model (Model): model being fitted
      - x_train (arr): x training data
      - y_train (arr): y training data
    
    Returns: Model
    '''
    model.fit(x_train, y_train)
    return model


def predictModel(model, x_test):
    '''
    Use a model to generate a prediction for y from x testing data.
    
    Inputs:
      - model (Model): model being applied
      - x_test (arr): x testing data
    
    Returns: arr
    '''
    prediction = model.predict(x_test)
    return prediction


def evaluateModel(prediction, y_test):
    '''
    Calculate the accuracy of a model based on the proportion of
    accurate predictions using the testing data. Accuracy is
    weighted by the deviance from the actual rating.
    
    Inputs:
      - prediction (arr): predicted y values
      - y_test (arr): actual y values
    
    Returns: float
    '''
    # Convert into DataFrame for easier handling
    pred_test_df = pd.DataFrame({'predict': prediction,
                                 'actual': y_test}).astype('int')
    pred_test_df['difference'] = (pred_test_df.predict
                                  - pred_test_df.actual).abs()

    num_tests = len(pred_test_df.index)
    total_deviance = pred_test_df['difference'].sum()

    # Maximum deviance is 4 (5-star rating vs. 1-star rating)
    weighted_accuracy = 1 - (total_deviance / (4 * num_tests))

    return weighted_accuracy


def get_weighted_accuracy(x_train, x_test, y_train, y_test, alpha):
    '''
    Calculate weighted accuracy of a model.
    
    Inputs:
      - x_train (arr): x training data
      - x_test (arr): x testing data
      - y_train (arr): y training data
      - y_test (arr): y testing data
      - alpha (float): constant that multiplies regularization term
      
    Returns: float
    '''
    model = linear_model.SGDClassifier(alpha=alpha)
    trained_model = applyModels(model, x_train, y_train)
    prediction = predictModel(trained_model, x_test)
    weighted_accuracy = evaluateModel(prediction, y_test)

    return weighted_accuracy


def transformFeatureSelection(model, x):
    # Need doc string
    return model.transform(x)


def applyFeatureSelection(model, x_train, y_train):
    # Need doc string
    model = model.fit(x_train, y_train)
    return model


def generate_additional_features():
    # In case we want to include number of taggings
    # (like verb, noun) to our columns
    pass


def optimize_model(csv_file, testing_fraction=0.95):
    '''
    Find the optimal combination of parameters (maximum n-gram length,
    whether to lemmatize, number of stop words, and alpha) for the
    suggested star rating model, as well as the corresponding DataFrame, 
    idf dictionary, and list of stop words.
    
    Inputs:
      - csv_file (string): CSV file name
      - testing_fraction (float): proportion of data reserved for testing
    
    Returns: DataFrame, dict (parameters), dict (idf), list of str
    '''
    # Combinations
    ngrams = [1, 2, 3, 4, 5]
    lemmatizes = [True, False]
    stop_words = [0, 10, 20]
    alphas = [0.0001, 0.001, 0.01, 0.1, 1]

    all_combi = list(itertools.product(ngrams, lemmatizes, stop_words, alphas))

    max_accuracy = -1
    best_combi = None
    best_idf = None
    best_df = None
    best_stop = None

    print("Completed initializing.")

    for combi in all_combi:
        ngram, lemmatize, stop_word, alpha = combi
        df, idf, chosen_stops = get_df_idf_stops(csv_file, n=ngram,
                                                 lemmatized=lemmatize,
                                                 num_stop_words=stop_word)
        (x_train, x_test,
         y_train, y_test) = train_test_split(df.drop("Rating", axis=1),
                                             df.Rating,
                                             test_size=testing_fraction,
                                             random_state=33)
        weighted_accuracy = get_weighted_accuracy(x_train, x_test,
                                                  y_train, y_test, alpha)

        print(combi, "Finished testing. | Accuracy: ", weighted_accuracy)

        if weighted_accuracy > max_accuracy:
            max_accuracy = weighted_accuracy
            best_combi = combi
            best_idf = idf
            best_df = df
            best_stop = chosen_stops

    best_combi_dict = {"ngram": best_combi[0], "lemmatize": best_combi[1],
                       "stop_word": best_combi[2], "alpha": best_combi[3]}

    return best_df, best_combi_dict, best_idf, best_stop


def main_modelling(csv_file, testing_fraction=0.95):
    '''
    Generate the optimal model for predicting Yelp review ratings by
    cycling through combinations of parameters and save it as a PKL file.
    
    Inputs:
      - csv_file (string): CSV file name
      - testing_fraction (float): proportion of data reserved for testing
    
    Returns: None, writes PKL file
    '''
    # Input and Model Tuning
    df, comb, idf, stop = optimize_model(csv_file, testing_fraction)

    (x_train, x_test,
     y_train, y_test) = train_test_split(df.drop("Rating", axis=1),
                                         df.Rating,
                                         test_size=testing_fraction,
                                         random_state=33)

    # Feature Selection
    model = linear_model.SGDClassifier(alpha=comb["alpha"])
    trained_model = applyModels(model, x_train, y_train)
    feature_selection_model = SelectFromModel(trained_model)
    trained_feature_selection_model = applyFeatureSelection(feature_selection_model,
                                                            x_train, y_train)
    x_train = transformFeatureSelection(trained_feature_selection_model,
                                        x_train)
    x_test = transformFeatureSelection(trained_feature_selection_model,
                                       x_test)

    final_model = applyModels(model, x_train, y_train)
    prediction = predictModel(final_model, x_test)

    print("Final Model Classification Report")
    print(classification_report(prediction, y_test))
    print("Accuracy Score")
    print(evaluateModel(prediction, y_test))

    # Save best Model
    joblib.dump(final_model, "optimal_args/final_model.pkl")

    # Save best columns, idf, combination, and stop words
    feature_idx = trained_feature_selection_model.get_support()
    column_names = df.drop("Rating", axis=1).columns[feature_idx]
    with open('optimal_args/columns.json', 'w') as f:
        json.dump(list(column_names), f)
    with open('optimal_args/idf.json', 'w') as f:
        json.dump(idf, f)
    with open('optimal_args/combination.json', 'w') as f:
        json.dump(comb, f)
    with open('optimal_args/stop_words.json', 'w') as f:
        json.dump(stop, f)

In [None]:
# main.py

# from analyze_words import *
# from model import *
import sys
import json
import joblib


def user_interface():
    '''Prompt user to input a review, and suggest a star rating.'''
    print("==================================================")
    print("   Welcome to the Suggested Star Rating System!")
    print()
    print("            Copy and paste your review.")
    print()
    print("       Type Control-D to exit the program.")
    print("==================================================")
    print()
    try:
        while True:
            review = input("Enter review here: ")
            review = str(review)
            if len(review) >= 50:
                break
            else:
                print("Please input a longer review.")

        x_array = process_input(review)
        final_model = joblib.load("perfect_model.pkl")
        prediction = predictModel(final_model, [x_array])
        star_rating = int(prediction)

        print("Your suggested star rating is: {}".format(star_rating))
        print("Thank you for using our Suggested Star Rating System!")
    except EOFError:
        sys.exit()


def process_input(user_input):
    '''
    Convert a review input by the user into an array of zeros,
    where each item corresponding to a valid n-gram in the input
    is replaced by the n-gram's tfidf. This allows a review to be
    evaluated by a model.
    
    Inputs:
      - user_input (str): review input by user
      
    Returns: arr
    '''
    with open("columns.json") as f:
        columns = json.load(f)
    with open("idf.json") as f:
        idf = json.load(f)
    with open("combination.json") as f:
        comb = json.load(f)
    with open("stop_words.json") as f:
        stop_words = json.load(f)

    processed_input = processing(user_input, comb["lemmatize"])

    if comb['stop_word'] > 0:
        processed_input = [token for token in processed_input
                           if token not in stop_words]

    ngrams = make_ngrams(processed_input, comb["ngram"])
    tf = compute_tf(ngrams)

    ngrams_set = set(ngrams)
    columns_set = set(columns)
    indices = pd.Index(columns)

    x_array = np.zeros(len(columns))

    for token in ngrams_set:
        if token in columns_set:
            tfidf = tf[token] * idf[token]
            index = indices.get_loc(token)
            x_array[index] = tfidf

    return x_array


def compute_tf(doc):
    '''
    Compute the augmented term frequency (tf) of the tokens
    in a document.

    Inputs: 
      - doc (list of str): a list of tokens

    Returns: dict mapping terms to tf values
    '''
    token_dict = count_tokens(doc)
    tf_dict = {}
    max_count = max(token_dict.values())

    for token, count in token_dict.items():
        tf_dict[token] = 0.5 + 0.5 * (count / max_count)

    return tf_dict


if __name__ == "__main__":
    user_interface()

In [12]:
df, idf, stops = get_df_idf_stops("merged_data.csv")

In [20]:
l1

['gross',
 'bloody',
 'burgers',
 'three',
 'heady',
 'beers',
 'blaring',
 'music',
 'ears',
 'still',
 'left',
 'tip',
 'asked',
 'medium',
 'got',
 'masks',
 'everytime',
 'you',
 'talk',
 'server',
 'so',
 'if',
 "don't",
 'want',
 'loud',
 'metal',
 'or',
 'mask',
 'then',
 'skip',
 'place',
 'gross bloody',
 'bloody burgers',
 'burgers three',
 'three heady',
 'heady beers',
 'beers blaring',
 'blaring music',
 'music ears',
 'ears still',
 'still left',
 'left tip',
 'tip asked',
 'asked medium',
 'medium got',
 'got bloody',
 'bloody tip',
 'tip masks',
 'masks everytime',
 'everytime you',
 'you talk',
 'talk server',
 'server so',
 'so if',
 'if you',
 "you don't",
 "don't want",
 'want bloody',
 'burgers loud',
 'loud metal',
 'metal music',
 'music or',
 'or mask',
 'mask everytime',
 'everytime then',
 'then skip',
 'skip place',
 'update',
 'after',
 'two',
 'weeks',
 'have',
 'received',
 'promised',
 'refund',
 'item',
 'out',
 'order',
 'make',
 'right',
 "pequod's",
 

In [14]:
idf

{'': 5.599422459331958,
 'off putting': 6.725433722188183,
 'rib pasta': 4.8283137373023015,
 'divine main': 7.600902459542082,
 'yes yes': 7.418580902748128,
 'requiring reservations': 8.517193191416238,
 'various sauces': 6.3771270279199666,
 'line be': 7.824046010856292,
 'courtesy covid': 8.517193191416238,
 'small as': 5.7763531674910364,
 'dining limit': 7.1308988302963465,
 'ready have': 9.210340371976184,
 'other pastas': 8.111728083308073,
 'salad which': 6.16581793425276,
 'covid new': 8.517193191416238,
 'perhaps': 4.866534950122499,
 'on butter': 6.812445099177812,
 'want say': 5.991464547107982,
 'figured would': 8.517193191416238,
 'late night': 4.667045589706179,
 'omg so': 7.418580902748128,
 "seminar lou's": 6.812445099177812,
 'waited over': 7.418580902748128,
 'stop here': 8.111728083308073,
 'she has': 6.074846156047033,
 'have seen': 8.517193191416238,
 'differentiate them': 4.853631545286591,
 'character': 6.3771270279199666,
 "simple don't": 7.418580902748128,
 '

In [15]:
stops

['the',
 'and',
 'i',
 'a',
 'to',
 'was',
 'it',
 'of',
 'we',
 'for',
 'in',
 'is',
 'but',
 'that',
 'this',
 'with',
 'they',
 'were',
 'my',
 'not']