In [6]:
import math
import sys
import unicodedata
import pandas as pd
import re
import csv
import itertools
import string
from textblob import TextBlob
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
nltk.download("wordnet")


# Generating global variables
STOP_PREFIXES = ("@", "#", "http", "&amp")
PUNCTUATION = string.punctuation + "…"  # added the special character elipsis
INTERNAL_PUNCTUATION = set(PUNCTUATION) - {"'"}  # want to keep apostrophe


# Pre-processing stage
def processing(text):
    '''
    Convert a text of a review into a list of strings.

    Inputs:
      - text (str): text representing one review

    Returns: list of words
    '''
    text = re.sub("([^\x00-\x7F])+", " ", text)
    lemmatizer = WordNetLemmatizer()
    split_text = text.split()
    new_text = []

    for word in split_text:
        # Handle trailing punctuation
        word = word.replace("&apos;", "'")
        word = word.replace("quot;", '"')
        word = word.replace("&quot", '"')
        word = word.strip(PUNCTUATION)

        # Handle internal punctuation
        word_set = set(word)
        punc_in_word = word_set.intersection(INTERNAL_PUNCTUATION)

        for punc in punc_in_word:
            word = word.replace(punc, " ")

        for word in word.split():
            word = word.lower()
            lemmatizer.lemmatize(word)
            if (word and not bool(re.search(r"\d", word))
                    and not word.startswith(STOP_PREFIXES)):
                new_text.append(word)

    return new_text


def get_stop_words(corpus, num_stop_words=20):
    '''
    Obtain the particular stop words (most frequently occurring
    words) in the sample, which may differ from those in a list
    of generic stop words.

    Inputs:
      - all_tokens (list of lists of str): all tokens
      - num_stop_words (int): number of stop words to remove

    Returns: list of most common tokens
    '''
    count_vectorizer = CountVectorizer(tokenizer=processing)
    X = count_vectorizer.fit_transform(corpus)

    sum_words = X.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in count_vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    words_freq_no_count = [word for word, _ in words_freq]

    return words_freq_no_count[: num_stop_words]


def get_df_idf_stops(csv_file, n, num_stop_words):
    '''
    Given a dataframe with two columns, rating and text, generate a
    dataframe that vectorizes the text, and join it back with the
    rating column.

    Inputs:
        csv_file (str): CSV file containing scraped Yelp reviews
        n (int): range of n-grams to use
        num_stop_words (int): number of stop words to remove

    Returns: DataFrame, dict (idf), and list (stop words)
    '''
    df = pd.read_csv(csv_file)
    corpus = df.Text

    if num_stop_words > 0:
        stop_words = get_stop_words(corpus, num_stop_words)

    # ngrams = [make_ngrams(tokens, n) for tokens in all_tokens]
    idf_vectorizer = TfidfVectorizer(stop_words=stop_words, \
                                     tokenizer=processing, \
                                     ngram_range = (1,n))
    X = idf_vectorizer.fit_transform(corpus)
    final_df = pd.DataFrame(X.toarray(), columns=idf_vectorizer.get_feature_names())
    y_values = df.Rating.astype("category")

    final_df["Rating"] = y_values
    pickle.dump(idf_vectorizer, open("idf_vectorizer.pickle", "wb"))

    if num_stop_words > 0:
        return final_df, stop_words

    return final_df,[]


[nltk_data] Downloading package wordnet to /Users/zqy1998/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import math
import sys
import unicodedata
import pandas as pd
import re
import csv
import itertools
import string
from textblob import TextBlob
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /Users/zqy1998/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:

num_stop_words = 10
df = pd.read_csv("smaller_dataset.csv")
corpus = df.Text

if num_stop_words > 0:
    stop_words = get_stop_words(corpus, num_stop_words)

print(stop_words)
# idf_vectorizer = TfidfVectorizer(
#     stop_words=NEW_STOP_WORDS, tokenizer=custom_tokenize, use_idf=True)
# X = idf_vectorizer.fit_transform(speech_by_month)
# tf_df_all_months = pd.DataFrame(
#     X.toarray(), columns=idf_vectorizer.get_feature_names())


['the', 'and', 'a', 'i', 'to', 'was', 'of', 'it', 'for', 'is']


In [20]:
%%time
num_stop_words = 10
df = pd.read_csv("smaller_dataset.csv")
corpus = df.Text

if num_stop_words > 0:
    stop_words = get_stop_words(corpus, num_stop_words)

print(stop_words)
# idf_vectorizer = TfidfVectorizer(
#     stop_words=NEW_STOP_WORDS, tokenizer=custom_tokenize, use_idf=True)
# X = idf_vectorizer.fit_transform(speech_by_month)
# tf_df_all_months = pd.DataFrame(
#     X.toarray(), columns=idf_vectorizer.get_feature_names())

['the', 'and', 'a', 'i', 'to', 'was', 'of', 'it', 'for', 'is']
CPU times: user 46.8 s, sys: 1.75 s, total: 48.5 s
Wall time: 56.9 s


In [39]:
idf_vectorizer = TfidfVectorizer(stop_words=stop_words, tokenizer=processing)
X = idf_vectorizer.fit_transform(corpus)

X



# y_values = df.Rating.astype("category")

<10000x24991 sparse matrix of type '<class 'numpy.float64'>'
	with 802038 stored elements in Compressed Sparse Row format>

In [32]:
df = pd.DataFrame(
        X.toarray(), columns=idf_vectorizer.get_feature_names())


In [34]:
df.columns

Index([''s', 'a'la', 'aa', 'aaa', 'aaaaallllllllll',
       'aaaaammmmmmaaazzzziiiinnnngggggggg', 'aaaaawwweesssoomee', 'aaaand',
       'aahhhing', 'aak',
       ...
       'zucca', 'zucchini', 'zuccini', 'zuke', 'zuppa', 'zuri', 'zuur',
       'zuurkool', 'zwiebel', 'zzz'],
      dtype='object', length=24991)

In [3]:
final_df= get_df_idf_stops("smaller_dataset.csv", 2, 10)

In [8]:
pd.options.display.max_columns = 100

final_df[0]

Unnamed: 0,'s,'s menu,'s recently,'s review,a'la,a'la carte,aa,aa tips,aaa,aaa without,aaaaallllllllll,aaaaallllllllll other,aaaaammmmmmaaazzzziiiinnnngggggggg,aaaaammmmmmaaazzzziiiinnnngggggggg this,aaaaawwweesssoomee,aaaaawwweesssoomee it's,aaaand,aaaand now,aahhhing,aahhhing over,aak,aak extra,aamazing,aamazing omg,aaron,aaron choose,aaron genuinely,aash,aash reshteh,aback,aback by,aback like,aback or,aback when,abalone,abalone clams,abalone scallop,abandon,abandon cold,abandoned,abandoned parking,abandoned several,abandoned warehouses,abandoning,abandoning even,abbey,abbey champagne,abbot's,abbot's dreyer's,abbq,...,zucchini bread,zucchini cakes,zucchini carrot,zucchini carrots,zucchini cheese,zucchini cherry,zucchini chips,zucchini chopped,zucchini dumplings,zucchini enoki,zucchini fennel,zucchini flowers,zucchini fries,zucchini frites,zucchini fritters,zucchini gnocchi,zucchini have,zucchini in,zucchini lots,zucchini montreal,zucchini noodles,zucchini patatas,zucchini peppers,zucchini pizzas,zucchini pumpkin,zucchini rice,zucchini roasted,zucchini squash,zucchini such,zucchini tomatoes,zucchini too,zucchini vermicelli,zuccini,zuccini he,zuke,zuke salmon,zuppa,zuppa de,zuppa di,zuppa lobster,zuri,zuri absolutely,zuur,zuur vinegar,zuurkool,zuurkool rabbit,zwiebel,zwiebel butterso,zzz,Rating
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083643,0.090213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
