In [1]:
import sys, os, re, json
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from collections import defaultdict, Counter
from typing import List, Tuple, Dict

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Fetch the NLTK stopwords, punkt and wordnet resources.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Redux
[nltk_data]     Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Redux
[nltk_data]     Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Redux
[nltk_data]     Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Input files opened by get_restaurants / get_reviews; the paths are relative
# to the notebook's working directory.
file_business = "yelp_academic_dataset_business.json"
file_review = "yelp_academic_dataset_review.json"

In [4]:
def get_restaurants(file_business):
    """Collect the IDs of all businesses tagged as restaurants.

    Reads ``file_business`` as JSON Lines (one business object per line) and
    keeps every record whose ``categories`` value contains "restaurants"
    (case-insensitive substring match).

    Args:
        file_business: Path to the business JSON Lines file (UTF-8).

    Returns:
        set: The ``business_id`` values of the matching records.
    """
    restaurants = set()

    with open(file_business, 'r', encoding='utf-8') as file:
        for json_str in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not json_str.strip():
                continue

            obj = json.loads(json_str)
            # .get() so that records lacking either field are skipped
            # instead of aborting the whole scan with a KeyError.
            business_id = obj.get("business_id")
            categories = obj.get("categories")

            # Accept a list of category names as well as a single string.
            if isinstance(categories, (list, tuple)):
                categories = ", ".join(str(item) for item in categories)

            if business_id and isinstance(categories, str) \
                    and "restaurants" in categories.lower():
                restaurants.add(business_id)

    return restaurants

In [5]:
# Build the restaurant ID set once (timed); get_reviews reads this notebook-level name.
%time restaurants = get_restaurants(file_business)

CPU times: total: 1.23 s
Wall time: 1.46 s


In [6]:
# Sanity check: how many distinct restaurant IDs were collected.
print("Number of restaurants:", len(restaurants))

Number of restaurants: 52268


In [7]:
def get_reviews(file_review, restaurant_ids=None):
    """Load the text and star rating of every review of a known restaurant.

    Reads ``file_review`` as JSON Lines (one review object per line). A review
    is kept when its ``business_id`` is in ``restaurant_ids`` and both its
    ``text`` and ``stars`` fields are present and truthy.

    Args:
        file_review: Path to the review JSON Lines file (UTF-8).
        restaurant_ids: Collection of business IDs to keep. When omitted, the
            notebook-level ``restaurants`` set is used, which preserves the
            original single-argument call.

    Returns:
        pandas.DataFrame: Columns ``text`` and ``stars`` (stars cast to int),
        one row per kept review, in file order.
    """
    if restaurant_ids is None:
        # Fall back to the global built by get_restaurants().
        restaurant_ids = restaurants

    text_list = []
    stars_list = []

    with open(file_review, 'r', encoding='utf-8') as file:
        for json_str in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not json_str.strip():
                continue

            obj = json.loads(json_str)
            # .get() so that incomplete records are skipped rather than
            # raising KeyError part-way through a long scan.
            business_id = obj.get("business_id")
            if not business_id or business_id not in restaurant_ids:
                continue

            text = obj.get("text")
            stars = obj.get("stars")
            if text and stars:
                text_list.append(text)
                stars_list.append(int(stars))

    return pd.DataFrame({"text": text_list, "stars": stars_list})

In [8]:
# Parse the full review file (timed); relies on `restaurants` having been built already.
%time df_review = get_reviews(file_review)

CPU times: total: 45.5 s
Wall time: 46.6 s


In [11]:
# Work on a random subset of at most 500,000 reviews.
# A fixed random_state makes the draw reproducible across kernel restarts, and
# capping n at the frame length avoids the ValueError that sample() raises when
# asked for more rows than exist without replacement.
df_review = df_review.sample(n=min(500000, len(df_review)), random_state=42)

In [12]:
# Absolute number of reviews per star rating in the sampled frame.
df_review["stars"].value_counts(normalize=False)

5    220023
4    119591
1     60195
3     57349
2     42842
Name: stars, dtype: int64

In [13]:
def plot_labels(df, title=None):
    """Draw a bar chart of the share of rows per star rating.

    The ratios come from ``df["stars"]`` and are ordered by rating. When
    ``title`` is given, the figure is also written to ``<title>.eps`` before
    it is shown; ``title`` is used only as the file stem.
    """
    star_ratios = df["stars"].value_counts(normalize=True).sort_index()

    plt.figure(figsize=(4, 3))
    axes = star_ratios.plot(kind="bar")
    axes.set_xlabel("Stars")
    axes.set_ylabel("Ratio")

    save_requested = title is not None
    if save_requested:
        plt.savefig(title + ".eps")
    plt.show()

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 5,000 reviews for testing, then a further 5,000 of the remainder for
# validation; whatever is left stays in df_train_raw. Both splits are seeded.
df_train_raw, df_test = train_test_split(df_review, test_size=5000, 
                                         random_state=42, shuffle=True)
df_train_raw, df_dev = train_test_split(df_train_raw, test_size=5000,
                                        random_state=42, shuffle=True)

In [17]:
from sklearn.utils import resample

def undersample(df, group_size=5000):
    """Balance ``df`` by drawing ``group_size`` rows from every star rating.

    Rows are drawn per rating without replacement, and the combined frame is
    shuffled before it is returned; both steps use a fixed seed of 0.
    """
    star_values = df["stars"].value_counts().index
    balanced_groups = [
        resample(df[df["stars"] == star],
                 replace=False,
                 n_samples=group_size,
                 random_state=0)
        for star in star_values
    ]
    return pd.concat(balanced_groups).sample(frac=1, random_state=0)

In [18]:
# Balanced training set: 1,250 reviews per star rating drawn from the training split.
df_train = undersample(df_train_raw, 1250)

In [19]:
# Row counts of the full sample and of each split.
num_review, num_train, num_dev, num_test = (
    len(frame) for frame in (df_review, df_train, df_dev, df_test)
)

In [23]:
# Add a zero-based class label (stars minus one) to every split.
for split_frame in (df_train, df_dev, df_test):
    split_frame["labels"] = split_frame["stars"] - 1

In [25]:
from nltk.tokenize import word_tokenize

In [None]:
# Scratch preview of the tokenisation on the first 10 training texts: replace every
# character outside [a-zA-Z0-9' ] with a space, then split on whitespace.
# NOTE(review): ds_temp is not read anywhere else in the visible notebook.
ds_temp = df_train["text"].iloc[:10].map(lambda text: re.sub(r"[^a-zA-Z0-9' ]", ' ', text)).map(lambda text: text.split())

In [27]:
# Tokenise every training text (timed): characters outside [a-zA-Z0-9' ] become
# spaces, then the text is split on whitespace into a list stored in "tokens".
%time df_train["tokens"] = df_train["text"].map(lambda text: re.sub(r"[^a-zA-Z0-9' ]", ' ', text)).map(lambda text: text.split())

CPU times: total: 359 ms
Wall time: 400 ms


In [None]:
# Distribution of review length (in tokens) over the training set.
# Each row is weighted by 1/num_train, so bar heights are ratios rather than counts.
df_train["num_tokens"] = df_train["tokens"].map(len)
ax = df_train["num_tokens"].hist(bins=20, 
                                 figsize=(4,3),
                                 weights=np.ones(num_train)/num_train)
ax.set_xlabel("Number of Tokens")
ax.set_ylabel("Ratio")

In [30]:
# Remove the temporary length-analysis columns; this mutates df_train in place.
df_train.drop(columns=["tokens", "num_tokens"], inplace=True)

In [31]:
# Token count per review over the whole sample (timed): characters outside
# [a-zA-Z0-9' ] become spaces, then whitespace-separated pieces are counted.
%time df_review["num_tokens"] = df_review["text"].\
map(lambda text: re.sub(r"[^a-zA-Z0-9' ]", ' ', text)).\
map(lambda text: len(text.split()))

CPU times: total: 5.92 s
Wall time: 6.06 s


In [33]:
# Share of reviews whose token count fits within each candidate length limit.
for max_tokens in (128, 256):
    print(len(df_review[df_review["num_tokens"] <= max_tokens]) / len(df_review["num_tokens"]))

0.750244
0.937732


In [34]:
# Remove the temporary token-count column; this mutates df_review in place.
df_review.drop(columns=["num_tokens"], inplace=True)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def extract_features(df_train, df_dev, df_test):
    """Turn the three splits into TF-IDF matrices and star-rating targets.

    The vectorizer (word unigrams and bigrams, English stop words removed,
    terms kept only when ``min_df=5`` is met) is fitted on the training texts
    and then applied unchanged to the validation and test texts.

    Returns:
        tuple: ``(X_train, X_dev, X_test, y_train, y_dev, y_test)`` - the
        transformed feature matrices followed by the ``stars`` column of each
        split as a plain list.
    """
    tfidf = TfidfVectorizer(
        analyzer='word',
        stop_words='english',
        ngram_range=(1, 2),
        lowercase=True,
        min_df=5,
        binary=False,
    )

    # Fit on training text only so no vocabulary leaks in from dev/test.
    features_train = tfidf.fit_transform(df_train["text"])
    features_dev, features_test = (
        tfidf.transform(frame["text"]) for frame in (df_dev, df_test)
    )

    targets_train, targets_dev, targets_test = (
        frame["stars"].tolist() for frame in (df_train, df_dev, df_test)
    )

    return (features_train, features_dev, features_test,
            targets_train, targets_dev, targets_test)

In [36]:
# Vectorise all three splits (timed); these notebook-level names are read by evaluate_model.
%time X_train, X_dev, X_test, y_train, y_dev, y_test = extract_features(df_train, df_dev, df_test)

CPU times: total: 4.19 s
Wall time: 4.33 s


In [39]:
def evaluate_model_Xy(model, X, y, y_pred=None, label="Training", model_name="model"):
    """Print the accuracy and classification report for one data set.

    Args:
        model: Estimator whose ``predict`` is called only when ``y_pred`` is None.
        X: Features passed to ``model.predict``.
        y: True targets.
        y_pred: Optional precomputed predictions for ``X``.
        label: Name printed in the "<label> Set" heading.
        model_name: Not used by this function.
    """
    predictions = model.predict(X) if y_pred is None else y_pred

    print(label + " Set")
    # Each metric is followed by one blank line.
    print("Accuracy:", accuracy_score(y, predictions), end="\n\n")
    print(classification_report(y, predictions, digits=4), end="\n\n")
    

def evaluate_model(model, model_name="model",
                   y_train_pred=None, y_dev_pred=None, y_test_pred=None):
    """Report validation-set and test-set metrics for a model.

    Uses the notebook-level ``X_dev`` / ``y_dev`` and ``X_test`` / ``y_test``.

    Args:
        model: Estimator handed to ``evaluate_model_Xy``.
        model_name: Forwarded to ``evaluate_model_Xy``.
        y_train_pred: Accepted for call compatibility; the training set is not
            evaluated here, so this value is unused.
        y_dev_pred: Optional precomputed validation predictions. When None,
            ``model.predict(X_dev)`` is used.
        y_test_pred: Optional precomputed test predictions. When None,
            ``model.predict(X_test)`` is used.
    """
    # Forward the precomputed predictions: previously these arguments were
    # accepted but ignored, so the model was always re-run on both sets.
    evaluate_model_Xy(model, X_dev, y_dev, y_pred=y_dev_pred,
                      label="Validation", model_name=model_name)
    evaluate_model_Xy(model, X_test, y_test, y_pred=y_test_pred,
                      label="Testing", model_name=model_name)

In [40]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, mean_squared_error
from sklearn.metrics import classification_report

In [51]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees with balanced class weights and a fixed seed.
# max_features="sqrt" is what "auto" meant for classifiers; "auto" was
# deprecated in scikit-learn 1.1 and removed in 1.3, so spelling it out keeps
# the same model while avoiding the warning / error on newer versions.
clf_rf = RandomForestClassifier(n_estimators=500,
                                criterion="gini",
                                max_depth=None,
                                min_samples_split=2,
                                min_samples_leaf=10,
                                max_features="sqrt",
                                n_jobs=-1,
                                verbose=1,
                                random_state=0,
                                class_weight='balanced')

In [52]:
# Train the random forest on the training features and star-rating targets.
clf_rf.fit(X_train, y_train)

  warn(
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    3.8s finished


In [53]:
# Print validation and test metrics for the random forest.
evaluate_model(clf_rf, model_name="rf")

[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.


Validation Set
Accuracy: 0.545

              precision    recall  f1-score   support

           1     0.5747    0.7708    0.6585       250
           2     0.3573    0.3318    0.3441       250
           3     0.3646    0.3722    0.3683       250
           4     0.4733    0.3447    0.3988       250
           5     0.7237    0.7692    0.7458       250

    accuracy                         0.5760      1250
   macro avg     0.4987    0.5177    0.5031      1250
weighted avg     0.5704    0.5820    0.5716      1250




[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 418 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 500 out of 500 | elapsed:    0.1s finished


Testing Set
Accuracy: 0.5978

              precision    recall  f1-score   support

           1     0.5369    0.7439    0.6205       250
           2     0.3675    0.3395    0.3329       250
           3     0.3810    0.3905    0.3657       250
           4     0.5108    0.3581    0.4211       250
           5     0.7455    0.7762    0.7605       250

    accuracy                         0.5451      1250
   macro avg     0.5083    0.5316    0.5122      1250
weighted avg     0.5914    0.5978    0.5483      1250




In [54]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Hinge-loss linear classifier trained with SGD, preceded by a scaling step
# that leaves the feature means untouched (with_mean=False).
scaler_step = StandardScaler(with_mean=False)
sgd_step = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=30,
    max_iter=1000,
    tol=1e-3,
    shuffle=True,
    verbose=1,
    n_jobs=-1,
    random_state=0,
    learning_rate='optimal',
    early_stopping=True,
    class_weight='balanced',
)
clf_sgd = make_pipeline(scaler_step, sgd_step)

In [55]:
# Train the scaler + SGD pipeline on the training features and star-rating targets.
clf_sgd.fit(X_train, y_train)

-- Epoch 1-- Epoch 1
-- Epoch 1

-- Epoch 1
-- Epoch 1
Norm: 0.12, NNZs: 41588, Bias: -0.005242, T: 22500, Avg. loss: 0.540805
Total training time: 0.01 seconds.
Norm: 0.11, NNZs: 41466, Bias: -0.006059, T: 22500, Avg. loss: 0.535256
Total training time: 0.01 seconds.
Norm: 0.11, NNZs: 41627, Bias: 0.002244, T: 22500, Avg. loss: 0.611678
Total training time: 0.01 seconds.
Norm: 0.11, NNZs: 41567, Bias: -0.005366, T: 22500, Avg. loss: 0.588282
Total training time: 0.01 seconds.
Norm: 0.11, NNZs: 41647, Bias: -0.005802, T: 22500, Avg. loss: 0.602495
Total training time: 0.01 seconds.
-- Epoch 2
-- Epoch 2
-- Epoch 2
-- Epoch 2
-- Epoch 2
Norm: 0.11, NNZs: 41535, Bias: -0.006163, T: 45000, Avg. loss: 0.501805
Total training time: 0.02 seconds.
Norm: 0.11, NNZs: 41663, Bias: -0.005920, T: 45000, Avg. loss: 0.564948
Total training time: 0.02 seconds.
Norm: 0.11, NNZs: 41648, Bias: 0.002127, T: 45000, Avg. loss: 0.572262
Total training time: 0.02 seconds.
Norm: 0.11, NNZs: 41618, Bias: -0.00

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [56]:
# Print validation and test metrics for the SGD pipeline.
evaluate_model(clf_sgd, model_name="sgd")

Validation Set
Accuracy: 0.5786

              precision    recall  f1-score   support

           1     0.5566    0.7724    0.6470       250
           2     0.3706    0.3252    0.3464       250
           3     0.3451    0.3391    0.3421       250
           4     0.4587    0.4051    0.4303       250
           5     0.7496    0.7371    0.7433       250

    accuracy                         0.5786      1250
   macro avg     0.4961    0.5158    0.5018      1250
weighted avg     0.5748    0.5786    0.5739      1250


Testing Set
Accuracy: 0.6028

              precision    recall  f1-score   support

           1     0.5364    0.8160    0.6473       250
           2     0.3911    0.3233    0.3540       250
           3     0.3930    0.3763    0.3845       250
           4     0.4844    0.4316    0.4565       250
           5     0.7765    0.7493    0.7627       250

    accuracy                         0.6028      1250
   macro avg     0.5163    0.5393    0.5210      1250
weighted avg 