In [32]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import (
    StringLookup, TextVectorization)

tf.random.set_seed(1)


In [2]:
data = pd.read_csv("data/data.csv")

In [10]:
data.head()

Unnamed: 0.1,Unnamed: 0,User_id,Bus_id,Star,Useful,Cool,Funny,Review,State,City,Bus_Ave_Star,User_Review_count,User_Useful_count,User_Funny_count,User_Cool_count,Elite,User_Fans,Users_Ave_Star
0,0,4hBhtCSgoxkrFgHa4YAD-w,bbEXAEFr4RYHLlZ-HFssTA,5.0,0,0,0,"Great burgers,fries and salad! Burgers have a...",CA,Goleta,4.0,922,1687,694,1070,2015201620172018201920202021,51,4.2
1,1,JYYYKt6TdVA4ng9lLcXt_g,SZU9c8V2GuREDN5KgyHFJw,5.0,0,0,0,We were a bit weary about trying the Shellfish...,CA,Santa Barbara,4.0,338,800,144,353,"2012,2013,2014,2015,2016,2017,2018,2019,20,20,...",30,4.12
2,2,Uk3X2AypU8AqvcYEVf7s6Q,eL4lyE7LNoXEMvpcJ8WNVw,3.0,5,0,0,This was a tough one! On the merits of the w...,CA,Santa Barbara,4.0,431,2126,1245,1476,"2012,2013,2014,2015,2016,2017,2018,2019,20,20,...",76,4.05
3,3,LcqNuhqaYt5ekKzaRirmIg,SZU9c8V2GuREDN5KgyHFJw,5.0,2,1,0,"I love trying fresh seafood on piers, wharfs a...",CA,Santa Barbara,4.0,258,452,125,183,201520162017,37,3.99
4,4,gasLVm0KRwrVhPGRcqATjw,CHh0ZFrQcsk4boOItr2Zuw,4.0,1,2,0,I stopped in because I was hungry for some sna...,CA,Carpinteria,3.0,1638,2658,933,2297,20142015201620172018201920202021,105,3.79


In [3]:
data.shape

(53845, 18)

In [4]:
data.City.unique()

array(['Goleta', 'Santa Barbara', 'Carpinteria', 'Montecito',
       'Isla Vista', 'Santa Barbara ', 'Truckee', 'Summerland',
       'Port Hueneme', 'West Hill', 'Los Angeles', 'Mission Canyon',
       'Sparks', 'Kings Beach', 'Cerritos', 'Santa Clara', 'Reno',
       'Real Goleta', 'Aliso Viejo', 'Santa Barbara & Ventura Counties',
       'South Lake Tahoe', 'Santa Barbra', 'Santa  Barbara', 'Salinas',
       'Santa Maria'], dtype=object)

## Data Processing


In [11]:
import nltk
from nltk.corpus import stopwords
from string import punctuation
import re
nltk.download('stopwords')
stops = set(stopwords.words('english'))
stops.add("'s")
stops.remove("not")


def clean(sentence):
    # delete stopwords
    temp = " ".join(filter(lambda x: x not in stops, sentence.split()))
    # Remove punctuation
    temp = temp.translate(str.maketrans('', '', punctuation))
    # remove non-english characters
    temp = temp.encode("ascii", "ignore").decode()
    # Change all to lower case
    temp = temp.lower()
    # Delete numbers
    temp = re.sub(r'[0-9]', "", temp)
    # Delete excessive spaces and return
    return re.sub("  ", " ", temp)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wmy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
data["Review"] = data.Review.apply(clean)

In [14]:
data.head()

Unnamed: 0.1,Unnamed: 0,User_id,Bus_id,Star,Useful,Cool,Funny,Review,State,City,Bus_Ave_Star,User_Review_count,User_Useful_count,User_Funny_count,User_Cool_count,Elite,User_Fans,Users_Ave_Star
0,0,4hBhtCSgoxkrFgHa4YAD-w,bbEXAEFr4RYHLlZ-HFssTA,5.0,0,0,0,great burgersfries salad burgers hint salt pep...,CA,Goleta,4.0,922,1687,694,1070,2015201620172018201920202021,51,4.2
1,1,JYYYKt6TdVA4ng9lLcXt_g,SZU9c8V2GuREDN5KgyHFJw,5.0,0,0,0,we bit weary trying shellfish company wharf of...,CA,Santa Barbara,4.0,338,800,144,353,"2012,2013,2014,2015,2016,2017,2018,2019,20,20,...",30,4.12
2,2,Uk3X2AypU8AqvcYEVf7s6Q,eL4lyE7LNoXEMvpcJ8WNVw,3.0,5,0,0,this tough one on merits wine fairly average i...,CA,Santa Barbara,4.0,431,2126,1245,1476,"2012,2013,2014,2015,2016,2017,2018,2019,20,20,...",76,4.05
3,3,LcqNuhqaYt5ekKzaRirmIg,SZU9c8V2GuREDN5KgyHFJw,5.0,2,1,0,i love trying fresh seafood piers wharfs seasi...,CA,Santa Barbara,4.0,258,452,125,183,201520162017,37,3.99
4,4,gasLVm0KRwrVhPGRcqATjw,CHh0ZFrQcsk4boOItr2Zuw,4.0,1,2,0,i stopped i hungry snacks browsed store since ...,CA,Carpinteria,3.0,1638,2658,933,2297,20142015201620172018201920202021,105,3.79


In [24]:
X_train, X_test, y_train, y_test = train_test_split(np.array(data.Review), np.array(data.Star), test_size=0.3, random_state=8)

# Modeling

## Baseline Model
In this model, we use the mean star of the training set to use as the prediction, regardless of the review text. In this way, we can create a baseline model on which further more complex model can improve on.

In [26]:
y_hat = y_train.mean()

In [28]:
y_hat

4.008039054416174

In [31]:
r2_score(y_test, [y_hat] * len(y_test))

-3.435187043443477e-05

In [33]:
mean_squared_error(y_test, [y_hat] * len(y_test))

1.134924954168833

## TF-IDF + Linear Regression with CV

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

In [39]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [43]:
m_lr = LinearRegression()
# params = {"knn__n_neighbors": [1, 3, 5, 7, 12, 15, 20]}
# grids = GridSearchCV(m_lr, params, cv=5)
# grids.fit(X_train, y_train)

In [44]:
m_lr.fit(X_train_tfidf, y_train)

In [45]:
y_hat = m_lr.predict(X_test_tfidf)

In [48]:
r2_score(y_test, y_hat)

-34.688474884191564

In [49]:
mean_squared_error(y_test, y_hat)

40.5023493908381

In [53]:
y_hat.mean()

3.979502951636717