In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model

In [2]:
def readGz(path):
    """Yield one evaluated Python object per line of a gzip-compressed text file.

    Parameters
    ----------
    path : str
        Location of the gzip file; it is opened in text mode.

    Yields
    ------
    object
        The result of evaluating each line as a Python expression.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed, instead of leaving it to the garbage collector.
    with gzip.open(path, 'rt') as f:
        for l in f:
            # NOTE(review): eval() executes whatever expression the line holds.
            # Only use this on trusted files; consider ast.literal_eval if the
            # lines are plain literals.
            yield eval(l)
        
def readJSON(path):
    """Yield ``(rating, text)`` pairs from a gzip-compressed review file.

    The first line of the file is read and discarded. Every following line is
    evaluated into a record; records lacking either the ``'review/text'`` or
    the ``'review/overall'`` key are skipped.

    Parameters
    ----------
    path : str
        Location of the gzip file; it is opened in text mode.

    Yields
    ------
    tuple of (float, str)
        The overall rating converted from its ``"X/Y"`` string form to the
        fraction ``X / Y``, and the review text.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed, instead of leaving it to the garbage collector.
    with gzip.open(path, 'rt') as f:
        # Discard the first line unparsed (presumably a header -- confirm
        # against the data file).
        f.readline()
        for l in f:
            # NOTE(review): eval() executes whatever expression the line holds.
            # Only use this on trusted files; consider ast.literal_eval if the
            # lines are plain literals.
            d = eval(l)

            # Check the parsed record, not the raw line: a substring test on
            # the line also matches when the key name merely appears inside
            # the review text, which would raise KeyError below.
            if 'review/text' not in d or 'review/overall' not in d:
                continue

            # rating takes the form of XX/20
            parts = d['review/overall'].split('/')
            rating = float(parts[0]) / float(parts[1])

            yield rating, d['review/text']
        

# Materialize every (rating, text) pair produced by readJSON into a list.
dataset = list(readJSON("ratebeer.json.gz"))
        


In [3]:
num_review = len(dataset)

# Slice boundaries: training ends at 90% of the data, validation at 95%.
train_set_size, valid_set_size = int(0.9*num_review), int(0.95*num_review)

# Contiguous slices taken in file order (no shuffling):
# first 90% -> training, next 5% -> validation, remaining 5% -> testing.
train_set = dataset[:train_set_size]
valid_set = dataset[train_set_size:valid_set_size]
test_set = dataset[valid_set_size:]

print("Number of reviews in the dataset: ", num_review)
print("Number of reviews in the training set: ", len(train_set))
print("Number of reviews in the validation set: ", len(valid_set))
print("Number of reviews in the test set: ", len(test_set))

Number of reviews in the dataset:  2924162
Number of reviews in the training set:  2631745
Number of reviews in the validation set:  146208
Number of reviews in the test set:  146209


In [4]:
def getFeature(train_set, valid_set, test_set, feat_func):
    """Build feature lists and label lists for the three data splits.

    Parameters
    ----------
    train_set, valid_set, test_set : sequences of data points
        Each data point is indexable; element 0 is used as the label.
    feat_func : callable
        Maps one data point to its feature representation.

    Returns
    -------
    tuple
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)``, each a list.
    """
    splits = (train_set, valid_set, test_set)
    # Features are computed for every split before any labels are extracted.
    features = [list(map(feat_func, split)) for split in splits]
    labels = [[datum[0] for datum in split] for split in splits]
    return (*features, *labels)

In [5]:
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    The two inputs are paired element by element with ``zip``; the squared
    differences of the pairs are averaged.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        residual = predicted - actual
        squared_errors.append(residual ** 2)
    return sum(squared_errors) / len(squared_errors)

$\textbf{Explore the dataset}$

In [6]:
# Show the first record: its rating followed by its review text.
sample_rating, sample_text = dataset[0]
print(sample_rating, sample_text)

0.65 On tap at the John Harvards in Springfield PA.  Pours a ruby red amber with a medium off whie creamy head that left light lacing.  Aroma of orange and various other citrus.  A little light for what I was expecting from this beers aroma...expecting more from the Simcoe.  Flavor of pine, orange, grapefruit and some malt balance.  Very light bitterness for the 80+ IBUs they said this one had.


In [7]:
# Total number of (rating, text) pairs loaded into `dataset`.
len(dataset)

2924162

$\textbf{Baseline model}$  
For the baseline model, we predict the rating of a review from its length. The model is trained with linear regression (scikit-learn's `LinearRegression`). We use the mean squared error (MSE) as the loss function.

In [8]:
# Length (in characters) of the longest review text in the dataset;
# stays 0 when the dataset is empty.
max_review_length = 0
for d in dataset:
    review_length = len(d[1])
    if review_length > max_review_length:
        max_review_length = review_length
print(max_review_length)

11455


In [9]:
def feature_baseline(datum):
    """Return the baseline feature vector ``[1, normalized_length]``.

    ``datum[1]`` is the review text; its length is divided by the
    module-level ``max_review_length``. The leading 1 is a constant feature.
    """
    normalized_length = len(datum[1]) / max_review_length
    return [1, normalized_length]

# Build baseline features and labels for all three splits in one call.
(
    X_train_baseline,
    X_valid_baseline,
    X_test_baseline,
    y_train_baseline,
    y_valid_baseline,
    y_test_baseline,
) = getFeature(train_set, valid_set, test_set, feature_baseline)

# Inspect the feature vector of the first training example.
print(X_train_baseline[0])


[1, 0.0342208642514186]


In [10]:
# Fit a linear regression on the baseline (review-length) features.
model_baseline = linear_model.LinearRegression()
model_baseline.fit(X_train_baseline, y_train_baseline)

# Predict on both held-out splits, then score each with the MSE.
predictions_baseline_valid = model_baseline.predict(X_valid_baseline)
predictions_baseline_test = model_baseline.predict(X_test_baseline)
mse_baseline_valid = MSE(predictions_baseline_valid, y_valid_baseline)
mse_baseline_test = MSE(predictions_baseline_test, y_test_baseline)

print("MSE on validation set: ", mse_baseline_valid)

# Reference point 1: an independent uniform random guess in [0, 1) per review.
random_predictions = [random.random() for _ in valid_set]
mse_random = MSE(random_predictions, y_valid_baseline)
print("MSE on validation set (random): ", mse_random)

# Reference point 2: the constant prediction 0.5 for every review.
average_predictions = [0.5] * len(valid_set)
mse_average = MSE(average_predictions, y_valid_baseline)
print("MSE on validation set (average): ", mse_average)

print("MSE on test set: ", mse_baseline_test)

MSE on validation set:  0.021831926824040445
MSE on validation set (random):  0.13312394961353366
MSE on validation set (average):  0.04983475596412937
MSE on test set:  0.026860493071557183


$\textbf{Advanced model}$  
Our baseline model makes predictions based on the review length alone. It is better than predicting a random number between 0 and 1 or always predicting 0.5. However, it is still rather naive. We will try to improve the model by analyzing the review text: we want to understand each word in the review, for example how the presence of a word affects the rating. We will use a word embedding technique to convert the review text into a vector, and then use that vector as the input of the model. We will still use linear regression as the model and the mean squared error as the loss function.

In [12]:
import gzip
import math
import matplotlib.pyplot as plt
import numpy
import random
import sklearn
import string
from collections import defaultdict
from gensim.models import Word2Vec
from nltk.stem.porter import *
from sklearn import linear_model
from sklearn.manifold import TSNE

In [16]:
# Bag of words: count stemmed, lower-cased, punctuation-free tokens over the
# first 10,000 training reviews.
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in train_set[:10000]:
    # Punctuation characters are deleted outright (not replaced by spaces).
    r = ''.join(c for c in d[1].lower() if c not in punctuation)
    for w in map(stemmer.stem, r.split()):
        wordCount[w] += 1

len(wordCount)

17165

In [18]:
# NOTE(review): this prints every entry of `wordCount` (stem -> count).
# Consider displaying only the most frequent stems to keep the output readable.
print(wordCount)

defaultdict(<class 'int'>, {'on': 2527, 'tap': 425, 'at': 2067, 'the': 18049, 'john': 8, 'harvard': 3, 'in': 5123, 'springfield': 14, 'pa': 29, 'pour': 3651, 'a': 22424, 'rubi': 236, 'red': 956, 'amber': 1064, 'with': 13268, 'medium': 1962, 'off': 647, 'whie': 1, 'creami': 1170, 'head': 6775, 'that': 3355, 'left': 173, 'light': 3395, 'lace': 1189, 'aroma': 6247, 'of': 12333, 'orang': 783, 'and': 18671, 'variou': 17, 'other': 639, 'citru': 685, 'littl': 1486, 'for': 2091, 'what': 464, 'i': 5430, 'wa': 2729, 'expect': 455, 'from': 1578, 'thi': 5064, 'beer': 5229, 'aromaexpect': 1, 'more': 1446, 'simco': 2, 'flavor': 4806, 'pine': 184, 'grapefruit': 171, 'some': 3483, 'malt': 4148, 'balanc': 1016, 'veri': 4407, 'bitter': 2638, '80': 6, 'ibu': 12, 'they': 218, 'said': 78, 'one': 1591, 'had': 1155, 'updat': 934, 'feb': 91, '19': 39, '2003': 125, 'ive': 363, 'never': 158, 'budvar': 1, 'cristal': 2, 'but': 4772, 'is': 10228, 'exactli': 27, 'imagin': 32, 'it': 6439, 'to': 6030, 'be': 1200, 'cl