In [1]:
import numpy
import urllib
import scipy.optimize
import random
from collections import defaultdict
import string
from sklearn import linear_model
import math

def parseData(fname):
  for l in open(fname):
    yield eval(l)

In [2]:
### Just the first 5000 reviews

print "Reading data..."
data = list(parseData("beer_50000.json"))[:5000]
print "done"

Reading data...
done


In [3]:
### Ignore capitalization and remove punctuation
unigram_count = defaultdict(int)
bigram_count = defaultdict(int)
hybrid_count = defaultdict(int)

punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    words = r.split()
    for w1, w2 in zip(words[:-1], words[1:]):
        unigram_count[w1] += 1
        bigram_count[w1 + ' ' + w2] += 1
    if len(words) > 0:
        unigram_count[words[-1]] += 1

hybrid_count.update(unigram_count)
hybrid_count.update(bigram_count)

# print len(unigram_count), "unique unigrams"
print len(bigram_count), "unique bigrams"
# print len(hybrid_count), "unique hybrids"

182246 unique bigrams


In [10]:
### Just take the most popular words...

bigram_counts = [(bigram_count[w], w) for w in bigram_count]
bigram_counts.sort()
bigram_counts.reverse()
bigrams = [x[1] for x in bigram_counts[:1000]]

print "Most popular bigrams"
print ', '.join([bigram + ': ' + str(count) for count, bigram in bigram_counts[:5]])

Most popular bigrams
with a: 4587, in the: 2595, of the: 2245, is a: 2056, on the: 2033


In [11]:
### Sentiment analysis

bigramId = dict(zip(bigrams, range(len(bigrams))))
bigramSet = set(bigrams)

def feature_bigrams(datum):
    feat = [0]*len(bigrams)
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w1, w2 in zip(r.split()[:-1], r.split()[1:]):
        if (w1 + ' ' + w2) in bigrams:
            feat[bigramId[w1 + ' ' + w2]] += 1
    feat.append(1) #offset
    return feat

X = [feature_bigrams(d) for d in data]
y = [d['review/overall'] for d in data]

#No regularization
#theta,residuals,rank,s = numpy.linalg.lstsq(X, y)

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
print numpy.dot(predictions - y, predictions - y)

1715.76507031


In [15]:
hybrid_counts = [(hybrid_count[w], w) for w in hybrid_count]
hybrid_counts.sort()
hybrid_counts.reverse()
hybrids = [x[1] for x in hybrid_counts[:1000]]

print "Most popular unigrams + bigrams"
print hybrid_counts[:5]

Most popular unigrams + bigrams
[(30695, 'a'), (27569, 'the'), (19512, 'and'), (15935, 'of'), (12623, 'is')]


In [7]:
hybridId = dict(zip(hybrids, range(len(hybrids))))
hybridSet = set(hybrids)

def feature_hybrids(datum):
    feat = [0]*len(hybrids)
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w1, w2 in zip(r.split()[:-1], r.split()[1:]):
        if w1 in hybrids:
            feat[hybridId[w1]] += 1
        if w2 in hybrids:
            feat[hybridId[w2]] += 1
        if (w1 + ' ' + w2) in hybrids:
            feat[hybridId[w1 + ' ' + w2]] += 1
    feat.append(1) #offset
    return feat

X = [feature_hybrids(d) for d in data]
y = [d['review/overall'] for d in data]

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
print numpy.dot(predictions - y, predictions - y)

1449.76873653


In [8]:
sentiments = sorted(zip(theta[:-1], hybrids))
sentiments[:5], sentiments[-5:]

([(-0.69265745727633066, 'sort of'),
  (-0.22848928122639411, 'the background'),
  (-0.17568277356502954, 'around the'),
  (-0.16972494240234998, 'down the'),
  (-0.14204940691157014, 'i will')],
 [(0.20527676503518508, 'the best'),
  (0.20745459134343039, 'not bad'),
  (0.21912455033569869, 'of these'),
  (0.23171208948268693, 'a bad'),
  (0.28954668154631069, 'sort')])

In [6]:
words = ["foam", "smell", "banana", "lactic", "tart"]

def remove_punct(review):
    return ''.join([c for c in review['review/text'].lower() if not c in string.punctuation]).split()

def idf(word):
    return math.log10(float(len(data)) / unigram_count[word]) if unigram_count[word] > 0 else 0

def tf_idf(word, review):
    freq = sum([1 for w in remove_punct(review) if w == word])
    return freq * idf(word)

print "Word\t\tIDF\t\tTF-IDF"
for word in words:
    print word, '\t', idf(word), '\t', tf_idf(word, data[0])

Word		IDF		TF-IDF
foam 	1.10902040301 	2.21804080602
smell 	0.450751443146 	0.450751443146
banana 	1.51999305704 	3.03998611409
lactic 	2.7447274949 	5.48945498979
tart 	1.71219827007 	1.71219827007


In [5]:
unigram_counts = [(unigram_count[w], w) for w in unigram_count]
unigram_counts.sort()
unigram_counts.reverse()
unigrams = [x[1] for x in unigram_counts[:1000]]

In [7]:
tf_idf_0 = map(lambda word: tf_idf(word, data[0]), unigrams)
tf_idf_1 = map(lambda word: tf_idf(word, data[1]), unigrams)
print "First/second review cosine similarity", numpy.dot(tf_idf_0, tf_idf_1) / (numpy.linalg.norm(tf_idf_0) * numpy.linalg.norm(tf_idf_1))

First/second review cosine similarity 0.400835532702


In [12]:
tf_idf_0 = map(lambda word: tf_idf(word, data[0]), unigrams)
closest_review = None
closest_similarity = -1

counter = 0

for d in data[1:]:
    tf_idf_curr = map(lambda word: tf_idf(word, d), unigrams)
    similarity = numpy.dot(tf_idf_curr, tf_idf_0) / (numpy.linalg.norm(tf_idf_curr) * numpy.linalg.norm(tf_idf_0))
    if similarity > closest_similarity:
        closest_review = d
        closest_similarity = similarity
    if counter % 1000 == 0:
        print counter, "reviews processed"
    counter += 1
print closest_review, closest_similarity

0 reviews processed
{'beer/style': 'Herbed / Spiced Beer', 'beer/ABV': 4.7, 'beer/beerId': '52159', 'review/timeStruct': {'wday': 0, 'isdst': 0, 'mday': 2, 'hour': 17, 'min': 17, 'sec': 39, 'year': 2012, 'yday': 2, 'mon': 1}, 'review/aroma': 3.5, 'review/appearance': 3.5, 'review/timeUnix': 1325524659, 'review/palate': 3.0, 'review/taste': 3.5, 'beer/name': 'Caldera Ginger Beer', 'user/gender': 'Male', 'beer/brewerId': '1075', 'review/overall': 3.0, 'review/text': "Poured from the bottle into a Chimay goblet.\t\tAppearance: Pours a slightly cloudy yellow/orange color with a half finger of fluffy white head. The head fades to a small layer on top of the pour.\t\tSmell: Very light and crisp. I'm definitely picking up the ginger, but it's not overly powerful. There is a slight sweetness from the malt as well.\t\tTaste: Very light and refreshing. The ginger shows up right away and then fades towards the finish of the sip. The finish is malty and bread like. \t\tMouthfeel: The body is on th

In [15]:
# tf_idf_X = [[map(lambda word: tf_idf(word, d), unigrams) for word in unigrams] + [1] for d in data]

tf_idf_X = []
counter = 0

for d in data:
    tf_idf_X.append(map(lambda word: tf_idf(word, d), unigrams) + [1])
    if counter % 200 == 0:
        print counter, "reviews processed"
    counter += 1

0 reviews processed
200 reviews processed
400 reviews processed
600 reviews processed
800 reviews processed
1000 reviews processed
1200 reviews processed
1400 reviews processed
1600 reviews processed
1800 reviews processed
2000 reviews processed
2200 reviews processed
2400 reviews processed
2600 reviews processed
2800 reviews processed
3000 reviews processed
3200 reviews processed
3400 reviews processed
3600 reviews processed
3800 reviews processed
4000 reviews processed
4200 reviews processed
4400 reviews processed
4600 reviews processed
4800 reviews processed


In [16]:
len(tf_idf_X[0])

1001

In [17]:
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(tf_idf_X, y)
theta = clf.coef_
predictions = clf.predict(tf_idf_X)
print numpy.dot(predictions - y, predictions - y)

1395.26732133
