In [1]:
import numpy
import urllib
import scipy.optimize
import random
from collections import defaultdict
import string
from sklearn import linear_model
import math

def parseData(fname):
  for l in open(fname):
    yield eval(l)

In [2]:
### Just the first 5000 reviews

print "Reading data..."
data = list(parseData("beer_50000.json"))[:5000]
print "done"

Reading data...
done


In [3]:
### Ignore capitalization and remove punctuation
unigram_count = defaultdict(int)
bigram_count = defaultdict(int)
hybrid_count = defaultdict(int)

punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    words = r.split()
    for w1, w2 in zip(words[:-1], words[1:]):
        unigram_count[w1] += 1
        bigram_count[w1 + ' ' + w2] += 1
    if len(words) > 0:
        unigram_count[words[-1]] += 1

hybrid_count.update(unigram_count)
hybrid_count.update(bigram_count)

# print len(unigram_count), "unique unigrams"
print len(bigram_count), "unique bigrams"
# print len(hybrid_count), "unique hybrids"

182246 unique bigrams


In [12]:
### Just take the most popular words...

bigram_counts = [(bigram_count[w], w) for w in bigram_count]
bigram_counts.sort()
bigram_counts.reverse()
bigrams = [x[1] for x in bigram_counts[:1000]]

print "Most popular bigrams"
print ', '.join([bigram + ': ' + str(count) for count, bigram in bigram_counts[:5]])

Most popular bigrams
with a: 4587, in the: 2595, of the: 2245, is a: 2056, on the: 2033


In [13]:
### Sentiment analysis

bigramId = dict(zip(bigrams, range(len(bigrams))))
bigramSet = set(bigrams)

def feature_bigrams(datum):
    feat = [0]*len(bigrams)
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w1, w2 in zip(r.split()[:-1], r.split()[1:]):
        if (w1 + ' ' + w2) in bigrams:
            feat[bigramId[w1 + ' ' + w2]] += 1
    feat.append(1) #offset
    return feat

X = [feature_bigrams(d) for d in data]
y = [d['review/overall'] for d in data]

#No regularization
#theta,residuals,rank,s = numpy.linalg.lstsq(X, y)

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
print numpy.dot(predictions - y, predictions - y) / len(predictions)

0.343153014061


In [14]:
hybrid_counts = [(hybrid_count[w], w) for w in hybrid_count]
hybrid_counts.sort()
hybrid_counts.reverse()
hybrids = [x[1] for x in hybrid_counts[:1000]]

print "Most popular unigrams + bigrams"
print hybrid_counts[:5]

Most popular unigrams + bigrams
[(30695, 'a'), (27569, 'the'), (19512, 'and'), (15935, 'of'), (12623, 'is')]


In [7]:
hybridId = dict(zip(hybrids, range(len(hybrids))))
hybridSet = set(hybrids)

def feature_hybrids(datum):
    feat = [0]*len(hybrids)
    r = ''.join([c for c in d['review/text'].lower() if not c in punctuation])
    for w1, w2 in zip(r.split()[:-1], r.split()[1:]):
        if w1 in hybrids:
            feat[hybridId[w1]] += 1
        if w2 in hybrids:
            feat[hybridId[w2]] += 1
        if (w1 + ' ' + w2) in hybrids:
            feat[hybridId[w1 + ' ' + w2]] += 1
    feat.append(1) #offset
    return feat

X = [feature_hybrids(d) for d in data]
y = [d['review/overall'] for d in data]

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)
print numpy.dot(predictions - y, predictions - y) / len(predictions)

0.289953747306


In [8]:
sentiments = sorted(zip(theta[:-1], hybrids))
sentiments[:5], sentiments[-5:]

([(-0.69265745727633066, 'sort of'),
  (-0.22848928122639411, 'the background'),
  (-0.17568277356502954, 'around the'),
  (-0.16972494240234998, 'down the'),
  (-0.14204940691157014, 'i will')],
 [(0.20527676503518508, 'the best'),
  (0.20745459134343039, 'not bad'),
  (0.21912455033569869, 'of these'),
  (0.23171208948268693, 'a bad'),
  (0.28954668154631069, 'sort')])

In [7]:
words = ["foam", "smell", "banana", "lactic", "tart"]

def remove_punct(review):
    return ''.join([c for c in review['review/text'].lower() if not c in string.punctuation]).split()

doc_count = defaultdict(int)
for r in data:
    for word in words:
        if word in remove_punct(r):
            doc_count[word] += 1

def idf(word):
    return math.log10(float(len(data)) / doc_count[word])

def tf_idf(word, review):
    freq = sum([1 for w in remove_punct(review) if w == word])
    return freq * idf(word)

print "Word\t\tIDF\t\tTF-IDF"
for word in words:
    print word, '\t', idf(word), '\t', tf_idf(word, data[0])

Word		IDF		TF-IDF
foam 	1.13786862069 	2.27573724137
smell 	0.537901618865 	0.537901618865
banana 	1.67778070527 	3.35556141053
lactic 	2.92081875395 	5.8416375079
tart 	1.80687540165 	1.80687540165


In [12]:
unigram_counts = [(unigram_count[w], w) for w in unigram_count]
unigram_counts.sort()
unigram_counts.reverse()
unigrams = [x[1] for x in unigram_counts[:1000]]

counter = 0

doc_count = defaultdict(int)

In [14]:
for r in data:
    for word in unigrams:
        if word in remove_punct(r):
            doc_count[word] += 1
    if counter % 200 == 0:
        print counter, "reviews processed"
    counter += 1

0 reviews processed
200 reviews processed
400 reviews processed
600 reviews processed
800 reviews processed
1000 reviews processed
1200 reviews processed
1400 reviews processed
1600 reviews processed
1800 reviews processed
2000 reviews processed
2200 reviews processed
2400 reviews processed
2600 reviews processed
2800 reviews processed
3000 reviews processed
3200 reviews processed
3400 reviews processed
3600 reviews processed
3800 reviews processed
4000 reviews processed
4200 reviews processed
4400 reviews processed
4600 reviews processed
4800 reviews processed


In [47]:
tf_idf_0 = map(lambda word: tf_idf(word, data[0]), unigrams)
tf_idf_1 = map(lambda word: tf_idf(word, data[1]), unigrams)
print "First/second review cosine similarity", numpy.dot(tf_idf_0, tf_idf_1) / (numpy.linalg.norm(tf_idf_0) * numpy.linalg.norm(tf_idf_1))

First/second review cosine similarity 0.106130241679


NameError: name 'tf_idf_X' is not defined

In [49]:
tf_idf_1

[0.01836925517636642,
 0.14454893431442267,
 0.13055704013620106,
 0.05158703422139899,
 0.0,
 0.062181315301644016,
 0.0,
 0.0,
 0.0,
 0.0,
 0.8741681387172515,
 0.1663430031071276,
 0.23957751657678797,
 0.0,
 0.0,
 0.0,
 0.8469884909932134,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.2914091548496563,
 0.0,
 0.0,
 0.0,
 1.7470251954384077,
 1.162920156096677,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.5512936800949202,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.46067293646062496,
 0.0,
 0.0,
 0.0,
 0.0,
 0.5379016188648442,
 0.0,
 0.0,
 0.0,
 0.0,
 0.6356366453842692,
 0.0,
 0.0,
 0.5556428743439723,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.3432407931225245,
 0.0,
 0.0,
 0.0,
 0.6315271615596382,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,


In [34]:
tf_idf_0 = map(lambda word: tf_idf(word, data[0]), unigrams)
closest_review = None
closest_similarity = -1

counter = 0

for d in data[1:]:
    tf_idf_curr = map(lambda word: tf_idf(word, d), unigrams)
    similarity = numpy.dot(tf_idf_curr, tf_idf_0) / (numpy.linalg.norm(tf_idf_curr) * numpy.linalg.norm(tf_idf_0))
    if similarity > closest_similarity:
        closest_review = d
        closest_similarity = similarity
    if counter % 200 == 0:
        print counter, "reviews processed"
    counter += 1
print closest_review, closest_similarity

0 reviews processed
200 reviews processed
400 reviews processed
600 reviews processed
800 reviews processed
1000 reviews processed
1200 reviews processed
1400 reviews processed
1600 reviews processed
1800 reviews processed
2000 reviews processed
2200 reviews processed
2400 reviews processed
2600 reviews processed
2800 reviews processed
3000 reviews processed
3200 reviews processed
3400 reviews processed




3600 reviews processed
3800 reviews processed
4000 reviews processed
4200 reviews processed
4400 reviews processed
4600 reviews processed
4800 reviews processed
{'beer/style': 'Pumpkin Ale', 'beer/ABV': 8.4, 'beer/beerId': '52211', 'review/timeStruct': {'wday': 0, 'isdst': 0, 'mday': 14, 'hour': 0, 'min': 24, 'sec': 50, 'year': 2011, 'yday': 318, 'mon': 11}, 'review/aroma': 5.0, 'review/appearance': 4.0, 'review/timeUnix': 1321230290, 'review/palate': 3.5, 'review/taste': 4.0, 'beer/name': "Frog's Hollow Double Pumpkin Ale", 'beer/brewerId': '14879', 'review/overall': 4.0, 'review/text': 'Poured from a 22oz bottle to a Dogfish Head Snifter.\t\tColor: Slight hazy orange with an off white head.\t\tSmell: Cinnamon, banana, pumpkin and nutmeg.\t\tTaste: Alcohol, pumpkin, nutmeg, allspice and a hint of banana.\t\tMouthfeel: Medium carbonation, smooth, medium dryness on the palate.\t\tOverall: The smell is GREAT! The banana was a huge surprise for me. The taste had too much alcohol presence.

In [35]:
# tf_idf_X = [[map(lambda word: tf_idf(word, d), unigrams) for word in unigrams] + [1] for d in data]

tf_idf_X = []
counter = 0

for d in data:
    tf_idf_X.append(map(lambda word: tf_idf(word, d), unigrams) + [1])
    if counter % 200 == 0:
        print counter, "reviews processed"
    counter += 1

0 reviews processed
200 reviews processed
400 reviews processed
600 reviews processed
800 reviews processed
1000 reviews processed
1200 reviews processed
1400 reviews processed
1600 reviews processed
1800 reviews processed
2000 reviews processed
2200 reviews processed
2400 reviews processed
2600 reviews processed
2800 reviews processed
3000 reviews processed
3200 reviews processed
3400 reviews processed
3600 reviews processed
3800 reviews processed
4000 reviews processed
4200 reviews processed
4400 reviews processed
4600 reviews processed
4800 reviews processed


In [36]:
len(tf_idf_X)

5000

In [43]:
y = [d['review/overall'] for d in data]
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(tf_idf_X, y)
theta = clf.coef_
predictions = clf.predict(tf_idf_X)
print numpy.dot(predictions - y, predictions - y) / len(predictions)

0.278759560078


In [19]:
len(predictions)

5000