In [31]:
import gzip
import random
import numpy as np
import string
import operator
from collections import defaultdict
from sklearn import svm

def readGz(f):
  for l in gzip.open(f):
    yield eval(l)

# Category prediction
data = []
labels = []

for l in readGz("train.json.gz"):
    data.append([l['reviewerID'], l['reviewText']]) # Feature matrix
    labels.append(l['categoryID'])

pairs = zip(data, labels)
random.seed(0)
random.shuffle(pairs)
data, labels = zip(*pairs)

train_data, train_labels = data[:len(data) / 2], labels[:len(labels) / 2]
valid_data, valid_labels = data[len(data) / 2:], labels[len(labels) / 2:]

In [32]:
category_freq = defaultdict(int)
user_category_freq = defaultdict(lambda : defaultdict(int))

for point, category in zip(train_data, train_labels):
    category_freq[category] += 1
    user_category_freq[point[0]][category] += 1

valid_accuracy = 0
for point, label in zip(valid_data, valid_labels):
    purchases = user_category_freq[point[0]]
    most_category, most_count = 0, 0
    for category in purchases.keys():
        if purchases[category] > most_count:
            most_category, most_count = category, purchases[category]
    valid_accuracy += 1 if most_category == label else 0
print valid_accuracy * 1.0 / len(valid_labels)

0.80077


In [33]:
word_count = defaultdict(int)
word_count_category = defaultdict(lambda : defaultdict(int))
punctuation = set(string.punctuation)
for text, category in zip([d[1] for d in train_data], train_labels):
    r = ''.join([c for c in text.lower() if not c in punctuation])
    for word in r.split():
        word_count[word] += 1
        word_count_category[category][word] += 1

top_words = sorted([(word_count[word], word) for word in word_count], reverse=True)[:500]
total = float(sum([w[0] for w in top_words]))
top_freq = [(w[0] / total, w[1]) for w in top_words]

top_words_category = defaultdict(list)
for count, word in top_words:
    for category in word_count_category:
        top_words_category[category].append((word_count_category[category][word], word))
total_category = {category:float(sum(zip(*top_words_category[category])[0])) for category in top_words_category}
top_freq_category = {category:[(count / total_category[category], word) for count, word in top_words_category[category]] for category in top_words_category}

more_freq = {category:sorted([(category_freq[0] - freq[0], freq[1]) for category_freq, freq in zip(top_freq_category[category], top_freq)], reverse=True)[:10] for category in top_freq_category}
print [(category, [pair[1] for pair in more_freq[category]]) for category in more_freq]

[(0, ['i', 'it', 'love', 'bra', 'wear', 'so', 'but', 'size', 'cute', 'was']), (1, ['he', 'watch', 'the', 'of', 'for', 'good', 'husband', 'you', 'these', 'his']), (2, ['she', 'her', 'daughter', 'for', 'it', 'my', 'old', 'loves', 'year', 'little']), (3, ['he', 'son', 'for', 'my', 'old', 'his', 'we', 'year', 'him', 'loves']), (4, ['these', 'are', 'for', 'they', 'her', 'cute', 'and', 'my', 'old', 'we'])]


In [34]:
train_pairs = [([word in point[1] for word in zip(*top_words)[1]], label) for point, label in zip(train_data[:5000], train_labels[:5000]) if label <= 1]
valid_pairs = [([word in point[1] for word in zip(*top_words)[1]], label) for point, label in zip(valid_data[:5000], valid_labels[:5000]) if label <= 1]

for reg in [0.01, 0.1, 1, 10, 100]:
    clf = svm.SVC(C=reg)
    clf.fit(zip(*train_pairs)[0], zip(*train_pairs)[1])
    predict_valid = clf.predict(zip(*valid_pairs)[0])
    print "Accuracy w/ lambda", reg, "=", len(filter(lambda pair: pair[0] == pair[1], zip(predict_valid, valid_labels))) / float(len(predict_valid))

Accuracy w/ lambda 0.01 = 0.705870151421
Accuracy w/ lambda 0.1 = 0.705870151421
Accuracy w/ lambda 1 = 0.705870151421
Accuracy w/ lambda 10 = 0.668533499274
Accuracy w/ lambda 100 = 0.630159717901


In [35]:
train_pairs = [([word in point[1] for word in zip(*top_words)[1]], label) for point, label in zip(train_data[:5000], train_labels[:5000])]
valid_pairs = [([word in point[1] for word in zip(*top_words)[1]], label) for point, label in zip(valid_data[:5000], valid_labels[:5000])]

best_lambda, best_accuracy = None, 0
for reg in [0.01, 0.1, 1, 10, 100]:
    clf = svm.LinearSVC(C=reg)
    clf.fit(zip(*train_pairs)[0], zip(*train_pairs)[1])
    scores = clf.decision_function(zip(*valid_pairs)[0])
    predict_valid = np.argmax(scores, axis=1)

    accuracy = len(filter(lambda pair: pair[0] == pair[1], zip(predict_valid, valid_labels))) / float(len(predict_valid))
    print "Accuracy w/ lambda", reg, "=", accuracy
    if accuracy > best_accuracy:
        best_lambda, best_accuracy = reg, accuracy

Accuracy w/ lambda 0.01 = 0.7534
Accuracy w/ lambda 0.1 = 0.7496
Accuracy w/ lambda 1 = 0.7376
Accuracy w/ lambda 10 = 0.731
Accuracy w/ lambda 100 = 0.6906


In [38]:
review = []
test_data = []
for l in readGz("test_Category.json.gz"):
    review.append([l['reviewerID'], l['reviewHash']])
    feature = [word in l['reviewText'] for word in zip(*top_words)[1]]
    test_data.append(feature)

clf = svm.LinearSVC(C=best_lambda)
clf.fit(zip(*train_pairs)[0], zip(*train_pairs)[1])
scores = clf.decision_function(test_data)
predict_test = np.argmax(scores, axis=1)

predictions = open("predictions_Category.txt", 'w')
predictions.write("userID-reviewHash,category\n")
for key, value in zip(review, predict_test):
    predictions.write(key[0] + '-' + key[1] + ',' + str(value) + '\n')
predictions.close()