In [6]:
import gzip
import random
import numpy as np
import string
import operator
from collections import defaultdict
from sklearn import svm

def readGz(f):
  for l in gzip.open(f):
    yield eval(l)

In [15]:
# Category prediction
data = []
labels = []

for l in readGz("train.json.gz"):
    feature = [l['reviewerID'], l['reviewText'], l['summary']]
    feature.extend([0, l['price']] if 'price' in l else [1, 0])
    data.append(feature) # Feature matrix
    labels.append(l['categoryID'])

pairs = zip(data, labels)
random.seed(0)
random.shuffle(pairs)
data, labels = zip(*pairs)

train_data, train_labels = data[:len(data) / 2], labels[:len(labels) / 2]
valid_data, valid_labels = data[len(data) / 2:], labels[len(labels) / 2:]

In [8]:
def get_keywords(data, labels, index):
    word_count = defaultdict(int)
    word_count_category = defaultdict(lambda : defaultdict(int))
    punctuation = set(string.punctuation)
    for text, category in zip([d[index] for d in data], labels):
        r = ''.join([c if c not in punctuation else ' ' for c in text.lower()])
        for word in r.split():
            word_count[word] += 1
            word_count_category[category][word] += 1

    top_words = sorted([(word_count[word], word) for word in word_count], reverse=True)[:1000]
    total = float(sum([w[0] for w in top_words]))
    top_freq = [(w[0] / total, w[1]) for w in top_words]

    top_words_category = defaultdict(list)
    for count, word in top_words:
        for category in word_count_category:
            top_words_category[category].append((word_count_category[category][word], word))
    total_category = {category:float(sum(zip(*top_words_category[category])[0])) for category in top_words_category}
    top_freq_category = {category:[(count / total_category[category], word) for count, word in top_words_category[category]] for category in top_words_category}

    more_freq = {category:sorted([(category_freq[0] - freq[0], freq[1]) for category_freq, freq in zip(top_freq_category[category], top_freq)], reverse=True)[:100] for category in top_freq_category}
    return {category:set([pair[1] for pair in more_freq[category]]) for category in more_freq}

text_keywords = get_keywords(train_data, train_labels, 1)
summary_keywords = get_keywords(train_data, train_labels, 2)

In [9]:
user_category_counts = defaultdict(lambda: [0] * 5)
for data, label in zip(train_data, train_labels):
    user_category_counts[data[0]][label] += 1

def user_keyword_counts(keywords, index):
    counts = defaultdict(lambda: [0] * 5)
    for data, label in zip(train_data, train_labels):
        punctuation = set(string.punctuation)
        r = ''.join([c if c not in punctuation else ' ' for c in data[index].lower()])
        for word in r.split():
            for category in keywords:
                if word in keywords[category]:
                    counts[data[0]][category] += 1
    for user in counts:
        total = float(sum(counts[user]))
        for i in xrange(5):
            counts[user][i] /= total
    return counts

user_review_keyword_counts = user_keyword_counts(text_keywords, 1)
user_summary_keyword_counts = user_keyword_counts(summary_keywords, 2)

In [11]:
def feature(l, text_keywords, summary_keywords):
    vector = []
    vector.extend([sum([word in l[1] for word in text_keywords[category]]) for category in text_keywords])
    vector.extend([sum([word in l[2] for word in summary_keywords[category]]) for category in summary_keywords])
    vector.extend(([0] + user_category_counts[l[0]]) if l[0] in user_category_counts else [1, 0, 0, 0, 0, 0])
    vector.extend(([0] + user_review_keyword_counts[l[0]]) if l[0] in user_review_keyword_counts else [1, 0, 0, 0, 0, 0])
    vector.extend(([0] + user_summary_keyword_counts[l[0]]) if l[0] in user_summary_keyword_counts else [1, 0, 0, 0, 0, 0])
    vector.append(l[3])
    vector.append(l[4])
    return vector

train_features, valid_features = [], []

for l in train_data:
    train_features.append(feature(l, text_keywords, summary_keywords))

for l in valid_data:
    valid_features.append(feature(l, text_keywords, summary_keywords))

best_lambda, best_accuracy = None, 0
# for reg in [0.01, 0.1, 1, 10, 100]:
for reg in [0.01]:
    clf = svm.LinearSVC(C=reg)
    clf.fit(train_features, train_labels)
    scores = clf.decision_function(valid_features)
    predict_valid = np.argmax(scores, axis=1)

    accuracy = len(filter(lambda pair: pair[0] == pair[1], zip(predict_valid, valid_labels))) / float(len(predict_valid))
    print "Accuracy w/ lambda", reg, "=", accuracy
    if accuracy > best_accuracy:
        best_lambda, best_accuracy = reg, accuracy

Accuracy w/ lambda 0.01 = 0.81094


In [12]:
review = []
test_data = []
for l in readGz("test_Category.json.gz"):
    review.append([l['reviewerID'], l['reviewHash']])
    f = [l['reviewerID'], l['reviewText'], l['summary']]
    f.extend([0, l['price']] if 'price' in l else [1, 0])
    test_data.append(feature(f, text_keywords, summary_keywords))

clf = svm.LinearSVC(C=best_lambda)
clf.fit(train_features, train_labels)
scores = clf.decision_function(test_data)
predict_test = np.argmax(scores, axis=1)

predictions = open("predictions_Category.txt", 'w')
predictions.write("userID-reviewHash,category\n")
for key, value in zip(review, predict_test):
    predictions.write(key[0] + '-' + key[1] + ',' + str(value) + '\n')
predictions.close()