In [1]:
# Final report for challenge:
# https://docs.google.com/document/d/1zvWnFWvYVeoPrWs97bt5d9E2DJiMhW87jhdRkRYvi_E/edit#

In [None]:
%load_ext autoreload
%autoreload 2

from collections import defaultdict, Counter
import load_data
import json
import time
import math

from pymagnitude import Magnitude
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
import numpy as np
import business_classification
import rating_prediction

In [None]:
# Locations of the raw Yelp dataset and of the fastText working directory.
# NOTE(review): these are absolute, machine-specific paths; adjust before running elsewhere.
data_dir = "/Users/vijay/Documents/vijay/yelp-dataset"
reviews_file = "%s/yelp_academic_dataset_review.json" % data_dir
fasttext_data_dir = "/Users/vijay/Documents/vijay/datascience-challenge"

In [None]:
# Parse the reviews file (one JSON object per line) into a list of dicts.
# The context manager guarantees the file handle is closed once parsing is done,
# and blank lines (e.g. a trailing newline at the end of the file) are skipped
# instead of crashing json.loads.
with open(reviews_file) as reviews_fh:
    reviews = [json.loads(row) for row in reviews_fh if row.strip()]

In [None]:
## Rating Prediction

In [None]:
# Split the loaded reviews into a training set and a held-out test set using the
# project's load_data helper. Every rating-prediction cell below reads `train` / `test`.
# NOTE(review): the split ratio and any shuffling are decided inside load_data.train_test_split.
train, test = load_data.train_test_split(reviews)

In [None]:
# Drop the name bound to the full review list now that the train/test split exists.
# NOTE(review): after this point `reviews` is undefined; any later cell that needs the
# raw reviews has to reload them from `reviews_file`.
del reviews

In [None]:
# Write the train/test splits to disk in the format fastText's supervised mode expects,
# plus a label-only file for the test set that the evaluation cells read back later.
train_out = fasttext_data_dir + "/train.fasttext"
load_data.preprocess_for_fasttext(train, train_out)

test_out = fasttext_data_dir + "/test.fasttext"
load_data.preprocess_for_fasttext(test, test_out)
rating_prediction.prepare_fast_text_data(fasttext_data_dir, train, test)

# True test labels, one per line. The file must be opened for writing ("w"); the context
# manager closes it even if a review is malformed.
test_labels_out = fasttext_data_dir + "/test.fasttext.label_only"
with open(test_labels_out, "w") as test_labels_only:
    for review in test:
        # fast text expects the label formatted as __label__1 for label 1
        test_labels_only.write("__label__%s\n" % review['stars'])

print("now `cd %s`\n" % fasttext_data_dir)
print("and then run `../fastText-0.1.0/fasttext supervised -input train.fasttext -output model` to train the model")
print("and `../fastText-0.1.0/fasttext predict model.bin test.fasttext > test.predictions` to write predictions to file")


In [None]:
# Baseline A: predict the most frequent label.
train_labels = [review['stars'] for review in train]
actual_labels = [review['stars'] for review in test]
# Array view of the test labels so the element-wise comparisons below are explicit.
actual_labels_arr = np.asarray(actual_labels)

# What if we just chose the most frequently seen rating every time?
# The rating is picked from the TRAINING labels so the baseline never peeks at the test set.
top_labels = Counter(train_labels)
most_frequent_label = top_labels.most_common(1)[0][0]
choose_most_frequent = np.repeat(most_frequent_label, len(actual_labels))
print("Test-set accuracy of blindly choosing the most frequent rating (which is %s): %s" %
      (most_frequent_label, np.mean(choose_most_frequent == actual_labels_arr)))
# 0.441 (recorded from an earlier run)

print("Test-set RMSE of blindly choosing the most frequent rating (which is %s): %s" %
      (most_frequent_label, rating_prediction.rmse(choose_most_frequent, actual_labels)))

# Second constant baseline: always predict the middle rating.
threes = np.repeat(3, len(actual_labels))
print("Test-set accuracy of blindly choosing rating 3: %s" % np.mean(threes == actual_labels_arr))
# NOTE(review): the notebook previously recorded 0.441 here as well, which looks copy-pasted
# from the most-frequent baseline above; re-run to confirm the real value.

print("Test-set RMSE of blindly choosing rating 3: %s" % rating_prediction.rmse(threes, actual_labels))

In [None]:
# Second approach: native Fasttext classification.
# Scores the predictions fastText wrote to disk against the true test labels.

fasttext_data_dir = "/Users/vijay/Documents/vijay/datascience-challenge"
predicted_labels = "%s/test_predicted_fasttext5" % fasttext_data_dir
test_labels_fname = "%s/test.fasttext.label_only" % fasttext_data_dir

# The project helper returns an (accuracy, rmse) pair for the two files.
acc, rmse = rating_prediction.evaluate_fasttext_predictions(
    predicted_labels,
    test_labels_fname,
)
print("accuracy:", acc)
# 0.6935579079926016
print("rmse:", rmse)
# 0.7518566049250488

In [None]:
# Cross-check of the fastText accuracy: parse the predicted labels by hand and compare
# them with the star ratings of the in-memory test set.
label_prefix = "__label__"
remove_len = len(label_prefix)

# Each non-empty line looks like "__label__<rating>"; strip the prefix and parse the rating.
# The context manager closes the predictions file, and blank lines are skipped so a
# trailing empty line does not crash int().
with open(predicted_labels) as predictions_fh:
    predicted = [int(label_row[remove_len:]) for label_row in predictions_fh if label_row.strip()]

test_labels = [row['stars'] for row in test]

# An element-wise comparison is only meaningful when both sequences line up one-to-one.
assert len(predicted) == len(test_labels), (
    "got %s predictions for %s test reviews" % (len(predicted), len(test_labels)))

print(np.mean(np.asarray(predicted) == np.asarray(test_labels)))

In [None]:
# Build one feature vector per review (the fasttext word embeddings of its tokens, averaged)
# and persist the features to disk, once for the train split and once for the test split.
vecs_dir = "/Users/vijay/Documents/vijay/datascience-challenge/vecs"

for data_prefix, split_reviews in (("train", train), ("test", test)):
    rating_prediction.compute_word_embeddings(
        split_reviews,
        vecs_dir,
        data_prefix=data_prefix,
        batch_write_size=10000,
    )

In [None]:
# Load the pre-computed review features/labels from disk and drop rows containing NaNs.
vecs_dir = "/Users/vijay/Documents/vijay/datascience-challenge/vecs"


def _drop_nan_rows(features, labels):
    """Return (features, labels) without the rows whose feature vector contains a NaN.

    `features` is a 2-D array with one row per review and `labels` is the matching
    1-D array; the same rows are removed from both so they stay aligned.
    """
    nan_rows = np.isnan(features).any(axis=1)
    return features[~nan_rows], labels[~nan_rows]


# np.loadtxt accepts a path directly and then opens/closes the file itself, so no
# file handle is left dangling.
train_features = np.loadtxt(vecs_dir + "/train_features")
train_labels = np.loadtxt(vecs_dir + "/train_labels")
test_features = np.loadtxt(vecs_dir + "/test_features")
test_labels = np.loadtxt(vecs_dir + "/test_labels")

# A very small number of reviews resulted in word-embedding matrices with NaN values
# I investigated these reviews on a case-by-case basis and couldn't identify any obvious issues in the raw text,
# so we just ignore these rows from the train and test sets here.
X, y = _drop_nan_rows(train_features, train_labels)
X_test, y_test = _drop_nan_rows(test_features, test_labels)

In [None]:
# Fit an ordinary least-squares regressor on the review features, then score it both as a
# classifier (predictions rounded to discrete ratings) and as a regressor (raw predictions).
linear = LinearRegression()
linear.fit(X, y)

linear_predictions = np.asarray(linear.predict(X_test))
linear_discretized = np.asarray(rating_prediction.round_regressor_predictions(linear_predictions))
print("linear regression - classification accuracy:", np.mean(linear_discretized == y_test))
# 0.365
# rmse() is used as-is, the same way the baseline cell uses it; it must not be divided by
# the number of samples again.
print("linear regression - classification RMSE:", rating_prediction.rmse(linear_discretized, y_test))
# 1.055
print("linear regression - regression RMSE:", rating_prediction.rmse(linear_predictions, y_test))
# 1.110

In [None]:
# Fit a logistic-regression classifier on the same review features and score it on the test set.
logr = LogisticRegression()
logr.fit(X, y)

logr_predictions = logr.predict(X_test)
print("logistic regression - classification accuracy:", np.mean(logr_predictions == y_test))
# 0.630
# rmse() is used as-is, the same way the baseline cell uses it; it must not be divided by
# the number of samples again.
print("logistic regression - classification RMSE", rating_prediction.rmse(logr_predictions, y_test))
# 1.058

In [None]:
## Business Classification

In [None]:
# Load review data, keep only the 100 most-reviewed businesses, and write the train/test
# splits to disk in the FastText format. Training and prediction then happen on the
# command line (see the instructions printed at the end of this cell).


def _iter_reviews(path):
    """Yield one parsed review dict per non-empty line of the JSON-lines file at `path`."""
    with open(path) as reviews_fh:
        for row in reviews_fh:
            if row.strip():
                yield json.loads(row)


# `reviews` was deleted after the rating-prediction split, so the reviews are streamed from
# disk again: a first pass counts reviews per business, a second pass keeps only the reviews
# of the top businesses. The full review list is never held in memory here.
business_ids = [review["business_id"] for review in _iter_reviews(reviews_file)]

# get the top 100 most frequently-rated businesses
top_businesses = Counter(business_ids)
top_100 = dict(top_businesses.most_common(100))

# Kept for backward compatibility with the original cell, which also left this mapping
# empty; nothing in this cell populates it.
reviews_by_business = defaultdict(list)
reviews_top_businesses = [
    review for review in _iter_reviews(reviews_file)
    if review["business_id"] in top_100
]

train_business_reviews, test_business_reviews = load_data.train_test_split(reviews_top_businesses)
fasttext_data_dir = "/Users/vijay/Documents/vijay/datascience-challenge"
load_data.preprocess_for_fasttext(train_business_reviews, fasttext_data_dir + "/train.businesses.fasttext", label_key = 'business_id')
load_data.preprocess_for_fasttext(test_business_reviews, fasttext_data_dir + "/test.businesses.fasttext", label_key = 'business_id')

# True test labels, one per line. The file must be opened for writing ("w"); the context
# manager closes it even if a review is malformed.
test_labels_out = fasttext_data_dir + "/test.business.fasttext.label_only"
with open(test_labels_out, "w") as test_labels_only:
    for review in test_business_reviews:
        # fast text expects the label formatted as __label__1 for label 1
        test_labels_only.write("__label__%s\n" % review['business_id'])

print("now run `../fastText-0.1.0/fasttext supervised -input train.businesses.fasttext -output model.businesses -pretrainedVectors model.vec`.")
print("to train the model predicting the business id given review text, then:")
print("../fastText-0.1.0/fasttext predict model.businesses.bin test.businesses.fasttext > train.business.predictions to make predictions against the test data.")
print("True test labels were written to %s" % (fasttext_data_dir + "/test.business.fasttext.label_only"))

In [None]:
# Evaluating multi-review (e.g. 1, 5, 10 reviews) group predictions


def _read_lines(path):
    """Return the lines of the text file at `path` as a numpy array, closing the file afterwards."""
    with open(path) as lines_fh:
        return np.asarray(lines_fh.readlines())


def _evaluate_groups(groups, group_size, fallback):
    """Predict one business per review group, print the accuracy, and return the results.

    Returns (predicted, actual, accuracy), where predicted/actual are numpy arrays with one
    entry per group and accuracy is the fraction of groups predicted correctly.
    """
    predicted, actual = business_classification.make_group_predictions(
        groups, predicted_businesses, fallback=fallback)
    # Coerce to arrays so `==` is always element-wise, whatever the helper returns.
    predicted, actual = np.asarray(predicted), np.asarray(actual)
    accuracy = np.mean(predicted == actual)
    print("Test-set accuracy of FastText business-prediction model, voting over groups of %s: %s over %s test groups"
          % (group_size, accuracy, len(actual)))
    return predicted, actual, accuracy


# Both files live in the fastText working directory; reading them via an absolute path
# keeps this cell independent of the notebook's current working directory.
actual_businesses = _read_lines(fasttext_data_dir + "/test.business.fasttext.label_only")
actual_business_ids = np.asarray([load_data.trim_label(label) for label in actual_businesses])

# Baseline: what if we just chose the most frequently seen business every time?
most_frequent_label = top_businesses.most_common(1)[0][0]
choose_most_frequent = np.repeat(most_frequent_label, len(actual_business_ids))
print("Test-set accuracy of blindly choosing the most frequent of the 100 businessses:",
      np.mean(choose_most_frequent == actual_business_ids))
# 0.0297 accuracy on test set (while picking a random label would give a true accuracy of 0.01)

# Fasttext model
predicted_businesses = _read_lines(fasttext_data_dir + "/train.business.predictions")

# Evaluating group predictions
test_groups_of_1 = business_classification.group_reviews(test_business_reviews, group_size=1)
predicted, actual, group_accuracy = _evaluate_groups(test_groups_of_1, 1, fallback=None)
# 0.593 accuracy on test set

test_groups_of_5 = business_classification.group_reviews(test_business_reviews, group_size=5)
predicted, actual, group_accuracy = _evaluate_groups(test_groups_of_5, 5, fallback=most_frequent_label)
# 0.645 accuracy on test set

test_groups_of_10 = business_classification.group_reviews(test_business_reviews, group_size=10)
predicted, actual, group_accuracy = _evaluate_groups(test_groups_of_10, 10, fallback=most_frequent_label)
# 0.827 accuracy on test set

In [None]:
# Experimental
# Nearest-neighbor search