In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the train and test review tables from CSV files in the working directory.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'yelp_academic_dataset_review_train.csv',
        'yelp_academic_dataset_review_test.csv',
    )
)

In [None]:
# Row counts of each split; train_length is the boundary used to split the
# stacked feature matrices back apart after vectorization.
train_length = len(data_train)
test_length = len(data_test)
# Review text of both splits stacked into one Series, train rows first.
all_reviews = pd.concat([data_train['text'], data_test['text']])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Bag-of-words term counts with scikit-learn's built-in English stop-word list.
# The vocabulary is learned from the train and test text together.
count_vect = CountVectorizer(stop_words='english')
X_counts = count_vect.fit_transform(all_reviews)

# TF-IDF reweighting of the same count matrix (IDF is likewise fitted on all rows).
# NOTE(review): the original author marked the TF-IDF matrices as "doesn't work"
# without recording the symptom; the classifier cell trains on raw counts instead.
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

# all_reviews lists the train rows first, so the first train_length rows of each
# matrix belong to the train split and the remainder to the test split.
train_rows = slice(None, train_length)
test_rows = slice(train_length, None)
X_train_counts, X_test_counts = X_counts[train_rows], X_counts[test_rows]
X_train_tfidf, X_test_tfidf = X_tfidf[train_rows], X_tfidf[test_rows]

In [None]:
# Random forest over bag-of-words counts (not TF-IDF), predicting the star rating
# of each review as a class label.
# random_state is fixed so a Restart & Run All reproduces the same forest, the
# same test predictions, and the same cross-validated predictions (which clone
# this estimator and therefore inherit the seed).
clf = RandomForestClassifier(verbose=2, random_state=42)
clf.fit(X_train_counts, data_train.stars)
preds = clf.predict(X_test_counts)
# Sanity check: average predicted star rating over all test reviews.
np.mean(preds)

In [None]:
# Out-of-fold star predictions for every training review, using 10-fold
# cross-validation with a fresh copy of the forest fitted per fold.
cv_preds = cross_val_predict(
    estimator=clf,
    X=X_train_counts,
    y=data_train.stars,
    cv=10,
)

In [None]:
# Attach the out-of-fold predictions to their reviews (positional: cv_preds has
# one entry per training row, in row order).
data_train['cv_preds'] = cv_preds
# Average true and predicted stars per business. Only the two numeric columns
# are aggregated: a whole-frame .mean() also hits the string columns (e.g.
# `text`), which raises TypeError on pandas >= 2.0.
grouped = data_train.groupby('business_id')[['stars', 'cv_preds']].mean()
# Business-level RMSE between mean true stars and mean cross-validated prediction.
np.sqrt(np.mean((grouped.stars - grouped.cv_preds)**2))

In [None]:
# Weight each review's predicted stars by (1 + useful votes), so a review with
# no useful votes still counts once.
data_test['weighted_pred'] = preds * (1 + data_test.useful)
# Group once and aggregate only the columns that are needed; summing the whole
# frame would also "sum" (concatenate) every review's text per business.
reviews_by_business = data_test.groupby('business_id')
# Total weight per business: sum of useful votes plus the number of reviews,
# i.e. the sum of (1 + useful) over that business's reviews.
multiplier = reviews_by_business.useful.sum() + reviews_by_business.useful.count()
# Usefulness-weighted mean prediction per business, indexed by business_id.
weighted_preds = reviews_by_business.weighted_pred.sum() / multiplier

In [None]:
# weighted_preds is indexed by business_id while data_test uses the default
# integer row index, so a plain column assignment would align on mismatched
# labels and fill the column with NaN. Look each review's business up instead.
data_test["weighted_preds"] = data_test["business_id"].map(weighted_preds)
# One prediction per business. Aggregate only this column: a whole-frame
# .mean() would hit the string columns and raise TypeError on pandas >= 2.0.
pred_per_buisiness = data_test.groupby("business_id")["weighted_preds"].mean()
# Businesses for which a prediction has to be submitted.
test_biz = pd.read_csv("yelp_academic_dataset_business_test.csv")

In [None]:
# Build the submission table: one row per business listed in test_biz, in that
# order, with its predicted star rating. A business without an entry in
# pred_per_buisiness raises KeyError.
business_ids = list(test_biz.business_id)
pred_df = pd.DataFrame(
    {
        "business_id": business_ids,
        "stars": [pred_per_buisiness[business_id] for business_id in business_ids],
    }
).set_index("business_id")
# Written as CSV to a file literally named "Submission" (no extension).
pred_df.to_csv("Submission")

In [None]:
# (column index, importance) pairs for every feature of the fitted forest.
importances = list(enumerate(clf.feature_importances_))
# Most important first; sorted() is stable, so ties keep ascending column order.
sorted_importances = sorted(importances, key=lambda pair: pair[1], reverse=True)
top_ten = [column for column, _ in sorted_importances[:10]]
# Invert the vectorizer's term -> column mapping to recover terms from columns.
feature_lookup = {column: term for term, column in count_vect.vocabulary_.items()}
# Displayed result: the ten terms with the highest feature importance.
[feature_lookup[column] for column in top_ten]