In [None]:
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Load the training split of the Yelp reviews from a CSV file in the working directory.
yelp_review_train = pd.read_csv("yelp_academic_dataset_review_train.csv")

In [None]:
# Load the test split of the Yelp reviews from a CSV file in the working directory.
yelp_review_test = pd.read_csv("yelp_academic_dataset_review_test.csv")

In [None]:
# Stack the train and test reviews (train rows first) into one frame.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0;
# `pd.concat` is the supported equivalent and, like `append`, keeps each
# frame's original index labels.
yelp_review_all = pd.concat([yelp_review_train, yelp_review_test])

In [None]:
# Show the combined train + test frame as the cell output.
yelp_review_all

In [None]:
# Bag-of-words extractor: lower-cased word n-grams of length 1 to 3,
# capped at 1500 features, with strict handling of decode errors.
cv = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    lowercase=True,
    max_features=1500,
    decode_error='strict',
)

In [None]:
# Learn the vocabulary from the `text` column of the combined (train + test) frame.
cv.fit(yelp_review_all.text)

In [None]:
# Show the fitted `vocabulary_` attribute as the cell output.
cv.vocabulary_

In [None]:
# Vectorize every review of the combined frame with the fitted vectorizer.
term_doc_matrix = cv.transform(yelp_review_all.text)

In [None]:
# Keep the term-document matrix sparse (CSR) rather than calling `.todense()`:
#  * a dense copy allocates n_reviews x n_features cells, while bag-of-words
#    counts are typically almost all zeros;
#  * `.todense()` returns the legacy `np.matrix` type;
#  * a densifying cell cannot be re-run, because its dense result has no
#    `.todense()` method, whereas `.tocsr()` is safe to run repeatedly.
# The CSR result still supports `.shape`, row slicing and
# `sparse.csr_matrix(...)`, which is what the following cells use.
term_doc_matrix = term_doc_matrix.tocsr()

In [None]:
# Show the (rows, columns) shape of the combined term-document matrix.
term_doc_matrix.shape

In [None]:
# Number of training reviews; used below as the row offset where test rows begin.
train_row_count = len(yelp_review_train)

In [None]:
# The combined matrix has the training rows first, so split it at `train_row_count`.
train_mat, test_mat = (
    term_doc_matrix[:train_row_count, :],
    term_doc_matrix[train_row_count:, :],
)

In [None]:
from scipy import sparse

In [None]:
# Show the shape of the training part of the term-document matrix.
train_mat.shape

In [None]:
# Show the shape of the test part of the term-document matrix.
test_mat.shape

In [None]:
# Short alias for the fitted vocabulary mapping used by the next cells.
voc = cv.vocabulary_

In [None]:
# Split the vocabulary mapping into two parallel lists, in the mapping's
# iteration order: the terms and the value stored for each term.
voc_words = list(voc)
voc_index = [voc[term] for term in voc_words]

In [None]:
# Order the terms in place by the value the vocabulary maps them to.
voc_words.sort(key=voc.get)

In [None]:
import scipy.io as io

In [None]:
# Payload for the .mat file: both splits as CSR sparse matrices plus the
# ordered list of vocabulary terms.
data = dict(
    train_mat=sparse.csr_matrix(train_mat),
    test_mat=sparse.csr_matrix(test_mat),
    vocabulary_words=voc_words,
)

In [None]:
# Write the bag-of-words payload to a MATLAB-format file in the working directory.
io.savemat('yelp_review_bag_of_words.mat', data)

### Classify

In [None]:
# Read back the bag-of-words file written by the `io.savemat` cell above.
all_d = io.loadmat('yelp_review_bag_of_words.mat')

In [None]:
# Densify the stored sparse matrices for the estimators below.
# `.toarray()` returns a plain `np.ndarray`; `.todense()` would return the
# legacy `np.matrix` type, which current scikit-learn estimators reject.
train_d = all_d['train_mat'].toarray()
test_d = all_d['test_mat'].toarray()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the training rows (features and star ratings) for validation;
# the fixed random_state keeps the split the same on every run.
train_data, val_data, train_lab, val_lab = train_test_split(
    train_d,
    yelp_review_train.stars,
    test_size=0.33,
    random_state=42,
)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

#### Train

In [None]:
# Random forest used for the validation experiment: 200 trees, depth capped at
# 100, a third of the features considered per split, all CPU cores.
# `random_state` is fixed (same seed as the train/validation split) so the
# validation error is reproducible across runs.
rfg = RandomForestRegressor(
    n_estimators=200,
    max_depth=100,
    n_jobs=-1,
    max_features=0.33,
    random_state=42,
)

In [None]:
# Fit the forest on the training portion of the split.
rfg.fit(train_data, train_lab)

In [None]:
# Predict star ratings for the held-out validation rows.
rfg_preds = rfg.predict(val_data)

In [None]:
# Mean squared error of the random-forest predictions on the validation set.
np.mean(np.square(rfg_preds - val_lab))

#### Test prediction

In [None]:
# Final random forest for the test predictions: 500 trees, depth capped at 100,
# a third of the features considered per split, all CPU cores.
# `random_state` is fixed so the exported predictions can be regenerated exactly.
rfg_all = RandomForestRegressor(
    n_estimators=500,
    max_depth=100,
    n_jobs=-1,
    max_features=0.33,
    random_state=42,
)

In [None]:
# Fit on the full training matrix (no validation hold-out) with the star ratings as target.
rfg_all.fit(train_d, yelp_review_train.stars)

In [None]:
# Predict star ratings for the test matrix.
test_rfg_preds = rfg_all.predict(test_d)

In [None]:
# Show the test-set predictions as the cell output.
test_rfg_preds

In [None]:
# Write the test-set predictions to a text file in the working directory.
np.savetxt('review_rf_test_pred_500x100_all.csv', test_rfg_preds)

### SVR

In [None]:
from sklearn.svm import SVR

In [None]:
# RBF-kernel support vector regressor; the solver is capped at 5000 iterations
# and given a kernel cache size of 4000.
svr = SVR(
    kernel="rbf",
    C=1.0,
    epsilon=0.1,
    max_iter=5000,
    cache_size=4000,
)

In [None]:
# Fit the SVR on the training portion of the split.
svr.fit(train_data, train_lab)

In [None]:
# Predict star ratings for the held-out validation rows.
svr_preds = svr.predict(val_data)

In [None]:
# Mean squared error of the SVR predictions on the validation set.
np.mean(np.square(svr_preds - val_lab))