In [None]:
import numpy as np
import numpy.linalg as la
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Training reviews; the `text` and `stars` columns are the ones used further down.
yelp_review_train = pd.read_csv("yelp_academic_dataset_review_train.csv")

In [None]:
# Test reviews; they are stacked under the training rows before vectorization.
yelp_review_test = pd.read_csv("yelp_academic_dataset_review_test.csv")

In [None]:
# Stack train rows first, then test rows, so a single vocabulary can be fitted
# over both and the matrix can later be split back by row position.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and likewise keeps each frame's index labels.
yelp_review_all = pd.concat([yelp_review_train, yelp_review_test])

In [None]:
# Show the combined train + test frame as a quick sanity check.
yelp_review_all

In [None]:
# Bag-of-words extractor: lowercased word uni-, bi- and tri-grams, with the
# vocabulary capped at 1500 terms.
cv = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    lowercase=True,
    max_features=1500,
    decode_error='strict',
)

In [None]:
# Learn the vocabulary from the review text of train and test together.
cv.fit(yelp_review_all["text"])

In [None]:
# Inspect the fitted vocabulary (term -> column index mapping).
cv.vocabulary_

In [None]:
# Count matrix with one row per review (train rows first, then test rows).
term_doc_matrix = cv.transform(yelp_review_all["text"])

In [None]:
# Keep the counts sparse instead of densifying them. A dense copy of a
# reviews x (up to 1500 terms) count matrix is almost entirely zeros, and the
# train/test halves are wrapped in csr_matrix again before being saved anyway.
# CSR supports the row slicing done below, and unlike reassigning the result of
# todense(), this cell can be re-run without raising AttributeError.
term_doc_matrix = term_doc_matrix.tocsr()

In [None]:
# Shape check: rows are all reviews (train + test), columns are vocabulary terms.
term_doc_matrix.shape

In [None]:
# Number of training reviews; marks where the test rows begin in the matrix.
train_row_count = len(yelp_review_train)

In [None]:
# Undo the earlier stacking: leading rows are train, the remainder is test.
train_mat = term_doc_matrix[:train_row_count]
test_mat = term_doc_matrix[train_row_count:]

In [None]:
from scipy import sparse

In [None]:
# Row count here should equal the number of training reviews.
train_mat.shape

In [None]:
# Row count here should equal the number of test reviews.
test_mat.shape

In [None]:
# Short alias for the term -> column-index dict used by the next cells.
voc = cv.vocabulary_

In [None]:
# Terms in the dict's iteration order, with their column indices in a
# parallel list (voc_index[i] is the column of voc_words[i]).
voc_words = list(voc)
voc_index = [voc[word] for word in voc_words]

In [None]:
# Reorder the words in place by column index, so position i names column i.
voc_words.sort(key=voc.__getitem__)

In [None]:
import scipy.io as io

In [None]:
# Payload for the .mat file: both splits as CSR sparse matrices plus the
# vocabulary word list.
data = dict()
data["train_mat"] = sparse.csr_matrix(train_mat)
data["test_mat"] = sparse.csr_matrix(test_mat)
data["vocabulary_words"] = voc_words

In [None]:
# Persist the sparse train/test matrices and the word list to a MATLAB .mat file.
io.savemat('yelp_review_bag_of_words.mat', data)

### Classify

In [None]:
# Reload the bag-of-words features written by the savemat cell above.
all_d = io.loadmat('yelp_review_bag_of_words.mat')

In [None]:
# Densify the loaded sparse matrices as plain ndarrays for the estimator.
# todense() would return numpy.matrix, which scikit-learn deprecated in 1.0 and
# rejects with a TypeError from 1.2 on; toarray() yields the same values as a
# regular 2-D ndarray.
train_d = all_d['train_mat'].toarray()
test_d = all_d['test_mat'].toarray()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the training reviews for validation; the fixed seed keeps
# the split identical between runs. Labels are the `stars` column.
train_data, val_data, train_lab, val_lab = train_test_split(
    train_d,
    yelp_review_train.stars,
    random_state=42,
    test_size=0.33,
)

### Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Least-squares gradient boosting on the bag-of-words counts.
# The loss is deliberately left at its default: squared error was spelled "ls"
# in older scikit-learn and "squared_error" in newer releases (where "ls" is
# rejected), and the default selects it under either spelling.
# max_features=0.33 samples a third of the features at every split, so
# random_state is pinned to make the fitted model reproducible.
gbr = GradientBoostingRegressor(learning_rate=0.1,
                                n_estimators=100,
                                max_depth=20,
                                max_features=0.33,
                                random_state=42)

In [None]:
# Fit on the training portion of the split (validation rows are held out).
gbr.fit(train_data, train_lab)

In [None]:
# Predicted star ratings for the held-out validation rows.
gbr_preds = gbr.predict(val_data)

In [None]:
# Validation mean squared error between predicted and actual star ratings.
((gbr_preds - val_lab) ** 2).mean()

In [None]:
# Predict on the full training matrix (this covers both the rows the model was
# fitted on and the held-out validation rows) and write one value per line.
train_gbr_preds = gbr.predict(train_d)
np.savetxt('gbr_d20_train_pred.csv', train_gbr_preds)

In [None]:
# Predict star ratings for the test reviews.
test_gbr_preds = gbr.predict(test_d)

In [None]:
# Eyeball the test predictions before writing them out.
test_gbr_preds

In [None]:
# Write the test predictions to disk, one value per line.
np.savetxt('gbr_d20_test_pred.csv', test_gbr_preds)