In [None]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from xgboost import XGBRegressor

In [2]:
yelp_reviews_processed = pd.read_csv("/Users/brandonwu/Downloads/yelp_review_data_processed.csv")
yelp_reviews_processed = yelp_reviews_processed.drop(["review_id", "user_id", "business_id", "useful", "funny", "cool", "text", "date"], axis=1)
yelp_reviews_processed = yelp_reviews_processed.dropna(subset=["processed_text"])
yelp_reviews_processed["processed_text"] = yelp_reviews_processed["processed_text"].astype("string")
yelp_reviews_processed = yelp_reviews_processed.head(500000)

tokenized_reviews = yelp_reviews_processed["processed_text"].str.split().tolist()

In [4]:
word2vec_model = Word2Vec(sentences=tokenized_reviews, vector_size=300, window=5, min_count=5, workers=4, sg=1)

In [5]:
def get_average_word2vec(tokens_list, model, vector_size=300):
    review_vector = np.zeros(vector_size)
    valid_words = 0
    for word in tokens_list:
        if word in model.wv:
            review_vector += model.wv[word]
            valid_words += 1
    if valid_words > 0:
        review_vector /= valid_words
    return review_vector


X = np.array([get_average_word2vec(review, word2vec_model) for review in tokenized_reviews])
y = yelp_reviews_processed["stars"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  
X_test_scaled = scaler.transform(X_test)  

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_scaled, y_train)
y_pred = lr_model.predict(X_test_scaled)

accuracy_score = cross_val_score(lr_model, X_train_scaled, y_train, cv=10, scoring="accuracy")
log_loss_score = cross_val_score(lr_model, X_train_scaled, y_train, cv=10, scoring="neg_log_loss")

print("WORD2VEC RESULTS FOR LOGISTIC REGRESSION")
print(f"Accuracy: {accuracy_score.mean()}")
print(f"Log Loss: {-log_loss_score.mean()}")
print()
print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred))

WORD2VEC RESULTS FOR LOGISTIC REGRESSION
Accuracy: 0.6265975
Log Loss: 0.8788751885385395

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           1       0.75      0.75      0.75     13146
           2       0.38      0.51      0.44      7816
           3       0.38      0.48      0.43     10565
           4       0.47      0.53      0.50     22638
           5       0.84      0.70      0.76     45835

    accuracy                           0.63    100000
   macro avg       0.57      0.59      0.57    100000
weighted avg       0.66      0.63      0.64    100000



In [10]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_scaled, y_train)
y_pred = lr_model.predict(X_test_scaled)

accuracy_score = cross_val_score(lr_model, X_train_scaled, y_train, cv=10, scoring="accuracy")
log_loss_score = cross_val_score(lr_model, X_train_scaled, y_train, cv=10, scoring="neg_log_loss")

print("WORD2VEC RESULTS FOR LOGISTIC REGRESSION")
print(f"Accuracy: {accuracy_score.mean()}")
print(f"Log Loss: {-log_loss_score.mean()}")
print()
print("CLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred))

WORD2VEC RESULTS FOR LOGISTIC REGRESSION
Accuracy: 0.6664549999999999
Log Loss: 0.8014429669602819

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           1       0.71      0.83      0.76     13146
           2       0.45      0.28      0.35      7816
           3       0.46      0.35      0.40     10565
           4       0.52      0.44      0.48     22638
           5       0.76      0.87      0.81     45835

    accuracy                           0.67    100000
   macro avg       0.58      0.55      0.56    100000
weighted avg       0.64      0.67      0.65    100000



In [None]:
xg_model = XGBRegressor(n_estimators=400, learning_rate=0.05, max_depth=7, subsample=0.6, min_child_weight=1, random_state=42)
xg_model.fit(X_train_scaled, y_train)

r2_score = cross_val_score(xg_model, X_train_scaled, y_train, cv=10, scoring="r2")
mse_score = cross_val_score(xg_model, X_train_scaled, y_train, cv=10, scoring="neg_mean_squared_error")

print("WORD2VEC RESULTS FOR XGBOOST")
print(f"R-squared: {r2_score}")
print(f"MSE: {mse_score}")

WORD2VEC RESULTS FOR XGBOOST
R-squared: [0.71518099 0.71717501 0.72027522 0.71569467 0.71950984 0.71466029
 0.71964645 0.71900409 0.71800125 0.71402591]
MSE: [-0.57146984 -0.56951195 -0.56250751 -0.56867981 -0.56759965 -0.56849658
 -0.56479174 -0.56411308 -0.57128447 -0.57084197]


TypeError: 'numpy.ndarray' object is not callable