In [4]:
import pandas as pd
from surprise import SVD, Dataset, accuracy, Reader, KNNWithMeans
from surprise.model_selection import train_test_split, GridSearchCV

# Reading in the data, dropping duplicates, and renaming columns

In [5]:
Review_data = pd.read_csv('/Users/alechuffman/PycharmProjects/Amazon Product Rec Project/Fine_Food_Reviews.csv')
Review_data = Review_data[['ProductId', 'UserId', 'Score', 'Text']]
Review_data = Review_data.rename(columns={"ProductId": "asin", "UserId": "reviewerID", "Score": "overall",
                                          "Text": "reviewText"})
Review_data = Review_data.drop_duplicates(subset='reviewText', keep="first")

In [6]:
def ReviewFilter(data, userReviewNumber, productReviewNumber):
    print("Number of users:", len(data.groupby('reviewerID').count()))
    # taking the subset of users who have given a number of reviews
    userID = data.groupby('reviewerID').count()
    top_users = userID[userID['overall'] >= userReviewNumber].index
    top_users_df = data[data['reviewerID'].isin(top_users)]
    top_users_df = top_users_df[['reviewerID', 'asin', 'overall', 'reviewText']]
    print("Users who have given at least " + str(userReviewNumber) + " Reviews:", len(top_users_df))
    # keeping data for products that have a number of reviews
    productID = data.groupby('asin').count()
    print("# of products:", len(productID))
    top_products = productID[productID['overall'] >= productReviewNumber].index
    top_products_df = top_users_df[top_users_df['asin'].isin(top_products)]
    df_filtered = top_products_df[['reviewerID', 'asin', 'overall', 'reviewText']]
    print("final DF after filtering and shuffling:\n", df_filtered)
    return df_filtered

df = ReviewFilter(Review_data, 20, 20)

Number of users: 256044
Users who have given at least 20 Reviews: 26284
# of products: 67554
final DF after filtering and shuffling:
             reviewerID        asin  overall  \
431     A2OEUROGZDTXUJ  B000G6RYNE        5   
433      A22PUBSSNP54L  B000G6RYNE        5   
443      AYB4ELCS5AM8P  B000G6RYNE        5   
444      A2GOE7ITDGYVE  B000G6RYNE        5   
446     A3PZ4AXTY9J1DZ  B000G6RYNE        4   
...                ...         ...      ...   
567567  A3T0OTH5072YRE  B001BKLHMI        4   
567577  A3H7AOFQDPU0JV  B001BKLHMI        5   
568249  A17V9XL4CWTQ6G  B00374ZKQ0        1   
568261  A2GEZJHBV92EVR  B00374ZKQ0        5   
568317  A1ODOGXEYECQQ8  B0013Z0PTW        4   

                                               reviewText  
431     Now, I haven't done a side-by-side comparison,...  
433     These chips are thick and crunchy.  I absolute...  
443     What I like about them:<br />1) Very thick chi...  
444     These are so tangy it prevents me from eating ...  
4

Note that the review text contains html tags. For sentiment analysis, this will be fixed

# Setting the rating scale and splitting up Train/Test Data

In [31]:
reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(df[['reviewerID', 'asin', 'overall']], reader)
trainset, testset = train_test_split(data, test_size=.2, random_state=0)

For the collaborative filtering model, we'll test 3 different models: 
1. User-Based KNN 
2. Item-Based KNN
3. SVD

In [35]:
knn_user = KNNWithMeans(k=10, min_k=6, sim_options={'name': 'pearson_baseline', 'user_based': True})
knn_item = KNNWithMeans(k=10, min_k=6, sim_options={'name': 'pearson_baseline', 'user_based': False})
svd_model = SVD(n_epochs=10, lr_all=.1, reg_all=0.1)
knn_user.fit(trainset)
knn_item.fit(trainset)
svd_model.fit(trainset)

test_pred_KNN = knn_user.test(testset)
test_pred_SVD = svd_model.test(testset)
test_pred_KNN_item = knn_item.test(testset)

print('User-based KNN RMSE is:', accuracy.rmse(test_pred_KNN))
print('SVD RMSE is:', accuracy.rmse(test_pred_SVD))
print('Item-based KNN RMSE is:', accuracy.rmse(test_pred_KNN_item))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 1.0636
User-based KNN RMSE is: 1.0635680450801486
RMSE: 1.0336
SVD RMSE is: 1.033565205126756
RMSE: 1.1233
Item-based KNN RMSE is: 1.1233141715411559


# SVD Wins - Let's try SVD through grid search CV to improve the model even more

In [36]:
svd_grid = {'n_epochs': [5, 10, 15, 20], "lr_all": [.05, 0.02, .1, .2],
              'reg_all': [0.1, 0.2, 0.4]}
svd_gs = GridSearchCV(SVD, svd_grid, measures=['rmse'], cv=5, refit=True)
svd_gs.fit(data)
best_parameters = svd_gs.best_params  # gets best params
svd_gs.test(testset)
print(best_parameters)
print('after tuning SVD model, the RMSE is:', accuracy.rmse(svd_gs.test(testset)))

{'rmse': {'n_epochs': 10, 'lr_all': 0.02, 'reg_all': 0.2}}
RMSE: 0.7529
after tuning SVD model, the RMSE is: 0.7528884368489644


Sentiment analysis will now be performed on the review text which should be much more accurate