In [22]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise import SVD
from surprise.model_selection import PredefinedKFold

In [23]:
# Each split is stored as two CSV files (base file plus a "_max" file).
# Read both parts, using the first CSV column as the index, and stack them
# row-wise into a single frame per split.
svd_train_df = pd.concat(
    [pd.read_csv(csv_path, index_col=0) for csv_path in ('train_raw.csv', 'train_raw_max.csv')]
)
svd_test_df = pd.concat(
    [pd.read_csv(csv_path, index_col=0) for csv_path in ('test_raw.csv', 'test_raw_max.csv')]
)

# Preview the first rows of the combined test split.
svd_test_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,positive,clean_review
827781,US,26050837,R1U8RPT9T62MXB,B002SD94AY,652294944,Gen Soft Rubber Jelly Silicone Skin Cover Case...,Video Games,5,0,0,N,Y,fits like a glove,"this thing is pretty cool, several times i hav...",2013-08-15,1,thing time psp slip hand
827860,US,11150678,R6C94ECBW3SHZ,B0086V5UF0,245270804,Just Dance 4,Video Games,4,0,0,N,Y,Great Game,This is a great game to play with a group of f...,2013-08-15,1,game group friend time danc
827983,US,11150678,R1ZRZEIBTMEHN6,B003V8Q7BE,172539745,dreamGEAR Power Base Quad for Nintendo Wii & W...,Video Games,4,0,0,N,Y,Money Saver,I've been looking for a way to charge my wii r...,2013-08-15,1,way wii remot battieri remot station on/off sw...
828004,US,11150678,R3NN0Z7YSEA9J3,B002D2Y3IS,811433211,Generic Remote + Nunchuk Nunchuck Controller C...,Video Games,4,0,0,N,Y,Works Great,I bought two of these because they were the ch...,2013-08-15,1,remot wii remot differ remot remot sound work ...
828064,US,26050837,R1DO33YUR5ZKG8,B006986TYS,468681529,Replacement UMD Casing Shell Case for Sony PSP...,Video Games,5,0,0,N,Y,awesome idea,it was a little tough to snap together at firs...,2013-08-15,1,origin games/movi


In [24]:
# Write the user id, item id, rating and review date of each split to its own
# CSV file, in that column order and without the DataFrame index, so the files
# can be parsed as "user item rating timestamp" lines.
svd_train_df.to_csv(
    'svd_train_df.csv',
    columns=['customer_id', 'product_parent', 'star_rating', 'review_date'],
    index=False,
)
svd_test_df.to_csv(
    'svd_test_df.csv',
    columns=['customer_id', 'product_parent', 'star_rating', 'review_date'],
    index=False,
)

In [25]:
# Parser for the exported CSV files: comma-separated fields in the order
# user, item, rating, timestamp, with one header line to skip.
# NOTE(review): no rating_scale is passed, so the Reader default is used;
# confirm it matches the 1-5 star range of the data.
reader = Reader(
    sep=',',
    skip_lines=1,
    line_format='user item rating timestamp',
)

# A single predefined fold given as a (train file, test file) pair.
data = Dataset.load_from_folds(
    [('svd_train_df.csv', 'svd_test_df.csv')],
    reader=reader,
)

In [26]:
# NOTE(review): SVDpp is imported here but not used in this cell.
from surprise import SVDpp

# SVD with library-default hyperparameters.
# NOTE(review): no random seed is set anywhere visible, so the reported
# metrics may vary slightly between runs; confirm whether that is acceptable.
algo = SVD()
pkf = PredefinedKFold()

# Walk over the predefined (train, test) folds carried by `data`.
for trainset, testset in pkf.split(data):

    # Fit on the training fold, then predict every rating in the test fold.
    predictions = algo.fit(trainset).test(testset)

    # Report RMSE and MAE for this fold (verbose=True prints each metric).
    # The names say "mean", but each holds the value of the latest fold only.
    mean_rmse = accuracy.rmse(predictions, verbose=True)
    mean_mae = accuracy.mae(predictions, verbose=True)

RMSE: 1.1965
MAE:  0.9059


In [27]:
# Exact-match accuracy: count the test reviews whose predicted rating, rounded
# to the nearest integer, equals the actual star rating.
correct = 0
n_test = len(svd_test_df)

# Only three columns are needed. itertuples over that slice avoids building a
# full object-dtype Series per row (as iterrows does) and keeps each column's
# own dtype, so the ids stringify exactly as they were written to CSV.
test_ratings = svd_test_df[['customer_id', 'product_parent', 'star_rating']]
for customer_id, product_parent, star_rating in test_ratings.itertuples(index=False, name=None):
    # Ids are passed as strings; presumably this matches the raw ids Surprise
    # parsed from the CSV files. Verify if predictions look like defaults.
    uid = str(customer_id)
    iid = str(product_parent)

    # get a prediction for specific users and items.
    pred = algo.predict(uid, iid, r_ui=star_rating, verbose=False)
    if round(pred.est) == star_rating:
        correct += 1

# Guard the division so an empty test set reports NaN instead of raising
# ZeroDivisionError.
correct_fraction = correct / n_test if n_test else float('nan')
print('Correct: {} %: {}'.format(correct, correct_fraction))

Correct: 70848 %: 0.3141010294469715


In [28]:
# Binary evaluation: a review is "positive" when its rating is strictly above
# POSITIVE_THRESHOLD stars. Counts agreement between predicted and actual
# sentiment, plus predicted positives, false positives and false negatives.
POSITIVE_THRESHOLD = 3


def _fraction(count, total):
    """Return count / total, or NaN when total is 0 (empty test set)."""
    return count / total if total else float('nan')


correct = 0
positive = 0
false_pos = 0
false_neg = 0
n_test = len(svd_test_df)

# Only three columns are needed. itertuples over that slice avoids building a
# full object-dtype Series per row (as iterrows does) and keeps each column's
# own dtype, so the ids stringify exactly as they were written to CSV.
test_ratings = svd_test_df[['customer_id', 'product_parent', 'star_rating']]
for customer_id, product_parent, star_rating in test_ratings.itertuples(index=False, name=None):
    uid = str(customer_id)
    iid = str(product_parent)

    # get a prediction for specific users and items.
    pred = algo.predict(uid, iid, r_ui=star_rating, verbose=False)

    # Evaluate each side of the comparison once per row.
    predicted_positive = pred.est > POSITIVE_THRESHOLD
    actually_positive = star_rating > POSITIVE_THRESHOLD

    # Correct when prediction and truth fall on the same side of the threshold.
    if predicted_positive == actually_positive:
        correct += 1

    if predicted_positive:
        positive += 1
        if not actually_positive:
            false_pos += 1
    elif actually_positive:
        false_neg += 1

# All four ratios are fractions of the whole test set, not per-class rates.
print('Correct: {} %: {}'.format(correct, _fraction(correct, n_test)))
print('Positive: {} %: {}'.format(positive, _fraction(positive, n_test)))
print('False Positive: {} %: {}'.format(false_pos, _fraction(false_pos, n_test)))
print('False Negative: {} %: {}'.format(false_neg, _fraction(false_neg, n_test)))

Correct: 175109 %: 0.776336906693622
Positive: 224911 %: 0.9971315581801576
False Positive: 50297 %: 0.22298920898394203
False Negative: 152 %: 0.0006738843224359145
