**Strategy**: Real users adding 5.0 ratings for a particular movie (i.e. Men in Black). <br>
Vary the number of users, who add fake ratings, e.g. 1%, 10% or 50% of the total number of users. <br>
Check how it affects overall prediction quality as well as predictions for particular user/movie.

In [40]:
import numpy as np
import pandas as pd
from surprise import SVD
from surprise import Dataset, Reader, accuracy
from surprise.prediction_algorithms.knns import KNNBaseline
from sklearn.metrics import mean_squared_error as mse
from math import sqrt
from IPython.display import display
import dataframe_image as dfi

In [41]:
#loading training data
train_df = pd.read_csv("data/ml-100k/u1.base", sep = "\t", header = None, engine = "python",
                    usecols = [0,1,2], names = ["userID","itemID", "rating"],
                    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

#loading test data
test_df = pd.read_csv("data/ml-100k/u1.test", sep = "\t", header = None, engine = "python",
                    usecols = [0,1,2], names = ["userID","itemID", "rating"],
                    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [42]:
reader = Reader(rating_scale=(1, 5))

#getting the data into appropriate format
train_dataset = Dataset.load_from_df(train_df, reader)
trainset = train_dataset.build_full_trainset()
testset = list(zip(*map(test_df.get, test_df)))

In [43]:
svd_error = {}
svd_pred = {}
knn_error = {}
knn_pred = {}

In [44]:
def svd_prediction(trainset):
    #justification for parameters is in the notebook "model selection"
    svd = SVD(biased = True, n_factors = 150, n_epochs = 30, lr_all = 0.01, reg_all = 0.1, random_state = 49)

    # train the algorithm on the trainset, and predict ratings for the testset
    svd.fit(trainset)
    
    #testset in this case is a global variable that doesn't change
    predictions = svd.test(testset)

    # compute RMSE
    error = accuracy.rmse(predictions)
    
    return predictions, error

In [45]:
#simple predictions without adding fake ratings
svd_pred["0% fake ratings"], svd_error["0% fake ratings"] = svd_prediction(trainset)

RMSE: 0.9262


In [46]:
def knn_prediction(trainset):
    #justification for algorithm and parameters is in the notebook "model selection"
    knn = KNNBaseline(verbose = False, sim_options = {"name":"cosine", "user_based":True})
    
    #fit on the training data provided
    knn.fit(trainset)
    
    #produce predictions
    predictions = knn.test(testset)
    
    #compute error
    error = accuracy.rmse(predictions)
    
    return predictions, error

In [47]:
#simple predictions without adding fake ratings
knn_pred["0% fake ratings"], knn_error["0% fake ratings"] = knn_prediction(trainset)

RMSE: 0.9462


In [48]:
#note the total number of users
num_users = len(train_df["userID"].unique())

#extract unique ids from train dataframe to iterate over them
ids = train_df["userID"].unique()

def fake_ratings(percent = 0.01):
    #calculate the number of new ratings we need to add
    num_new_ratings = int(percent*num_users)
    
    #counter to keep track of how many ratings we added
    count = 0
    
    #create a copy of training dataframe
    fake_train_df = train_df.copy()
    
    #iterate over each user
    for user_id in ids:
        #extract ratings for particular user
        user_df = train_df[train_df["userID"] == user_id]
        
        #if user didn't rate the movie, then append the 5.0 rating to fake_train dataframe
        #and increment the counter
        if 1 not in user_df["itemID"].unique():
            count+=1
            temp_df = pd.DataFrame([[user_id, 257, 5.0]], columns = ["userID", "itemID", "rating"])
            fake_train_df = fake_train_df.append(temp_df, ignore_index = True)

        #stop iterating over users when we have enough fake ratings
        if count>num_new_ratings:
            break
    
    #convert dataframe with fake ratings into object of Trainset class
    fake_train_dataset = Dataset.load_from_df(fake_train_df, reader)
    fake_trainset = fake_train_dataset.build_full_trainset()

    #run svd with fake ratings, append the mean rating and the error
    prediction_svd, error_svd = svd_prediction(fake_trainset)
    
    #same for knn
    prediction_knn, error_knn = knn_prediction(fake_trainset)

    return prediction_svd, error_svd, prediction_knn, error_knn

In [49]:
svd_pred["1% fake ratings"], svd_error["1% fake ratings"], knn_pred["1% fake ratings"], knn_error["1% fake ratings"] = fake_ratings(0.01)

RMSE: 0.9262
RMSE: 0.9462


In [50]:
svd_pred["10% fake ratings"], svd_error["10% fake ratings"], knn_pred["10% fake ratings"], knn_error["10% fake ratings"] = fake_ratings(0.1)

RMSE: 0.9265
RMSE: 0.9475


In [51]:
svd_pred["50% fake ratings"], svd_error["50% fake ratings"], knn_pred["50% fake ratings"], knn_error["50% fake ratings"] = fake_ratings(0.5)

RMSE: 0.9271
RMSE: 0.9485


In [52]:
#convert predictions to a dataframe
def prediction_to_df(predictions):
    df = pd.DataFrame(predictions) 
    df = df.drop(["r_ui", "details"], axis=1)
    df = df.rename(columns={"uid": "userID", "iid": "itemID", "est":"rating"})
    
    return df

In [53]:
#function to show the dataframe as a nicely formatted table
def show_table(df):
    #creating Styler object and setting its properties
    df1 = df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])]).set_precision(4)
    df1.set_properties(**{'text-align': 'center'}).hide_index()
    
    display(df1)
    return df1

In [54]:
#Mean overall rating change for a movie
def mean_overall_rating_change(predictions, error, movieID):
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "true mean rating", "predicted mean rating", "% change in rating", "RMSE", "% change in RMSE"]
    
    #dataframe placeholder
    df = pd.DataFrame()
    
    #actual predictions
    actual = test_df[test_df["itemID"]==movieID]["rating"]

    for name in names:
        #get the true mean rating
        true = round(test_df[test_df["itemID"]==movieID]["rating"].mean(), 4)
        
        #transform predictions into dataframe
        preds = prediction_to_df(predictions[name])
        
        #get the predicted mean rating
        predicted = round(preds[preds["itemID"]==movieID]["rating"].mean(), 4)
        
        #calculate the change relative to baseline prediction
        change_rating = round((predicted-true)/true*100,2)
        
        #for single movie rmse is actually just absolute difference between actual observation
        #and prediction
        rmse = sqrt(mse(actual,preds[preds["itemID"]==movieID]["rating"]))
        
        if name == "0% fake ratings":
            baseline_error = rmse
        
        #calculate the change relative to baseline error
        change_error = round((rmse-baseline_error)/baseline_error*100, 4)
        
        #append a row to the dataframe
        df = df.append([[name[:3], true, predicted, change_rating, rmse, change_error]], ignore_index = True)
    
    #set the right names for columns
    df.columns = columns
    return show_table(df)

In [55]:
print("svd for men in black\n")
result = mean_overall_rating_change(svd_pred, svd_error, 257)
dfi.export(result,"results/scenario1/svd_mib_overall_rating.png")

svd for men in black



% fake ratings,true mean rating,predicted mean rating,% change in rating,RMSE,% change in RMSE
0%,3.5577,3.7137,4.38,0.9834,0.0
1%,3.5577,3.7186,4.52,0.9815,-0.2
10%,3.5577,3.7912,6.56,1.0157,3.2829
50%,3.5577,4.1996,18.04,1.1887,20.8717


In [73]:
print("knn for men in black\n")
result = mean_overall_rating_change(knn_pred, knn_error, 257)
dfi.export(result,"results/scenario1/knn_mib_overall_rating.png")

knn for men in black



% fake ratings,true mean rating,predicted mean rating,% change in rating,RMSE,% change in RMSE
0%,3.5577,3.7184,4.52,0.9918,0.0
1%,3.5577,3.8074,7.02,1.0143,2.2662
10%,3.5577,4.4168,24.15,1.3522,36.3337
50%,3.5577,4.7594,33.78,1.5692,58.2174


In [57]:
print("svd for lion king\n")
result = mean_overall_rating_change(svd_pred, svd_error, 71)
dfi.export(result,"results/scenario1/svd_lk_overall_rating.png")

svd for lion king



% fake ratings,true mean rating,predicted mean rating,% change in rating,RMSE,% change in RMSE
0%,3.8,3.7815,-0.49,0.7153,0.0
1%,3.8,3.7823,-0.47,0.7159,0.0894
10%,3.8,3.7869,-0.34,0.7187,0.482
50%,3.8,3.7872,-0.34,0.7205,0.7325


In [58]:
print("knn for lion king\n")
result = mean_overall_rating_change(knn_pred, knn_error, 71)
dfi.export(result,"results/scenario1/knn_lk_overall_rating.png")

knn for lion king



% fake ratings,true mean rating,predicted mean rating,% change in rating,RMSE,% change in RMSE
0%,3.8,3.7624,-0.99,0.7591,0.0
1%,3.8,3.7626,-0.98,0.7583,-0.1006
10%,3.8,3.765,-0.92,0.7626,0.4584
50%,3.8,3.7528,-1.24,0.7719,1.6879


In [59]:
#comparing overall error with different number of fake ratings
def overall_error_change(error):
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "RMSE", "% change in RMSE"]
    
    #dataframe placeholder
    df = pd.DataFrame()
    
    #extract the base error to compare to
    baseline = error[names[0]]
    
    for name in names:
        #retrieve the current error
        rmse = error[name]
        
        #calculate percentage change in the error
        change = round((rmse-baseline)/baseline*100,4)
        
        #append a row to the dataframe
        df = df.append([[name[:3], rmse, change]], ignore_index = True)
        
    df.columns = columns
    return show_table(df)

In [60]:
print("Overall change in RMSE for SVD\n")
result = overall_error_change(svd_error)
dfi.export(result,"results/scenario1/svd_overall_error.png")

Overall change in RMSE for SVD



% fake ratings,RMSE,% change in RMSE
0%,0.9262,0.0
1%,0.9262,0.0049
10%,0.9265,0.041
50%,0.9271,0.1


In [61]:
print("Overall change in RMSE for kNN\n")
result = overall_error_change(knn_error)
dfi.export(result,"results/scenario1/knn_overall_error.png")

Overall change in RMSE for kNN



% fake ratings,RMSE,% change in RMSE
0%,0.9462,0.0
1%,0.9462,0.0002
10%,0.9475,0.1429
50%,0.9485,0.2461


In [62]:
#analyze how rating of a particular movie changed for certain user
def rating_change_user_movie(predictions, movieID):
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "true", "predicted", "% change in rating"]
    
    #dataframe placeholder
    df = pd.DataFrame()
    
    #extract true rating for given movie and user 23
    true_rating = test_df.loc[(test_df["itemID"]==movieID) & (test_df["userID"]==23)]["rating"].values[0]
    
    for name in names:
        #convert predictions into dataframe
        preds = prediction_to_df(predictions[name])
        
        #extract predicted rating for movie and user
        predicted_rating = preds.loc[(preds["itemID"]==movieID) & (preds["userID"]==23)]["rating"].values[0]
        
        #compute the percentage change between true and current rating
        change = round((predicted_rating-true_rating)/true_rating*100,4)
        
        #append new row to dataframe
        df = df.append([[name[:3], true_rating, predicted_rating, change]], ignore_index = True)
        
    df.columns = columns
    return show_table(df)

In [63]:
print("Change in mean rating for user 23, movie Men in Black and SVD predictions\n")
result = rating_change_user_movie(svd_pred, 257)
dfi.export(result,"results/scenario1/svd_mib_23_rating.png")

Change in mean rating for user 23, movie Men in Black and SVD predictions



% fake ratings,true,predicted,% change in rating
0%,3.0,3.5183,17.2783
1%,3.0,3.538,17.9338
10%,3.0,3.7794,25.9784
50%,3.0,4.1541,38.4688


In [64]:
print("Change in mean rating for user 23, movie The Lion King and SVD predictions\n")
result = rating_change_user_movie(svd_pred, 71)
dfi.export(result,"results/scenario1/svd_lk_23_rating.png")

Change in mean rating for user 23, movie The Lion King and SVD predictions



% fake ratings,true,predicted,% change in rating
0%,3.0,3.4,13.3322
1%,3.0,3.3998,13.3283
10%,3.0,3.4363,14.5428
50%,3.0,3.4209,14.0289


In [65]:
print("Change in mean rating for user 23, movie Men in Black and kNN predictions\n")
result = rating_change_user_movie(knn_pred, 257)
dfi.export(result,"results/scenario1/knn_mib_23_rating.png")

Change in mean rating for user 23, movie Men in Black and kNN predictions



% fake ratings,true,predicted,% change in rating
0%,3.0,3.7857,26.1898
1%,3.0,3.9207,30.6898
10%,3.0,4.6998,56.6611
50%,3.0,4.9129,63.7621


In [66]:
print("Change in mean rating for user 23, movie The Lion King and kNN predictions\n")
result = rating_change_user_movie(knn_pred, 71)
dfi.export(result,"results/scenario1/knn_lk_23_rating.png")

Change in mean rating for user 23, movie The Lion King and kNN predictions



% fake ratings,true,predicted,% change in rating
0%,3.0,3.7412,24.7083
1%,3.0,3.7413,24.7112
10%,3.0,3.7378,24.595
50%,3.0,3.6927,23.0909


In [74]:
#analyze how the mean rating for one user changed
def rating_change_user(predictions):
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "true rating", "predicted rating", "% change in rating"]
    
    #dataframe placeholder
    df = pd.DataFrame()
    
    #retrieve actual mean rating from test data
    actual_rating = test_df[test_df["userID"]==23]["rating"].mean()
    
    for name in names:
        #convert predictions into dataframe
        preds = prediction_to_df(predictions[name])
        
        #extract the mean rating for user
        predicted_rating = preds[preds["userID"]==23]["rating"].mean()
        
        #calculate percentage change in rating
        change = round((predicted_rating-actual_rating)/actual_rating*100, 4)
        
        #append new row to dataframe
        df = df.append([[name[:3], actual_rating, predicted_rating, change]], ignore_index = True)
        
    df.columns = columns
    return show_table(df)

In [75]:
print("Overall rating change for user 23 and SVD predictions\n")
result = rating_change_user(svd_pred)
dfi.export(result,"results/scenario1/svd_23_rating.png")

Overall rating change for user 23 and SVD predictions



% fake ratings,true rating,predicted rating,% change in rating
0%,3.6349,3.6491,0.3901
1%,3.6349,3.6495,0.4012
10%,3.6349,3.6721,1.0237
50%,3.6349,3.6723,1.0289


In [76]:
print("Overall rating change for user 23 and kNN predictions\n")
result = rating_change_user(knn_pred)
dfi.export(result,"results/scenario1/knn_23_rating.png")

Overall rating change for user 23 and kNN predictions



% fake ratings,true rating,predicted rating,% change in rating
0%,3.6349,3.7273,2.5415
1%,3.6349,3.7293,2.5971
10%,3.6349,3.7497,3.1575
50%,3.6349,3.7486,3.1276


In [70]:
#analyze how the RMSE changed for particular user
def error_change_user(predictions):
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "RMSE", "% change in RMSE"]
    
    #dataframe placeholder
    df = pd.DataFrame()
    
    #retrieve actual mean rating from test data
    actual_ratings = test_df[test_df["userID"]==23]["rating"]
    
    for name in names:
        #convert predictions into dataframe
        preds = prediction_to_df(predictions[name])
        
        #extract the mean rating for user
        predicted_ratings = preds[preds["userID"]==23]["rating"]
        
        #calculate rmse for single user
        rmse = sqrt(mse(actual_ratings, predicted_ratings))
        
        #since there is no error on test data, take the first predictions as baseline
        if name == "0% fake ratings":
            baseline_error = rmse
            
        #calculate percentage change in rating
        change = round((rmse-baseline_error)/baseline_error*100, 4)
        
        #append new row to dataframe
        df = df.append([[name[:3], rmse, change]], ignore_index = True)
        
    df.columns = columns
    return show_table(df)

In [71]:
print("RMSE change for user 23 and SVD predictions\n")
result = error_change_user(svd_pred)
dfi.export(result,"results/scenario1/svd_23_error.png")

RMSE change for user 23 and SVD predictions



% fake ratings,RMSE,% change in RMSE
0%,0.8467,0.0
1%,0.8469,0.0246
10%,0.8491,0.2802
50%,0.8563,1.1364


In [72]:
print("RMSE change for user 23 and kNN predictions\n")
result = error_change_user(knn_pred)
dfi.export(result,"results/scenario1/knn_23_error.png")

RMSE change for user 23 and kNN predictions



% fake ratings,RMSE,% change in RMSE
0%,0.8675,0.0
1%,0.8697,0.2452
10%,0.8872,2.2736
50%,0.8837,1.8638
