**Strategy**: Real users adding 5.0 ratings for a particular movie (i.e. Men in Black). <br>
Vary the number of users, who add fake ratings, e.g. 1%, 10% or 50% of the total number of users. <br>
Check how it affects overall prediction quality as well as predictions for particular user/movie.

In [1]:
import numpy as np
import pandas as pd
from surprise import SVD
from surprise import Dataset, Reader, accuracy
from surprise.prediction_algorithms.knns import KNNBaseline
from sklearn.metrics import mean_squared_error as mse
from math import sqrt
from IPython.display import display
import dataframe_image as dfi

In [2]:
#loading training data
train_df = pd.read_csv("data/ml-100k/u1.base", sep = "\t", header = None, engine = "python",
                    usecols = [0,1,2], names = ["userID","itemID", "rating"],
                    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

#loading test data
test_df = pd.read_csv("data/ml-100k/u1.test", sep = "\t", header = None, engine = "python",
                    usecols = [0,1,2], names = ["userID","itemID", "rating"],
                    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [3]:
reader = Reader(rating_scale=(1, 5))

#getting the data into appropriate format
train_dataset = Dataset.load_from_df(train_df, reader)
trainset = train_dataset.build_full_trainset()
testset = list(zip(*map(test_df.get, test_df)))

In [4]:
#dictionaries to store the results for two algorithms
svd_error = {}
svd_pred = {}
knn_error = {}
knn_pred = {}

In [5]:
def svd_prediction(trainset):
    #justification for parameters is in the notebook "model selection"
    svd = SVD(biased = True, n_factors = 150, n_epochs = 30, lr_all = 0.01, reg_all = 0.1, random_state = 49)

    # train the algorithm on the trainset, and predict ratings for the testset
    svd.fit(trainset)
    
    #testset in this case is a global variable that doesn't change
    predictions = svd.test(testset)

    # compute RMSE
    error = accuracy.rmse(predictions)
    
    return predictions, error

In [6]:
#simple predictions without adding fake ratings
svd_pred["0% fake ratings"], svd_error["0% fake ratings"] = svd_prediction(trainset)

RMSE: 0.9262


In [7]:
def knn_prediction(trainset):
    #justification for algorithm and parameters is in the notebook "model selection"
    knn = KNNBaseline(verbose = False, sim_options = {"name":"cosine", "user_based":True})
    
    #fit on the training data provided
    knn.fit(trainset)
    
    #produce predictions
    predictions = knn.test(testset)
    
    #compute error
    error = accuracy.rmse(predictions)
    
    return predictions, error

In [8]:
#simple predictions without adding fake ratings
knn_pred["0% fake ratings"], knn_error["0% fake ratings"] = knn_prediction(trainset)

RMSE: 0.9462


In [9]:
#note the total number of users
num_users = len(train_df["userID"].unique())

#extract unique ids from train dataframe to iterate over them
ids = train_df["userID"].unique()

def fake_ratings(percent = 0.01):
    #calculate the number of new ratings we need to add
    num_new_ratings = int(percent*num_users)
    
    #counter to keep track of how many ratings we added
    count = 0
    
    #create a copy of training dataframe
    fake_train_df = train_df.copy()
    
    #iterate over each user
    for user_id in ids:
        #extract ratings for particular user
        user_df = train_df[train_df["userID"] == user_id]
        
        #if user didn't rate the movie, then append the 5.0 rating to fake_train dataframe
        #and increment the counter
        if 257 not in user_df["itemID"].unique():
            count+=1
            temp_df = pd.DataFrame([[user_id, 257, 5.0]], columns = ["userID", "itemID", "rating"])
            fake_train_df = fake_train_df.append(temp_df, ignore_index = True)

        #stop iterating over users when we have enough fake ratings
        if count>num_new_ratings:
            break
    
    #convert dataframe with fake ratings into object of Trainset class
    fake_train_dataset = Dataset.load_from_df(fake_train_df, reader)
    fake_trainset = fake_train_dataset.build_full_trainset()

    #run svd with fake ratings, append the mean rating and the error
    prediction_svd, error_svd = svd_prediction(fake_trainset)
    
    #same for knn
    prediction_knn, error_knn = knn_prediction(fake_trainset)

    return prediction_svd, error_svd, prediction_knn, error_knn

In [10]:
svd_pred["1% fake ratings"], svd_error["1% fake ratings"], knn_pred["1% fake ratings"], knn_error["1% fake ratings"] = fake_ratings(0.01)

RMSE: 0.9262
RMSE: 0.9461


In [11]:
svd_pred["10% fake ratings"], svd_error["10% fake ratings"], knn_pred["10% fake ratings"], knn_error["10% fake ratings"] = fake_ratings(0.1)

RMSE: 0.9264
RMSE: 0.9474


In [12]:
svd_pred["50% fake ratings"], svd_error["50% fake ratings"], knn_pred["50% fake ratings"], knn_error["50% fake ratings"] = fake_ratings(0.5)

RMSE: 0.9268
RMSE: 0.9491


In [13]:
#convert predictions to a dataframe
def prediction_to_df(predictions):
    df = pd.DataFrame(predictions) 
    df = df.drop(["r_ui", "details"], axis=1)
    df = df.rename(columns={"uid": "userID", "iid": "itemID", "est":"rating"})
    
    return df

In [14]:
#function to show the dataframe as a nicely formatted table
def show_table(df):
    #creating Styler object and setting its properties
    df1 = df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])]).set_precision(4)
    df1.set_properties(**{'text-align': 'center'}).hide_index()
    
    display(df1)
    return df1

In [15]:
#Mean overall rating change for a movie
def mean_overall_rating_change(predictions, error, movieID):
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "true mean rating", "predicted mean rating", "% change in rating", "RMSE", "% change in RMSE"]
    
    #dataframe placeholder
    df = pd.DataFrame()
    
    #actual predictions
    actual = test_df[test_df["itemID"]==movieID]["rating"]

    for name in names:
        #get the true mean rating
        true = round(test_df[test_df["itemID"]==movieID]["rating"].mean(), 4)
        
        #transform predictions into dataframe
        preds = prediction_to_df(predictions[name])
        
        #get the predicted mean rating
        predicted = round(preds[preds["itemID"]==movieID]["rating"].mean(), 4)
        
        #calculate the change relative to baseline prediction
        change_rating = round((predicted-true)/true*100,2)
        
        #for single movie rmse is actually just absolute difference between actual observation
        #and prediction
        rmse = sqrt(mse(actual,preds[preds["itemID"]==movieID]["rating"]))
        
        if name == "0% fake ratings":
            baseline_error = rmse
        
        #calculate the change relative to baseline error
        change_error = round((rmse-baseline_error)/baseline_error*100, 4)
        
        #append a row to the dataframe
        df = df.append([[name[:3], true, predicted, change_rating, rmse, change_error]], ignore_index = True)
    
    #set the right names for columns
    df.columns = columns
    return show_table(df)

In [16]:
print("svd for men in black\n")
result = mean_overall_rating_change(svd_pred, svd_error, 257)
dfi.export(result,"results/scenario1/svd_mib_overall_rating.png")

svd for men in black



% fake ratings,true mean rating,predicted mean rating,% change in rating,RMSE,% change in RMSE
0%,3.5577,3.7137,4.38,0.9834,0.0
1%,3.5577,3.7281,4.79,0.9831,-0.028
10%,3.5577,3.8242,7.49,1.0159,3.298
50%,3.5577,4.0712,14.43,1.1226,14.1479


In [17]:
print("knn for men in black\n")
result = mean_overall_rating_change(knn_pred, knn_error, 257)
dfi.export(result,"results/scenario1/knn_mib_overall_rating.png")

knn for men in black



% fake ratings,true mean rating,predicted mean rating,% change in rating,RMSE,% change in RMSE
0%,3.5577,3.7184,4.52,0.9918,0.0
1%,3.5577,3.8126,7.16,1.0136,2.1941
10%,3.5577,4.431,24.55,1.3582,36.9399
50%,3.5577,4.765,33.93,1.602,61.521


In [18]:
print("svd for lion king\n")
result = mean_overall_rating_change(svd_pred, svd_error, 71)
dfi.export(result,"results/scenario1/svd_lk_overall_rating.png")

svd for lion king



% fake ratings,true mean rating,predicted mean rating,% change in rating,RMSE,% change in RMSE
0%,3.8,3.7815,-0.49,0.7153,0.0
1%,3.8,3.7828,-0.45,0.7155,0.0325
10%,3.8,3.7866,-0.35,0.7183,0.4162
50%,3.8,3.789,-0.29,0.72,0.6535


In [19]:
print("knn for lion king\n")
result = mean_overall_rating_change(knn_pred, knn_error, 71)
dfi.export(result,"results/scenario1/knn_lk_overall_rating.png")

knn for lion king



% fake ratings,true mean rating,predicted mean rating,% change in rating,RMSE,% change in RMSE
0%,3.8,3.7624,-0.99,0.7591,0.0
1%,3.8,3.7619,-1.0,0.7589,-0.019
10%,3.8,3.7629,-0.98,0.7621,0.3987
50%,3.8,3.758,-1.11,0.7728,1.8071


In [20]:
#comparing overall error with different number of fake ratings
def overall_error_change(error):
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "RMSE", "% change in RMSE"]
    
    #dataframe placeholder
    df = pd.DataFrame()
    
    #extract the base error to compare to
    baseline = error[names[0]]
    
    for name in names:
        #retrieve the current error
        rmse = error[name]
        
        #calculate percentage change in the error
        change = round((rmse-baseline)/baseline*100,4)
        
        #append a row to the dataframe
        df = df.append([[name[:3], rmse, change]], ignore_index = True)
        
    df.columns = columns
    return show_table(df)

In [21]:
print("Overall change in RMSE for SVD\n")
result = overall_error_change(svd_error)
dfi.export(result,"results/scenario1/svd_overall_error.png")

Overall change in RMSE for SVD



% fake ratings,RMSE,% change in RMSE
0%,0.9262,0.0
1%,0.9262,0.0036
10%,0.9264,0.0268
50%,0.9268,0.0695


In [22]:
print("Overall change in RMSE for kNN\n")
result = overall_error_change(knn_error)
dfi.export(result,"results/scenario1/knn_overall_error.png")

Overall change in RMSE for kNN



% fake ratings,RMSE,% change in RMSE
0%,0.9462,0.0
1%,0.9461,-0.0044
10%,0.9474,0.1317
50%,0.9491,0.311


In [23]:
#analyze how rating of a particular movie changed for certain user
def rating_change_user_movie(predictions, movieID):
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "true", "predicted", "% change in rating"]
    
    #dataframe placeholder
    df = pd.DataFrame()
    
    #extract true rating for given movie and user 23
    true_rating = test_df.loc[(test_df["itemID"]==movieID) & (test_df["userID"]==23)]["rating"].values[0]
    
    for name in names:
        #convert predictions into dataframe
        preds = prediction_to_df(predictions[name])
        
        #extract predicted rating for movie and user
        predicted_rating = preds.loc[(preds["itemID"]==movieID) & (preds["userID"]==23)]["rating"].values[0]
        
        #compute the percentage change between true and current rating
        change = round((predicted_rating-true_rating)/true_rating*100,4)
        
        #append new row to dataframe
        df = df.append([[name[:3], true_rating, predicted_rating, change]], ignore_index = True)
        
    df.columns = columns
    return show_table(df)

In [24]:
print("Change in mean rating for user 23, movie Men in Black and SVD predictions\n")
result = rating_change_user_movie(svd_pred, 257)
dfi.export(result,"results/scenario1/svd_mib_23_rating.png")

Change in mean rating for user 23, movie Men in Black and SVD predictions



% fake ratings,true,predicted,% change in rating
0%,3.0,3.5183,17.2783
1%,3.0,3.5326,17.7526
10%,3.0,3.7779,25.9291
50%,3.0,3.9845,32.8182


In [25]:
print("Change in mean rating for user 23, movie The Lion King and SVD predictions\n")
result = rating_change_user_movie(svd_pred, 71)
dfi.export(result,"results/scenario1/svd_lk_23_rating.png")

Change in mean rating for user 23, movie The Lion King and SVD predictions



% fake ratings,true,predicted,% change in rating
0%,3.0,3.4,13.3322
1%,3.0,3.3998,13.3278
10%,3.0,3.4294,14.3124
50%,3.0,3.4196,13.9882


In [26]:
print("Change in mean rating for user 23, movie Men in Black and kNN predictions\n")
result = rating_change_user_movie(knn_pred, 257)
dfi.export(result,"results/scenario1/knn_mib_23_rating.png")

Change in mean rating for user 23, movie Men in Black and kNN predictions



% fake ratings,true,predicted,% change in rating
0%,3.0,3.7857,26.1898
1%,3.0,3.926,30.8666
10%,3.0,4.711,57.0345
50%,3.0,4.9397,64.6572


In [27]:
print("Change in mean rating for user 23, movie The Lion King and kNN predictions\n")
result = rating_change_user_movie(knn_pred, 71)
dfi.export(result,"results/scenario1/knn_lk_23_rating.png")

Change in mean rating for user 23, movie The Lion King and kNN predictions



% fake ratings,true,predicted,% change in rating
0%,3.0,3.7412,24.7083
1%,3.0,3.7413,24.7109
10%,3.0,3.7371,24.5695
50%,3.0,3.7338,24.4616


In [28]:
#analyze how the mean rating for one user changed
def rating_change_user(predictions):
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "true rating", "predicted rating", "% change in rating"]
    
    #dataframe placeholder
    df = pd.DataFrame()
    
    #retrieve actual mean rating from test data
    actual_rating = test_df[test_df["userID"]==23]["rating"].mean()
    
    for name in names:
        #convert predictions into dataframe
        preds = prediction_to_df(predictions[name])
        
        #extract the mean rating for user
        predicted_rating = preds[preds["userID"]==23]["rating"].mean()
        
        #calculate percentage change in rating
        change = round((predicted_rating-actual_rating)/actual_rating*100, 4)
        
        #append new row to dataframe
        df = df.append([[name[:3], actual_rating, predicted_rating, change]], ignore_index = True)
        
    df.columns = columns
    return show_table(df)

In [29]:
print("Overall rating change for user 23 and SVD predictions\n")
result = rating_change_user(svd_pred)
dfi.export(result,"results/scenario1/svd_23_rating.png")

Overall rating change for user 23 and SVD predictions



% fake ratings,true rating,predicted rating,% change in rating
0%,3.6349,3.6491,0.3901
1%,3.6349,3.6494,0.3994
10%,3.6349,3.6715,1.0058
50%,3.6349,3.672,1.0188


In [30]:
print("Overall rating change for user 23 and kNN predictions\n")
result = rating_change_user(knn_pred)
dfi.export(result,"results/scenario1/knn_23_rating.png")

Overall rating change for user 23 and kNN predictions



% fake ratings,true rating,predicted rating,% change in rating
0%,3.6349,3.7273,2.5415
1%,3.6349,3.7294,2.5996
10%,3.6349,3.7489,3.1357
50%,3.6349,3.7541,3.278


In [31]:
#analyze how the RMSE changed for particular user
def error_change_user(predictions):
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "RMSE", "% change in RMSE"]
    
    #dataframe placeholder
    df = pd.DataFrame()
    
    #retrieve actual mean rating from test data
    actual_ratings = test_df[test_df["userID"]==23]["rating"]
    
    for name in names:
        #convert predictions into dataframe
        preds = prediction_to_df(predictions[name])
        
        #extract the mean rating for user
        predicted_ratings = preds[preds["userID"]==23]["rating"]
        
        #calculate rmse for single user
        rmse = sqrt(mse(actual_ratings, predicted_ratings))
        
        #since there is no error on test data, take the first predictions as baseline
        if name == "0% fake ratings":
            baseline_error = rmse
            
        #calculate percentage change in rating
        change = round((rmse-baseline_error)/baseline_error*100, 4)
        
        #append new row to dataframe
        df = df.append([[name[:3], rmse, change]], ignore_index = True)
        
    df.columns = columns
    return show_table(df)

In [32]:
print("RMSE change for user 23 and SVD predictions\n")
result = error_change_user(svd_pred)
dfi.export(result,"results/scenario1/svd_23_error.png")

RMSE change for user 23 and SVD predictions



% fake ratings,RMSE,% change in RMSE
0%,0.8467,0.0
1%,0.8468,0.0187
10%,0.8485,0.215
50%,0.8524,0.6755


In [33]:
print("RMSE change for user 23 and kNN predictions\n")
result = error_change_user(knn_pred)
dfi.export(result,"results/scenario1/knn_23_error.png")

RMSE change for user 23 and kNN predictions



% fake ratings,RMSE,% change in RMSE
0%,0.8675,0.0
1%,0.8698,0.2569
10%,0.8859,2.1133
50%,0.8838,1.8708
