**Strategy**: Add fake users who were paid to give a 5.0 rating to one particular movie.<br>
These fake users have never rated anything before, so once added, their only rating is the 5.0 for the movie we chose.<br>
The number of fake users added is 1%, 10% and 50% of the total number of real users.<br>
Use the same popular and recent movie as in *Scenario 1*, i.e. Men in Black, id = 257. <br>
Again, check how this affects the overall prediction quality as well as the predictions for a particular user.

In [35]:
import numpy as np
import pandas as pd
from surprise import SVD
from surprise import Dataset, Reader, accuracy
from surprise.prediction_algorithms.knns import KNNBaseline
from sklearn.metrics import mean_squared_error as mse
from math import sqrt
from IPython.display import display
import dataframe_image as dfi

In [36]:
#loading training data
#only the first three tab-separated columns are read (usecols = [0,1,2]) and named below
#the dtype keys must be the names given in `names`; keys that match no column have no effect,
#which is why the previous 'userId'/'movieId' keys never produced int32 ID columns
train_df = pd.read_csv("data/ml-100k/u1.base", sep = "\t", header = None, engine = "python",
                    usecols = [0,1,2], names = ["userID","itemID", "rating"],
                    dtype={'userID': 'int32', 'itemID': 'int32', 'rating': 'float32'})

#loading test data
#same layout and dtypes as the training file
test_df = pd.read_csv("data/ml-100k/u1.test", sep = "\t", header = None, engine = "python",
                    usecols = [0,1,2], names = ["userID","itemID", "rating"],
                    dtype={'userID': 'int32', 'itemID': 'int32', 'rating': 'float32'})

In [37]:
#ratings are declared to lie on a 1-5 scale
reader = Reader(rating_scale=(1, 5))

#getting the data into appropriate format
#the training dataframe is wrapped in a surprise Dataset and turned into a full trainset
train_dataset = Dataset.load_from_df(train_df, reader)
trainset = train_dataset.build_full_trainset()
#iterating a dataframe yields its column labels, so map(test_df.get, test_df) produces the
#columns one by one; zip(*...) then transposes them into one tuple per row, in column order
#(userID, itemID, rating)
testset = list(zip(*map(test_df.get, test_df)))

In [38]:
#containers for the results of every scenario, filled in by the cells below;
#keys are scenario labels such as "0% fake ratings"
svd_pred, svd_error = {}, {}
knn_pred, knn_error = {}, {}

In [39]:
def svd_prediction(trainset):
    """Fit the SVD model on `trainset` and evaluate it on the global `testset`.

    Returns a tuple `(predictions, error)`: the result of `svd.test` on the test set and
    the RMSE that `accuracy.rmse` computes from those predictions.
    """
    #justification for parameters is in the notebook "model selection"
    hyperparams = dict(biased = True, n_factors = 150, n_epochs = 30,
                       lr_all = 0.01, reg_all = 0.1, random_state = 49)
    model = SVD(**hyperparams)

    #train on whatever trainset the caller supplies (original or with fake users)
    model.fit(trainset)

    #`testset` is a global that never changes, so all scenarios are scored on the same data
    test_predictions = model.test(testset)

    #RMSE over the whole test set
    rmse = accuracy.rmse(test_predictions)

    return test_predictions, rmse

In [40]:
#simple predictions without adding fake ratings
svd_pred["0% fake ratings"], svd_error["0% fake ratings"] = svd_prediction(trainset)

RMSE: 0.9262


In [41]:
def knn_prediction(trainset):
    """Fit the user-based KNNBaseline model on `trainset` and evaluate it on the global `testset`.

    Returns a tuple `(predictions, error)`: the result of `knn.test` on the test set and
    the RMSE that `accuracy.rmse` computes from those predictions.
    """
    #justification for algorithm and parameters is in the notebook "model selection"
    similarity = {"name":"cosine", "user_based":True}
    model = KNNBaseline(verbose = False, sim_options = similarity)

    #train on whatever trainset the caller supplies (original or with fake users)
    model.fit(trainset)

    #`testset` is the same global test data used for every scenario
    test_predictions = model.test(testset)

    #RMSE over the whole test set
    rmse = accuracy.rmse(test_predictions)

    return test_predictions, rmse

In [42]:
#simple predictions without adding fake ratings
knn_pred["0% fake ratings"], knn_error["0% fake ratings"] = knn_prediction(trainset)

RMSE: 0.9462


In [43]:
#number of distinct real users; the share of fake users is taken relative to this count
num_users = len(train_df["userID"].unique())

def fake_users(percent = 0.01, movieID = 257, rating = 5.0):
    """Retrain SVD and kNN on the training data extended with fake users.

    Each fake user contributes exactly one rating: `rating` for movie `movieID`.

    Parameters:
        percent: number of fake users to add, as a fraction of `num_users`
                 (0.01 adds 1% of the real user count, truncated to an integer)
        movieID: ID of the movie that the fake users rate (default 257)
        rating:  rating that every fake user gives (default 5.0)

    Returns:
        (prediction_svd, error_svd, prediction_knn, error_knn), i.e. the outputs of
        `svd_prediction` followed by the outputs of `knn_prediction`, both fitted on the
        extended training set.
    """
    #estimate the number of fake users to add
    num_fake_users = int(percent*num_users)

    #fake IDs start right after the largest real ID, so they can never collide with a real
    #user even if the real IDs are not the contiguous range 1..num_users
    first_fake_id = int(train_df["userID"].max()) + 1

    if num_fake_users > 0:
        #build all fake rows at once instead of appending them one by one
        fake_rows = pd.DataFrame({
            "userID": list(range(first_fake_id, first_fake_id + num_fake_users)),
            "itemID": movieID,
            "rating": rating,
        }, columns = ["userID", "itemID", "rating"])
        #concat returns a new dataframe, the original train_df is left untouched
        fake_train_df = pd.concat([train_df, fake_rows], ignore_index = True)
    else:
        #nothing to add, work on a copy of the original training data
        fake_train_df = train_df.copy()

    #convert dataframe with fake users into object of Trainset class
    fake_train_dataset = Dataset.load_from_df(fake_train_df, reader)
    fake_trainset = fake_train_dataset.build_full_trainset()

    #run svd on the data with fake users
    prediction_svd, error_svd = svd_prediction(fake_trainset)

    #same for knn
    prediction_knn, error_knn = knn_prediction(fake_trainset)

    return prediction_svd, error_svd, prediction_knn, error_knn

In [44]:
svd_pred["1% fake ratings"], svd_error["1% fake ratings"], knn_pred["1% fake ratings"], knn_error["1% fake ratings"] = fake_users(0.01)

RMSE: 0.9262
RMSE: 0.9462


In [45]:
svd_pred["10% fake ratings"], svd_error["10% fake ratings"], knn_pred["10% fake ratings"], knn_error["10% fake ratings"] = fake_users(0.1)

RMSE: 0.9254
RMSE: 0.9462


In [46]:
svd_pred["50% fake ratings"], svd_error["50% fake ratings"], knn_pred["50% fake ratings"], knn_error["50% fake ratings"] = fake_users(0.5)

RMSE: 0.9280
RMSE: 0.9462


In [47]:
def prediction_to_df(predictions):
    """Convert a list of prediction records into a dataframe.

    The records must expose the fields uid, iid, r_ui, est and details. The result keeps
    only uid, iid and est, renamed to userID, itemID and rating.
    """
    #one row per prediction, one column per field
    raw = pd.DataFrame(predictions)

    #the true rating and the details dictionary are not needed downstream
    trimmed = raw.drop(columns=["r_ui", "details"])

    #use the same column names as the train/test dataframes
    return trimmed.rename(columns={"uid": "userID", "iid": "itemID", "est":"rating"})

In [48]:
def show_table(df):
    """Display `df` as a centred table with 4-digit precision and no index column.

    Returns the Styler object that was displayed, so the caller can export it.
    """
    #creating Styler object and centring the header cells
    styled = df.style.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])

    #Styler.set_precision and Styler.hide_index were deprecated and later removed from pandas;
    #use their replacements when available and fall back to the old names otherwise
    if hasattr(styled, "hide"):
        styled = styled.format(precision=4).hide(axis="index")
    else:
        styled = styled.set_precision(4).hide_index()

    #centre the data cells as well
    styled = styled.set_properties(**{'text-align': 'center'})

    display(styled)
    return styled

In [49]:
def mean_overall_rating_change(predictions, error, movieID):
    """Tabulate how the predictions for one movie change with the share of fake ratings.

    Parameters:
        predictions: dict mapping each scenario label in `names` to its list of predictions
        error:       not used by this function; kept so that existing calls keep working
        movieID:     ID of the movie to analyse

    For every scenario the table holds the true mean test rating of the movie, the mean
    predicted rating, the percentage difference between predicted and true mean, the RMSE
    over the movie's test ratings and the percentage change of that RMSE relative to the
    "0% fake ratings" scenario.

    Returns the Styler produced by `show_table`.
    """
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "true mean rating", "predicted mean rating", "% change in rating", "RMSE", "% change in RMSE"]

    #true ratings of the movie in the test set; they do not depend on the scenario,
    #so they and their mean are computed once
    actual = test_df[test_df["itemID"]==movieID]["rating"]
    true = round(actual.mean(), 4)

    #rows are collected in a list and turned into a dataframe once at the end
    rows = []

    for name in names:
        #transform predictions into dataframe and keep the ones for this movie
        preds = prediction_to_df(predictions[name])
        movie_preds = preds[preds["itemID"]==movieID]["rating"]

        #get the predicted mean rating
        predicted = round(movie_preds.mean(), 4)

        #difference between predicted and true mean rating, in percent of the true mean
        change_rating = round((predicted-true)/true*100,2)

        #RMSE over all test ratings of this movie (predictions are in test set order)
        rmse = round(sqrt(mse(actual, movie_preds)),4)

        #the scenario without fake ratings is the baseline for the error comparison
        if name == "0% fake ratings":
            baseline_error = rmse

        #calculate the change relative to baseline error
        change_error = round((rmse-baseline_error)/baseline_error*100, 4)

        #label is the percentage part of the scenario name, e.g. "10%"
        rows.append([name.split()[0], true, predicted, change_rating, rmse, change_error])

    df = pd.DataFrame(rows, columns = columns)
    return show_table(df)

In [50]:
print("svd for men in black\n")
result = mean_overall_rating_change(svd_pred, svd_error, 257)
dfi.export(result,"results/scenario2/svd_mib_overall_rating.png")

svd for men in black



% fake ratings,true mean rating,predicted mean rating,% change in rating,RMSE,% change in RMSE
0%,3.5577,3.7137,4.38,0.9834,0.0
1%,3.5577,3.7881,6.48,1.0198,3.7014
10%,3.5577,4.2733,20.11,1.2231,24.3746
50%,3.5577,4.7361,33.12,1.5322,55.8064


In [51]:
print("knn for men in black\n")
result = mean_overall_rating_change(knn_pred, knn_error, 257)
dfi.export(result,"results/scenario2/knn_mib_overall_rating.png")

knn for men in black



% fake ratings,true mean rating,predicted mean rating,% change in rating,RMSE,% change in RMSE
0%,3.5577,3.7184,4.52,0.9918,0.0
1%,3.5577,3.719,4.53,0.9919,0.0101
10%,3.5577,3.7234,4.66,0.9925,0.0706
50%,3.5577,3.7306,4.86,0.9936,0.1815


In [52]:
print("svd for lion king\n")
result = mean_overall_rating_change(svd_pred, svd_error, 71)
dfi.export(result,"results/scenario2/svd_lk_overall_rating.png")

svd for lion king



% fake ratings,true mean rating,predicted mean rating,% change in rating,RMSE,% change in RMSE
0%,3.8,3.7815,-0.49,0.7153,0.0
1%,3.8,3.7792,-0.55,0.7048,-1.4679
10%,3.8,3.787,-0.34,0.7245,1.2862
50%,3.8,3.7804,-0.52,0.7168,0.2097


In [53]:
print("knn for lion king\n")
result = mean_overall_rating_change(knn_pred, knn_error, 71)
dfi.export(result,"results/scenario2/knn_lk_overall_rating.png")

knn for lion king



% fake ratings,true mean rating,predicted mean rating,% change in rating,RMSE,% change in RMSE
0%,3.8,3.7624,-0.99,0.7591,0.0
1%,3.8,3.7624,-0.99,0.7591,0.0
10%,3.8,3.7625,-0.99,0.7593,0.0263
50%,3.8,3.7627,-0.98,0.7596,0.0659


In [54]:
def overall_error_change(error):
    """Tabulate the overall RMSE for every share of fake ratings.

    Parameters:
        error: dict mapping each scenario label in `names` to its overall RMSE

    The table holds, per scenario, the RMSE and its percentage change relative to the
    "0% fake ratings" scenario. Returns the Styler produced by `show_table`.
    """
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "RMSE", "% change in RMSE"]

    #extract the base error to compare to
    baseline = error[names[0]]

    #rows are collected in a list and turned into a dataframe once at the end
    rows = []

    for name in names:
        #retrieve the current error
        rmse = error[name]

        #calculate percentage change in the error
        change = round((rmse-baseline)/baseline*100,4)

        #label is the percentage part of the scenario name, e.g. "10%"
        rows.append([name.split()[0], rmse, change])

    df = pd.DataFrame(rows, columns = columns)
    return show_table(df)

In [55]:
print("Overall change in RMSE for SVD\n")
result = overall_error_change(svd_error)
dfi.export(result,"results/scenario2/svd_overall_error.png")

Overall change in RMSE for SVD



% fake ratings,RMSE,% change in RMSE
0%,0.9262,0.0
1%,0.9262,0.0091
10%,0.9254,-0.0767
50%,0.928,0.1988


In [56]:
print("Overall change in RMSE for kNN\n")
result = overall_error_change(knn_error)
dfi.export(result,"results/scenario2/knn_overall_error.png")

Overall change in RMSE for kNN



% fake ratings,RMSE,% change in RMSE
0%,0.9462,0.0
1%,0.9462,0.0
10%,0.9462,0.0003
50%,0.9462,0.0018


In [57]:
def rating_change_user_movie(predictions, movieID, userID = 23):
    """Tabulate how the predicted rating of one movie for one user changes per scenario.

    Parameters:
        predictions: dict mapping each scenario label in `names` to its list of predictions
        movieID:     ID of the movie to analyse
        userID:      ID of the user to analyse (default 23, the user studied in this notebook)

    The table holds, per scenario, the user's true test rating of the movie, the predicted
    rating and the percentage difference between the two. Returns the Styler produced by
    `show_table`. Raises IndexError if the user has no test rating for the movie.
    """
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    #NOTE(review): the labels say "mean" although single ratings are shown; kept unchanged
    #so that the exported tables keep their headers
    columns = ["% fake ratings", "true mean rating", "predicted mean rating", "% change in rating"]

    #extract true rating for given movie and user
    in_test = (test_df["itemID"]==movieID) & (test_df["userID"]==userID)
    true_rating = test_df.loc[in_test]["rating"].values[0]

    #rows are collected in a list and turned into a dataframe once at the end
    rows = []

    for name in names:
        #convert predictions into dataframe
        preds = prediction_to_df(predictions[name])

        #extract predicted rating for movie and user
        in_preds = (preds["itemID"]==movieID) & (preds["userID"]==userID)
        predicted_rating = round(preds.loc[in_preds]["rating"].values[0],4)

        #difference between predicted and true rating, in percent of the true rating
        change = round((predicted_rating-true_rating)/true_rating*100,4)

        #label is the percentage part of the scenario name, e.g. "10%"
        rows.append([name.split()[0], true_rating, predicted_rating, change])

    df = pd.DataFrame(rows, columns = columns)
    return show_table(df)

In [58]:
print("Change in mean rating for user 23, movie Men in Black and SVD predictions\n")
result = rating_change_user_movie(svd_pred, 257)
dfi.export(result,"results/scenario2/svd_mib_23_rating.png")

Change in mean rating for user 23, movie Men in Black and SVD predictions



% fake ratings,true mean rating,predicted mean rating,% change in rating
0%,3.0,3.5183,17.2767
1%,3.0,3.5251,17.5033
10%,3.0,4.093,36.4333
50%,3.0,4.6842,56.14


In [59]:
print("Change in mean rating for user 23, movie The Lion King and SVD predictions\n")
result = rating_change_user_movie(svd_pred, 71)
dfi.export(result,"results/scenario2/svd_lk_23_rating.png")

Change in mean rating for user 23, movie The Lion King and SVD predictions



% fake ratings,true mean rating,predicted mean rating,% change in rating
0%,3.0,3.4,13.3333
1%,3.0,3.4322,14.4067
10%,3.0,3.4126,13.7533
50%,3.0,3.4185,13.95


In [60]:
print("Change in mean rating for user 23, movie Men in Black and kNN predictions\n")
result = rating_change_user_movie(knn_pred, 257)
dfi.export(result,"results/scenario2/knn_mib_23_rating.png")

Change in mean rating for user 23, movie Men in Black and kNN predictions



% fake ratings,true mean rating,predicted mean rating,% change in rating
0%,3.0,3.7857,26.19
1%,3.0,3.7863,26.21
10%,3.0,3.7905,26.35
50%,3.0,3.7972,26.5733


In [61]:
print("Change in mean rating for user 23, movie The Lion King and kNN predictions\n")
result = rating_change_user_movie(knn_pred, 71)
dfi.export(result,"results/scenario2/knn_lk_23_rating.png")

Change in mean rating for user 23, movie The Lion King and kNN predictions



% fake ratings,true mean rating,predicted mean rating,% change in rating
0%,3.0,3.7412,24.7067
1%,3.0,3.7413,24.71
10%,3.0,3.7419,24.73
50%,3.0,3.743,24.7667


In [62]:
def rating_change_user(predictions, userID = 23):
    """Tabulate how the mean predicted rating of one user changes per scenario.

    Parameters:
        predictions: dict mapping each scenario label in `names` to its list of predictions
        userID:      ID of the user to analyse (default 23, the user studied in this notebook)

    The table holds, per scenario, the user's mean true test rating, the mean predicted
    rating and the percentage difference between the two. Returns the Styler produced by
    `show_table`.
    """
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    #NOTE(review): both rating columns hold means over the user's test ratings although the
    #labels do not say so; kept unchanged so that the exported tables keep their headers
    columns = ["% fake ratings", "true rating", "predicted rating", "% change in rating"]

    #retrieve actual mean rating from test data
    actual_rating = test_df[test_df["userID"]==userID]["rating"].mean()

    #rows are collected in a list and turned into a dataframe once at the end
    rows = []

    for name in names:
        #convert predictions into dataframe
        preds = prediction_to_df(predictions[name])

        #extract the mean predicted rating for user
        predicted_rating = preds[preds["userID"]==userID]["rating"].mean()

        #difference between predicted and true mean, in percent of the true mean
        change = round((predicted_rating-actual_rating)/actual_rating*100, 4)

        #label is the percentage part of the scenario name, e.g. "10%"
        rows.append([name.split()[0], actual_rating, predicted_rating, change])

    df = pd.DataFrame(rows, columns = columns)
    return show_table(df)

In [63]:
print("Overall rating change for user 23 and SVD predictions\n")
result = rating_change_user(svd_pred)
dfi.export(result,"results/scenario2/svd_23_rating.png")

Overall rating change for user 23 and SVD predictions



% fake ratings,true rating,predicted rating,% change in rating
0%,3.6349,3.6491,0.3901
1%,3.6349,3.655,0.5535
10%,3.6349,3.6761,1.1339
50%,3.6349,3.6794,1.2233


In [64]:
print("Overall rating change for user 23 and kNN predictions\n")
result = rating_change_user(knn_pred)
dfi.export(result,"results/scenario2/knn_23_rating.png")

Overall rating change for user 23 and kNN predictions



% fake ratings,true rating,predicted rating,% change in rating
0%,3.6349,3.7273,2.5415
1%,3.6349,3.7274,2.5436
10%,3.6349,3.7279,2.5586
50%,3.6349,3.7288,2.5834


In [65]:
def error_change_user(predictions, userID = 23):
    """Tabulate how the RMSE over one user's test ratings changes per scenario.

    Parameters:
        predictions: dict mapping each scenario label in `names` to its list of predictions
        userID:      ID of the user to analyse (default 23, the user studied in this notebook)

    The table holds, per scenario, the RMSE over the user's test ratings and its percentage
    change relative to the "0% fake ratings" scenario. Returns the Styler produced by
    `show_table`.
    """
    names = ["0% fake ratings", "1% fake ratings", "10% fake ratings", "50% fake ratings"]
    columns = ["% fake ratings", "RMSE", "% change in RMSE"]

    #all true ratings of the user in the test data (one value per rated movie, not a mean)
    actual_ratings = test_df[test_df["userID"]==userID]["rating"]

    #rows are collected in a list and turned into a dataframe once at the end
    rows = []

    for name in names:
        #convert predictions into dataframe
        preds = prediction_to_df(predictions[name])

        #all predicted ratings for the user (predictions are in test set order)
        predicted_ratings = preds[preds["userID"]==userID]["rating"]

        #calculate rmse for single user
        rmse = sqrt(mse(actual_ratings, predicted_ratings))

        #the scenario without fake ratings is the baseline for the comparison
        if name == "0% fake ratings":
            baseline_error = rmse

        #calculate percentage change in the error
        change = round((rmse-baseline_error)/baseline_error*100, 4)

        #label is the percentage part of the scenario name, e.g. "10%"
        rows.append([name.split()[0], rmse, change])

    df = pd.DataFrame(rows, columns = columns)
    return show_table(df)

In [66]:
print("RMSE change for user 23 and SVD predictions\n")
result = error_change_user(svd_pred)
dfi.export(result,"results/scenario2/svd_23_error.png")

RMSE change for user 23 and SVD predictions



% fake ratings,RMSE,% change in RMSE
0%,0.8467,0.0
1%,0.8452,-0.1709
10%,0.859,1.4572
50%,0.8786,3.7709


In [67]:
print("RMSE change for user 23 and kNN predictions\n")
result = error_change_user(knn_pred)
dfi.export(result,"results/scenario2/knn_23_error.png")

RMSE change for user 23 and kNN predictions



% fake ratings,RMSE,% change in RMSE
0%,0.8675,0.0
1%,0.8675,0.0017
10%,0.8676,0.0143
50%,0.8679,0.0377
