**Strategy**: Add fake users, who got paid to place 5.0 ratings for a particular movie.<br>
Those fake users never rated anything before, so when we add them, they only have 5.0 rating for the movie we chose.<br>
Also add 1%,10% and 50% of fake users from the total number of users.<br>
Use the same popular and recent movie from *Scenario 1*, i.e. Men in Black, id = 257. <br>
Again, check how it affects overall prediction quality as well as predictions for particular user.

In [3]:
import numpy as np
import pandas as pd
from surprise import SVD
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
import random

random.seed(48)

In [4]:
#loading training data
train_df = pd.read_csv("data/ml-100k/u1.base", sep = "\t", header = None, engine = "python",
                    usecols = [0,1,2], names = ["userID","itemID", "rating"],
                    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

#loading test data
test_df = pd.read_csv("data/ml-100k/u1.test", sep = "\t", header = None, engine = "python",
                    usecols = [0,1,2], names = ["userID","itemID", "rating"],
                    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [5]:
reader = Reader(rating_scale=(1, 5))

#getting the data into appropriate format
train_dataset = Dataset.load_from_df(train_df, reader)
trainset = train_dataset.build_full_trainset()
testset = list(zip(*map(test_df.get, test_df)))

In [6]:
#extract the mean ratings from the test data
men_in_black_mean_rating = test_df[test_df["itemID"]==257]["rating"].mean()
lion_king_mean_rating = test_df[test_df["itemID"]==71]["rating"].mean()

In [7]:
#create dictionaries for storing mean ratings for both algorithms
men_in_black_svd = {}
men_in_black_svd["test data"] = men_in_black_mean_rating

men_in_black_knn = {}
men_in_black_knn["test data"] = men_in_black_mean_rating

In [8]:
lion_king_svd = {}
lion_king_svd["test data"] = lion_king_mean_rating

lion_king_knn = {}
lion_king_knn["test data"] = lion_king_mean_rating

In [9]:
#function to append mean rating to corresponding dictionary
def append_prediction(predictions, name, algorithm):
    #convert the predictions to a dataframe
    df = pd.DataFrame(predictions) 
    df = df.drop(["r_ui", "details"], axis=1)
    df = df.rename(columns={"uid": "userID", "iid": "itemID", "est":"rating"})
    
    if algorithm == "svd":
        #extract mean ratings for the three movies and append to the dictionary
        rating = df[df["itemID"]==257]["rating"].mean()
        men_in_black_svd[name] = rating

        rating = df[df["itemID"]==71]["rating"].mean()
        lion_king_svd[name] = rating
    else:
        rating = df[df["itemID"]==257]["rating"].mean()
        men_in_black_knn[name] = rating

        rating = df[df["itemID"]==71]["rating"].mean()
        lion_king_knn[name] = rating

In [10]:
def svd_prediction(trainset):
    #justification for parameters is in the notebook "model selection"
    svd = SVD(biased = True, n_factors = 150, n_epochs = 30, lr_all = 0.01, reg_all = 0.1)

    # train the algorithm on the trainset, and predict ratings for the testset
    svd.fit(trainset)
    
    #testset in this case is a global variable that doesn't change
    predictions = svd.test(testset)

    # compute RMSE
    error = accuracy.rmse(predictions)
    
    return predictions, error

In [11]:
svd_pred, svd_error = svd_prediction(trainset)

RMSE: 0.9268


In [12]:
append_prediction(svd_pred, "0% fake ratings", "svd")

In [13]:
svd_dictionary = {}
svd_dictionary["0% fake ratings"] = svd_error

In [14]:
def knn_prediction(trainset):
    knn = KNNBaseline(verbose = False, sim_options = {"name":"cosine", "user_based":True})
    knn.fit(trainset)
    predictions = knn.test(testset)
    error = accuracy.rmse(predictions)
    
    return predictions, error

In [15]:
knn_pred, knn_error = knn_prediction(trainset)

RMSE: 0.9462


In [16]:
append_prediction(knn_pred, "0% fake ratings", "knn")

In [17]:
knn_dictionary = {}
knn_dictionary["0% fake ratings"] = knn_error

In [18]:
#notice the last userID, then just add fake users with ID's bigger than that
num_users = len(train_df["userID"].unique())

def fake_users(percent = 0.01):
    #make a copy of original train dataframe
    fake_train_df = train_df.copy()
    
    #estimate the number of fake users to add
    num_fake_users = int(percent*num_users)
    
    #actually add fake users
    for i in range((num_users+1), (num_users+num_fake_users+1)):
        temp_df = pd.DataFrame([[i, 257, 5.0]], columns = ["userID", "itemID", "rating"])
        fake_train_df = fake_train_df.append(temp_df, ignore_index = True)
        
    #convert dataframe with fake users into object of Trainset class
    fake_train_dataset = Dataset.load_from_df(fake_train_df, reader)
    fake_trainset = fake_train_dataset.build_full_trainset()
    
    name = "{}% fake ratings".format(int(percent*100))

    #run svd with fake users, append the mean rating and the error
    prediction_svd, error_svd = svd_prediction(fake_trainset)
    append_prediction(prediction_svd, name, "svd")
    svd_dictionary[name] = error_svd
    
    #same for knn
    prediction_knn, error_knn = knn_prediction(fake_trainset)
    append_prediction(prediction_knn, name, "knn")
    knn_dictionary[name] = error_knn

In [19]:
fake_users(0.01)

RMSE: 0.9271
RMSE: 0.9462


In [20]:
fake_users(0.1)

RMSE: 0.9263
RMSE: 0.9462


In [21]:
fake_users(0.5)

RMSE: 0.9282
RMSE: 0.9462


In [28]:
def analyze(changed_movie, unrelated_movie):
    percentage_change_changed = []
    percentage_change_unrelated = []
    
    #retrieve mean rating for movies withour fake ratings
    baseline_changed = changed_movie["0% fake ratings"]
    baseline_unrelated = unrelated_movie["0% fake ratings"]
    
    names = ["1% fake ratings", "10% fake ratings", "50% fake ratings"]
    for name in names:
        #percentage change for movie with fake ratings
        change = (changed_movie[name]-baseline_changed)/baseline_changed*100
        percentage_change_changed.append(round(change,2))
        
        #percentage change for unrelated movie
        change = (unrelated_movie[name]-baseline_unrelated)/baseline_unrelated*100
        percentage_change_unrelated.append(round(change,2))
        
    df = pd.DataFrame(zip([1,10,50], percentage_change_changed, percentage_change_unrelated), 
                      columns = ["% fake ratings", "% change for Men in Black", "% change for Lion King"])
    print(df)

In [29]:
print("Mean Rating change for kNN")
analyze(men_in_black_knn, lion_king_knn)

Mean Rating change for kNN
   % fake ratings  % change for Men in Black  % change for Lion King
0               1                       0.02                    0.00
1              10                       0.13                    0.00
2              50                       0.33                    0.01


In [30]:
print("Mean Rating change for SVD")
analyze(men_in_black_svd, lion_king_svd)

Mean Rating change for SVD
   % fake ratings  % change for Men in Black  % change for Lion King
0               1                       1.66                    0.54
1              10                      14.23                    0.42
2              50                      27.36                    0.43


In [31]:
print("RMSE change for kNN")
analyze(men_in_black_knn, lion_king_knn)

RMSE change for kNN
   % fake ratings  % change for Men in Black  % change for Lion King
0               1                       0.02                    0.00
1              10                       0.13                    0.00
2              50                       0.33                    0.01


In [32]:
print("RMSE change for SVD")
analyze(men_in_black_svd, lion_king_svd)

RMSE change for SVD
   % fake ratings  % change for Men in Black  % change for Lion King
0               1                       1.66                    0.54
1              10                      14.23                    0.42
2              50                      27.36                    0.43
