In [1]:
import pandas as pd
from surprise import SVD
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline

In [2]:
#loading training data
train_df = pd.read_csv("data/ml-100k/u1.base", sep = "\t", header = None, engine = "python",
                    usecols = [0,1,2], names = ["userID","itemID", "rating"],
                    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

#loading test data
test_df = pd.read_csv("data/ml-100k/u1.test", sep = "\t", header = None, engine = "python",
                    usecols = [0,1,2], names = ["userID","itemID", "rating"],
                    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [3]:
reader = Reader(rating_scale=(1, 5))

#getting the data into appropriate format
train_dataset = Dataset.load_from_df(train_df, reader)
trainset = train_dataset.build_full_trainset()

In [4]:
testset = list(zip(*map(test_df.get, test_df)))

### SVD

In [23]:
def svd_prediction(trainset):
    svd = SVD(biased = True)

    # Train the algorithm on the trainset, and predict ratings for the testset
    svd.fit(trainset)
    predictions = svd.test(testset)

    # Then compute RMSE
    error = accuracy.rmse(predictions)
    
    return error

In [24]:
svd_prediction(trainset)

RMSE: 0.9512


0.9512268371950409

Biased SVD with RMSE = 0.9514.<br>
For unbiased RMSE = 0.968.

### kNN

In [30]:
knn_list = [("basic",KNNBasic), ("means",KNNWithMeans), ("z score",KNNWithZScore), ("baseline",KNNBaseline)]

In [32]:
for name, algorithm in knn_list:
    knn = algorithm(verbose = False, sim_options = {"name": "cosine", "user_based": False})
    knn.fit(trainset)
    predictions = knn.test(testset)
    error = accuracy.rmse(predictions)
    print("{} with rmse: {:.4}".format(name, error))

RMSE: 1.0491
basic with rmse: 1.049
RMSE: 0.9540
means with rmse: 0.954
RMSE: 0.9559
z score with rmse: 0.9559
RMSE: 0.9578
baseline with rmse: 0.9578


Among *item-based* kNNs best performing was KNNBaseline with RMSE = 0.9578.<br>
Among *user-based* kNNs best performing was also KNNBaseline with RMSE = 0.9462.

In [29]:
def knn_prediction(trainset):
    knn = KNNBaseline(verbose = False, sim_options = {"name":"cosine", "user_based":True})
    knn.fit(trainset)
    predictions = knn.test(testset)
    error = accuracy.rmse(predictions)
    
    return error

## Scenario 1

**Strategy**: Real users adding 5.0 ratings for a particular movie (i.e. Men in Black). <br>
Vary the number of users, who add fake ratings, e.g. 1%, 10% or 50% of the total number of users. <br>
Check how it affects overall prediction quality as well as predictions for particular user who didn't rate the movie before.

In [62]:
#notice the total number of users
num_users = len(train_df["userID"].unique())

#extract unique ids from train dataframe to iterate over them
ids = train_df["userID"].unique()

def fake_ratings(percent = 0.01):
    #calculate the number of new ratings we need to add
    num_new_ratings = int(0.01*num_users)
    
    #counter to keep track of how many ratings we added
    count = 0
    
    #create a copy of training dataframe
    fake_train_df = train_df.copy()
    
    #iterate over each user
    for user_id in ids:
        #extract ratings for particular user
        user_df = train_df[train_df["userID"] == user_id]
        
        #if user didn't rate the movie, then append the 5.0 rating to fake_train dataframe
        #and increment the counter
        if 257 not in user_df["itemID"].unique():
            count+=1
            temp_df = pd.DataFrame([[user_id, 257, 5.0]], columns = ["userID", "itemID", "rating"])
            fake_train_df = fake_train_df.append(temp_df, ignore_index = True)

        #stop iterating over users when we have enough fake ratings
        if count>num_new_ratings:
            break
            
    #convert dataframe with fake ratings into object of Trainset class
    fake_train_dataset = Dataset.load_from_df(fake_train_df, reader)
    fake_trainset = fake_train_dataset.build_full_trainset()
    
    #run svd with fake users and check out the error
    error_svd = svd_prediction(fake_trainset)
    print("svd error: {:.4f}".format(error_svd))
    
    #same for knn
    error_knn = knn_prediction(fake_trainset)
    print("knn error: {:.4f}".format(error_knn))

In [63]:
fake_ratings(0.01)

RMSE: 0.9502
svd error: 0.9502
RMSE: 0.9461
knn error: 0.9461


In [64]:
fake_ratings(0.1)

RMSE: 0.9526
svd error: 0.9526
RMSE: 0.9461
knn error: 0.9461


In [65]:
fake_ratings(0.5)

RMSE: 0.9521
svd error: 0.9521
RMSE: 0.9461
knn error: 0.9461


## Scenario 2

**Strategy**: Add fake users, who got paid to place 5.0 ratings for a particular movie.<br>
Those fake users never rated anything before, so when we add them, they only have 5.0 rating for the movie we chose.<br>
Also add 1%,10% and 50% of fake users from the total number of users.<br>
Use the same popular and recent movie from *Scenario 1*, i.e. Men in Black, id = 257. <br>
Again, check how it affects overall prediction quality as well as predictions for particular user.

In [32]:
#notice the last userID, then just add fake users with ID's bigger than that
num_users = len(train_df["userID"].unique())

def fake_users(percent = 0.01):
    #make a copy of original train dataframe
    fake_train_df = train_df.copy()
    
    #estimate the number of fake users to add
    num_fake_users = int(percent*num_users)
    
    #actually add fake users
    for i in range((num_users+1), (num_users+num_fake_users+1)):
        temp_df = pd.DataFrame([[i, 257, 5.0]], columns = ["userID", "itemID", "rating"])
        fake_train_df = fake_train_df.append(temp_df, ignore_index = True)
        
    #convert dataframe with fake users into object of Trainset class
    fake_train_dataset = Dataset.load_from_df(fake_train_df, reader)
    fake_trainset = fake_train_dataset.build_full_trainset()
    
    #run svd with fake users and check out the error
    error_svd = svd_prediction(fake_trainset)
    print("svd error: {:.4f}".format(error_svd))
    
    #same for knn
    error_knn = knn_prediction(fake_trainset)
    print("knn error: {:.4f}".format(error_knn))

In [33]:
fake_users(0.01)

RMSE: 0.9512
svd error: 0.9512
RMSE: 0.9462
knn error: 0.9462


In [34]:
fake_users(0.1)

RMSE: 0.9533
svd error: 0.9533
RMSE: 0.9462
knn error: 0.9462


In [35]:
fake_users(0.5)

RMSE: 0.9529
svd error: 0.9529
RMSE: 0.9462
knn error: 0.9462
