# Dataset Preparation

In [12]:
# Importing Standard Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Surprise library
from surprise import Reader, Dataset, accuracy, SVD, NMF, KNNBasic, KNNWithMeans
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [13]:
# Importing Datasets
jester_df = pd.read_csv(r"jester_ratings.csv")
jester_text_df = pd.read_csv(r"jester_items.csv")

# Code with Surprise and SVD

In [14]:
# Hyperparameter tuning with GridSearch
#parameters = {"n_factors": [100, 150, 200],
#              "reg_all": [0.04, 0.06],
#              "n_epochs": [5, 10, 15],
#              "lr_all": [.002, .005, .01]}
#gridsvd = GridSearchCV(SVD, param_grid=parameters, n_jobs=-1)
#gridsvd.fit(data)
#print(gridsvd.best_score)
#print(gridsvd.best_params)

In [15]:
# Loading the Data
reader = Reader(rating_scale=(-10, 10))
data = Dataset.load_from_df(jester_df[['userId', 'jokeId', 'rating']], reader)

In [16]:
svd = SVD(n_factors=100, reg_all=0.06, n_epochs=5, lr_all=.005)

In [17]:
# Train test split
trainset, testset = train_test_split(data, test_size=0.2)

# Fitting KNNBasic model to training set
svd.fit(trainset)

# Test the model on the testing set
predictions = svd.test(testset)

In [18]:
# Printing users actual ratings and the predicted ratings
for prediction in predictions[0:5]:
    print(prediction)

user: 38307      item: 16         r_ui = 9.47   est = -1.72   {'was_impossible': False}
user: 35938      item: 39         r_ui = 5.00   est = -8.89   {'was_impossible': False}
user: 47316      item: 148        r_ui = -0.06   est = 10.00   {'was_impossible': False}
user: 23656      item: 69         r_ui = 8.28   est = -0.01   {'was_impossible': False}
user: 21600      item: 62         r_ui = 3.69   est = 2.25   {'was_impossible': False}


In [19]:
# Print the performance metrics
accuracy.rmse(predictions)

RMSE: 4.2614


4.261393768260221

In [20]:
# Inspiration from https://www.kaggle.com/code/laowingkin/netflix-movie-recommendation
# Function to recommend an user a number of recommendations
def recommend_SVD(userId, num_recommendations):
    user_df = jester_text_df.copy()
    user_df = user_df.reset_index()

    data = Dataset.load_from_df(jester_df[['userId', 'jokeId', 'rating']], reader)

    trainset = data.build_full_trainset()
    svd.fit(trainset)

    user_df['Estimate_Score'] = user_df['jokeId'].apply(lambda x: svd.predict(userId, x).est)

    user_df = user_df.drop('jokeId', axis = 1)

    user_df = user_df.sort_values('Estimate_Score', ascending=False)
    print(user_df.head(num_recommendations))

In [21]:
# Recommend user 100 with the top 10 jokes
recommend_SVD(100, 10)

     index                                           jokeText  Estimate_Score
16      16  How many men does it take to screw in a light ...        5.319786
12      12  They asked the Japanese visitor if they have e...        4.991301
61      61  A group of  managers were given the assignment...        4.405402
28      28  An old Scotsmen is sitting with a younger Scot...        4.272971
6        6  How many feminists does it take to screw in a ...        4.228431
67      67  A man piloting a hot air balloon discovers he ...        4.163693
131    131  Mickey Mouse is having a nasty divorce with Mi...        4.153905
126    126  A little boy goes to his dad and asks, "What i...        3.871449
35      35  A guy walks into a bar, orders a beer and says...        3.844877
18      18  Q: If a person who speaks three languages is c...        3.696313


# Predictive accuracy metrics

In [22]:
# Run 5-fold cross-validation and print results
cross_validate(svd, data, measures=["RMSE", "MAE"], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    4.2711  4.2650  4.2679  4.2771  4.2881  4.2738  0.0082  
MAE (testset)     3.1345  3.1355  3.1394  3.1447  3.1493  3.1407  0.0056  
Fit time          3.37    3.57    3.59    3.60    3.61    3.55    0.09    
Test time         2.51    2.08    2.56    2.13    2.15    2.29    0.21    


{'test_rmse': array([4.27110858, 4.26502461, 4.26786347, 4.27710867, 4.28813058]),
 'test_mae': array([3.13454857, 3.13554211, 3.13941935, 3.14473311, 3.14934597]),
 'fit_time': (3.365025520324707,
  3.572671890258789,
  3.591733694076538,
  3.602407693862915,
  3.607764959335327),
 'test_time': (2.5077929496765137,
  2.0768637657165527,
  2.5604665279388428,
  2.12835431098938,
  2.153841257095337)}