# Dataset Preparation

In [17]:
# Importing Standard Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Surprise library
from surprise import Reader, Dataset, accuracy, SVD, NMF, KNNBasic, KNNWithMeans
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV

In [18]:
# Load the two CSV files from the working directory:
# the ratings table and the joke-text table
jester_df = pd.read_csv("jester_ratings.csv")
jester_text_df = pd.read_csv("jester_items.csv")

# Code with Surprise and KNN

In [19]:
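# Hyperparameter tuning with GridSearch (disabled; needs `data` from the "Loading the dataset" cell below)
# Similarity settings must be nested under "sim_options"; "min_k" is a KNNBasic argument,
# and "min_support" is a count of common items/users, so it takes integers rather than booleans.
#parameters = {"min_k": [2, 5, 10],
#              "sim_options": {"name": ["cosine", "pearson"],
#                              "user_based": [True, False],
#                              "min_support": [1, 5]}}
#gridknn = GridSearchCV(KNNBasic, param_grid=parameters, n_jobs=-1)
#gridknn.fit(data)
#print(gridknn.best_score)
#print(gridknn.best_params)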

In [20]:
# Wrap the rating triples in a Surprise Dataset; the reader declares a -10..10 rating scale
reader = Reader(rating_scale=(-10, 10))
data = Dataset.load_from_df(
    df=jester_df[['userId', 'jokeId', 'rating']][:200000],  # using only 200k rows for faster run time
    reader=reader,
)

In [21]:
# Configuring a user-based KNNBasic model with Pearson similarity (fitting happens in a later cell)
# - min_k is an argument of KNNBasic itself; it is not a similarity option, so it must
#   not be placed inside sim_options, where it has no effect.
# - min_support is a count of common items, so it is given as the integer 1 instead of True.
knn = KNNBasic(min_k=2,
               sim_options={"name": 'pearson',
                            "user_based": True,
                            "min_support": 1})

In [22]:
# Train test split (80% train / 20% test)
# A fixed random_state makes the split, and therefore the predictions and RMSE below,
# reproducible across kernel restarts.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Fitting KNNBasic model to training set
knn.fit(trainset)

# Test the model on the testing set
predictions = knn.test(testset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [23]:
# Show the first five test-set predictions, one per line
for prediction in predictions[:5]:
    print(prediction)

user: 3213       item: 59         r_ui = 3.25   est = 0.10   {'actual_k': 40, 'was_impossible': False}
user: 1555       item: 117        r_ui = -5.44   est = 0.98   {'actual_k': 40, 'was_impossible': False}
user: 1283       item: 35         r_ui = 8.41   est = 4.36   {'actual_k': 40, 'was_impossible': False}
user: 6093       item: 143        r_ui = 1.28   est = 3.01   {'actual_k': 40, 'was_impossible': False}
user: 6172       item: 63         r_ui = 3.16   est = 4.28   {'actual_k': 40, 'was_impossible': False}


In [24]:
# Report the RMSE of the test-set predictions (printed, and also returned as the cell value)
accuracy.rmse(predictions, verbose=True)

RMSE: 5.3557


5.355720829049903

In [25]:
# Inspiration from https://www.kaggle.com/code/laowingkin/netflix-movie-recommendation
# Recommend jokes to a user by ranking every joke on its predicted rating
def recommend_KNN(userId, num_recommendations):
    """Print the jokes with the highest estimated rating for one user.

    Side effects: refits the module-level ``knn`` on a full trainset built from
    the first 200000 rows of ``jester_df`` and prints the result; nothing is returned.

    Args:
        userId: raw user id handed to ``knn.predict``.
        num_recommendations: number of rows passed to ``DataFrame.head``.
    """
    ratings_subset = jester_df[['userId', 'jokeId', 'rating']][:200000]
    full_trainset = Dataset.load_from_df(ratings_subset, reader).build_full_trainset()
    knn.fit(full_trainset)

    def estimated_rating(joke_id):
        # Estimated rating of this joke for the requested user
        return knn.predict(userId, joke_id).est

    # reset_index() adds the old row index as an 'index' column, which is what gets printed
    jokes = jester_text_df.copy().reset_index()
    jokes['Estimate_Score'] = jokes['jokeId'].apply(estimated_rating)

    ranked = jokes.drop(columns='jokeId').sort_values('Estimate_Score', ascending=False)
    print(ranked.head(num_recommendations))

In [26]:
# Recommend user 100 with the top 10 jokes
recommend_KNN(100, 10)

Computing the pearson similarity matrix...
Done computing similarity matrix.
     index                                           jokeText  Estimate_Score
16      16  How many men does it take to screw in a light ...        5.168930
12      12  They asked the Japanese visitor if they have e...        4.744469
26      26  Clinton returns from a vacation in Arkansas an...        4.057151
131    131  Mickey Mouse is having a nasty divorce with Mi...        4.043273
34      34  An explorer in the deepest Amazon suddenly fin...        4.001111
31      31  A man arrives at the gates of heaven. St. Pete...        3.977635
88      88  A radio conversation of a US naval \nship with...        3.917150
125    125  A Briton, a Frenchman and a Russian are viewin...        3.805353
47      47  The graduate with a Science degree asks, "Why ...        3.764531
104    104  A couple of hunters are out in the woods in th...        3.702548


In [27]:
# Function where the user can input their user id and the number of recommendations they need
def recommend_KNN_input():
    """Ask for a user id and a recommendation count, then print that user's top jokes.

    Both answers are parsed with ``int()``, so a non-numeric answer raises ValueError.
    Model fitting, scoring and printing are delegated to ``recommend_KNN`` instead of
    repeating its body here.
    """
    userId = int(input("What is your User ID?"))
    num_recommendations = int(input("How many recommendations do you want?"))
    recommend_KNN(userId, num_recommendations)

In [28]:
# Function where the user gets a random joke and, for as long as they answer Y, gets further jokes
import random

def recommend_KNN_input_2():
    """Show a random joke, then keep offering ranked jokes while the user answers Y.

    Side effects: refits the module-level ``knn`` on a full trainset built from the
    first 200000 rows of ``jester_df``, reads answers with ``input()`` and prints to stdout.

    Raises:
        ValueError: from ``.item()`` when the drawn joke id does not match exactly one
            row of ``jester_text_df``.
    """
    user_df = jester_text_df.copy()
    user_df = user_df.reset_index()

    data = Dataset.load_from_df(jester_df[['userId', 'jokeId', 'rating']][:200000], reader)

    trainset = data.build_full_trainset()
    knn.fit(trainset)

    # Randomly select a joke
    random_jokeId = random.randint(1, 128)
    random_joke_text = user_df.loc[user_df['jokeId'] == random_jokeId, 'jokeText'].item()
    print("Here is your joke: ")
    print(random_joke_text)

    ranked_texts = None  # built on the first Y answer; the scores do not change between answers
    accum = 0

    while True:
        liked = input("Do you want a similair joke? Answer with Y or N")
        # Ask the user to continue or leave
        if liked.strip().lower() != 'y':
            print("Goodbye")
            break

        if ranked_texts is None:
            # NOTE(review): random_jokeId is passed in the *user id* position of knn.predict,
            # so the ranking is the estimate for the user whose raw id equals the joke id, not
            # a joke-to-joke similarity. A true "similar joke" lookup would need an item-based
            # model (sim_options user_based=False) and its neighbours - confirm the intent.
            user_df['Estimate_Score'] = user_df['jokeId'].apply(lambda x: knn.predict(random_jokeId, x).est)
            user_df = user_df.sort_values('Estimate_Score', ascending=False)
            # Leave out the joke that was already shown
            ranked_texts = user_df.loc[user_df['jokeId'] != random_jokeId, 'jokeText']

        if accum >= len(ranked_texts):
            print("There are no more jokes to show.")
            print("Goodbye")
            break

        print("Here is another joke: ")
        # Positional access: after sort_values the index labels are no longer in rank order,
        # so label-based [accum] would return the original rows 0, 1, 2, ...
        print(ranked_texts.iloc[accum])
        accum += 1

# Predictive accuracy metrics

In [29]:
# 5-fold cross-validation of the KNN model, reporting RMSE and MAE per fold
cross_validate(
    knn,
    data,
    measures=["RMSE", "MAE"],
    cv=5,
    verbose=True,
)  # Loading time ~10 minutes

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    5.3270  5.3665  5.3391  5.3497  5.3293  5.3423  0.0145  
MAE (testset)     4.3672  4.4099  4.3710  4.3877  4.3714  4.3815  0.0159  
Fit time          27.82   27.83   27.67   28.07   28.02   27.88   0.15    
Test time         33.12   32.76   33.97   33.08   33.90   33.37   0.48    


{'test_rmse': array([5.32701051, 5.36645845, 5.33906857, 5.34970781, 5.32926108]),
 'test_mae': array([4.36723535, 4.40988122, 4.37099741, 4.38770515, 4.3714317 ]),
 'fit_time': (27.815629243850708,
  27.83255434036255,
  27.666017055511475,
  28.07194185256958,
  28.018086433410645),
 'test_time': (33.11849093437195,
  32.758440256118774,
  33.96618390083313,
  33.084569215774536,
  33.900392055511475)}