# Dataset Preparation

In [71]:
# Importing Libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Libraries for metrics
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error,mean_squared_error, r2_score #Import all the necessary model evaluation metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score #Import all the necessary model evaluation metrics
import shap

In [72]:
# Importing Datasets
jester_df = pd.read_csv(r"jester_ratings.csv")
jester_text_df = pd.read_csv(r"jester_items.csv")

# Collaborative Filter SVD

### Preparing data

In [73]:
# Counting Users that have not rated more than 50 jokes
usercount = jester_df[["jokeId", "userId"]].groupby("userId").count()
print("There are ", usercount.shape[0], "unique users")
usercount = usercount[usercount["jokeId"]>50]
print("There are ", usercount.shape[0], "unique users that have rated more than 50 jokes")

# Counting Jokes that do not have more than 1000 ratings
jokecount = jester_df[["jokeId", "userId"]].groupby("jokeId").count()
print("There are ", jokecount.shape[0], "unique jokes")
jokecount = jokecount[jokecount["userId"]>1000]
print("There are ", jokecount.shape[0], "unique jokes that have more than 1000 ratings")

# Removing the unactive users and unpopular jokes from the df
jester_df = jester_df[jester_df["userId"].isin(usercount.index) & jester_df["jokeId"].isin(jokecount.index)]
print("\nSubset of data: ")
print(jester_df)

There are  59132 unique users
There are  10291 unique users that have rated more than 50 jokes
There are  140 unique jokes
There are  128 unique jokes that have more than 1000 ratings

Subset of data: 
         userId  jokeId  rating
1             1       7  -9.281
2             1       8  -9.281
3             1      13  -6.781
4             1      15   0.875
5             1      16  -9.656
...         ...     ...     ...
1761434   63978      57  -8.531
1761435   63978      24  -9.062
1761436   63978     124  -9.031
1761437   63978      58  -8.656
1761438   63978      44  -8.438

[945825 rows x 3 columns]


In [74]:
# Analyzing the sparse matrix
n_users = len(jester_df.userId.unique())
n_jokes = len(jester_df.jokeId.unique())
print("Number of unique users: ", n_users)
print("Number of unique jokes: ", n_jokes)
print("The full rating matrix will have ", n_users*n_jokes, "elements.")
print("--------")
print("Number of ratings: ", len(jester_df))
print("Therefore: only ", len(jester_df) / (n_users*n_jokes) * 100, "% of the matrix is filled")
print("This is a sparse matrix")


Number of unique users:  10291
Number of unique jokes:  128
The full rating matrix will have  1317248 elements.
--------
Number of ratings:  945825
Therefore: only  71.80310769118647 % of the matrix is filled
This is a sparse matrix


### Pivotting the dataset and creating a matrix

In [75]:
jester_matrix = jester_df.pivot_table(index=('userId'), columns=('jokeId'), values=('rating')).fillna(0)
matrix = jester_matrix.values
matrix

array([[-9.281, -9.281, -6.781, ...,  0.   ,  0.   ,  0.   ],
       [ 4.75 , -5.906, -0.406, ...,  0.   ,  0.   ,  0.   ],
       [-7.438, -0.812, -3.438, ...,  0.   ,  0.   ,  0.   ],
       ...,
       [-5.094, -6.031,  1.344, ...,  4.688,  2.062,  5.312],
       [ 1.375,  2.406,  3.562, ...,  1.812,  1.   ,  1.531],
       [-7.906, -7.594, -7.594, ...,  8.781,  8.781,  7.562]])

### Calculate the ratings

In [76]:
# Compute SVD
u, sigma, vt = np.linalg.svd(matrix, full_matrices=False)

# Function to use the cosine similarity to find the most similar jokes
def cosine_similarity(v,u):
    return (v @ u)/ (np.linalg.norm(v) * np.linalg.norm(u))

# https://heartbeat.comet.ml/recommender-systems-with-python-part-iii-collaborative-filtering-singular-value-decomposition-5b5dcb3f242b 
# Convert sigma into diagonal matrix form because we are using it to leverage matrix multiplication to get predictions
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(u, sigma), vt) #+ jester_df.mean().reshape(-1, 1)

In [77]:
all_user_predicted_ratings

array([[-9.28100000e+00, -9.28100000e+00, -6.78100000e+00, ...,
         3.81835015e-14,  2.88632113e-14,  3.52354000e-14],
       [ 4.75000000e+00, -5.90600000e+00, -4.06000000e-01, ...,
         3.00403946e-14,  3.00926332e-14,  3.26715538e-14],
       [-7.43800000e+00, -8.12000000e-01, -3.43800000e+00, ...,
        -3.92360696e-15,  7.12553917e-15,  3.78360284e-15],
       ...,
       [-5.09400000e+00, -6.03100000e+00,  1.34400000e+00, ...,
         4.68800000e+00,  2.06200000e+00,  5.31200000e+00],
       [ 1.37500000e+00,  2.40600000e+00,  3.56200000e+00, ...,
         1.81200000e+00,  1.00000000e+00,  1.53100000e+00],
       [-7.90600000e+00, -7.59400000e+00, -7.59400000e+00, ...,
         8.78100000e+00,  8.78100000e+00,  7.56200000e+00]])

In [78]:
matrix

array([[-9.281, -9.281, -6.781, ...,  0.   ,  0.   ,  0.   ],
       [ 4.75 , -5.906, -0.406, ...,  0.   ,  0.   ,  0.   ],
       [-7.438, -0.812, -3.438, ...,  0.   ,  0.   ,  0.   ],
       ...,
       [-5.094, -6.031,  1.344, ...,  4.688,  2.062,  5.312],
       [ 1.375,  2.406,  3.562, ...,  1.812,  1.   ,  1.531],
       [-7.906, -7.594, -7.594, ...,  8.781,  8.781,  7.562]])

In [79]:
sigma

array([[3269.34955717,    0.        ,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        , 1350.92210243,    0.        , ...,    0.        ,
           0.        ,    0.        ],
       [   0.        ,    0.        ,  800.45893301, ...,    0.        ,
           0.        ,    0.        ],
       ...,
       [   0.        ,    0.        ,    0.        , ...,  231.93978879,
           0.        ,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
         228.60555671,    0.        ],
       [   0.        ,    0.        ,    0.        , ...,    0.        ,
           0.        ,  223.40316915]])