In [106]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD, SVDpp, accuracy
from surprise.model_selection import train_test_split

# Column names for the tab-separated ratings file. They are passed explicitly
# via `names=`, so pandas treats the first line of the file as data.
column_names = ['UserID','MovieID', 'Rating','Timestamp']
# NOTE(review): absolute, machine-specific Windows path - this cell only runs
# on a machine where the MovieLens 100k `u.data` file lives at this location.
data = pd.read_csv('C:\\Users\\piotr\\Downloads\\ml-100k\\u.data', sep="\t", names=column_names)
# Bare expression: shows the loaded ratings frame as the cell output.
data

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [39]:
# Dimensions of the dense user x movie matrix. UserID / MovieID are 1-based in
# the data, so row/column index = id - 1.
nr_of_users = 943
nr_of_movies = 1682

# Dense ratings matrix; NaN marks "this user has not rated this movie".
matrix = np.full((nr_of_users, nr_of_movies), np.nan, dtype=float)


def add_to_matrix(row):
    """Write a single rating into the global ``matrix``.

    ``row`` must support ``row["UserID"]``, ``row["MovieID"]`` and
    ``row["Rating"]`` (for example one row of the ratings DataFrame).
    Kept for adding individual ratings; the bulk fill below does not use it.
    """
    matrix[row["UserID"] - 1, row["MovieID"] - 1] = row["Rating"]


# Fill every rating with one vectorised assignment instead of calling a Python
# function once per row through DataFrame.apply(axis=1).
# NOTE(review): assumes each (UserID, MovieID) pair occurs at most once in `data`.
matrix[
    data["UserID"].to_numpy() - 1,
    data["MovieID"].to_numpy() - 1,
] = data["Rating"].to_numpy()

matrix

array([[ 5.,  3.,  4., ..., nan, nan, nan],
       [ 4., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [ 5., nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan,  5., nan, ..., nan, nan, nan]])

In [12]:

def colaborative_filtering(matrix, limit_users=None):
    """Fill in missing ratings with user-based collaborative filtering.

    For each target user, every other user gets a cosine similarity computed
    only over the movies that both users have rated. A missing rating is then
    predicted as the similarity-weighted mean of the ratings other users gave
    that movie. Ratings the target user already has are copied through
    unchanged, and a movie stays NaN when the similarities of the users who
    rated it sum to zero (for example when nobody rated it).

    All sizes are taken from ``matrix.shape``, so the function works for any
    user x movie matrix and does not depend on notebook-level globals.

    Parameters
    ----------
    matrix : 2-D array-like of float
        User x movie ratings, with NaN meaning "not rated". Not modified.
    limit_users : int, optional
        Predict only for the first ``limit_users`` rows. ``None`` (default)
        means every row; values larger than the number of rows are clamped.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(limit_users, n_movies)`` holding original ratings
        where present and predictions elsewhere.
    """
    matrix = np.asarray(matrix, dtype=float)
    n_users, n_movies = matrix.shape
    if limit_users is None:
        limit_users = n_users
    # Never index past the last available user row.
    limit_users = min(limit_users, n_users)

    # Pre-computed once: which cells are rated, and the ratings with NaN -> 0
    # so that unrated cells contribute nothing to dot products / sums.
    rated = ~np.isnan(matrix)
    rated_weight = rated.astype(float)
    filled = np.where(rated, matrix, 0.0)
    squared = filled ** 2

    def get_all_similarities(user_nr):
        """Cosine similarity of ``user_nr`` to every user over co-rated movies."""
        # Products are zero wherever either user is unrated, so this is the
        # dot product restricted to the co-rated movies.
        dot_products = filled @ filled[user_nr]
        # Norm of the target user's ratings restricted to what each other user rated ...
        target_norms = np.sqrt(rated_weight @ squared[user_nr])
        # ... and of each other user's ratings restricted to what the target rated.
        other_norms = np.sqrt(squared @ rated_weight[user_nr])
        norm_products = target_norms * other_norms

        similarities = np.zeros(n_users, dtype=float)
        # Zero norm product (no co-rated movies) -> similarity 0.0.
        np.divide(dot_products, norm_products, out=similarities, where=norm_products > 0)
        # A user is never used as their own neighbour.
        similarities[user_nr] = 0.0
        return similarities

    def new_ratings(predict_user_nr):
        """Return the completed rating row for one user."""
        similarities = get_all_similarities(predict_user_nr)
        # Per movie: sum(sim * rating) and sum(sim) over the users who rated it.
        weighted_sum = similarities @ filled
        similarity_sum = similarities @ rated_weight

        completed = matrix[predict_user_nr].copy()
        # Predict only where the user has no rating and the weights are usable.
        to_predict = ~rated[predict_user_nr] & (similarity_sum != 0)
        completed[to_predict] = weighted_sum[to_predict] / similarity_sum[to_predict]
        return completed

    new_matrix = np.full((limit_users, n_movies), np.nan, dtype=float)
    for user_nr in range(limit_users):
        new_matrix[user_nr] = new_ratings(user_nr)
    return new_matrix

# Run the filter for the first 25 users only (limit_users=25); the result has
# one row per predicted user and one column per movie.
new_ratings = colaborative_filtering(matrix, limit_users=25)
# Bare expression: shows the predicted matrix as the cell output.
new_ratings

array([[5.        , 3.        , 4.        , ..., 2.        , 3.        ,
        3.        ],
       [4.        , 3.20766341, 3.03550597, ..., 2.        , 3.        ,
        3.        ],
       [3.88658948, 3.21110034, 3.04406695, ..., 2.        , 3.        ,
        3.        ],
       ...,
       [5.        , 3.21247108, 3.03103246, ..., 2.        , 3.        ,
        3.        ],
       [3.88194469, 3.20956107, 3.03655095, ..., 2.        , 3.        ,
        3.        ],
       [5.        , 3.21313913, 3.03815886, ..., 2.        , 3.        ,
        3.        ]])

In [21]:
# Side-by-side check: for every predicted user, print the original ratings and
# the completed ratings for movie columns 110..119.
for user_idx in range(len(new_ratings)):
    predicted_row = new_ratings[user_idx]
    print(f"previous val: {matrix[user_idx][110:120]}")
    print(f"new ratings: {predicted_row[110:120]}")
    print()

previous val: [5. 1. 5. 5. 5. 3. 3. 3. 5. 1.]
new ratings: [5. 1. 5. 5. 5. 3. 3. 3. 5. 1.]

previous val: [ 4. nan nan nan nan nan nan nan nan nan]
new ratings: [4.         2.43375069 4.11115734 4.44990081 3.73282881 3.82780177
 3.69372438 3.21684492 4.49540061 2.24248622]

previous val: [nan nan nan nan nan nan nan nan nan nan]
new ratings: [3.49608059 2.4812708  4.08343002 4.43780051 3.65733559 3.8097227
 3.69162611 3.22281981 4.31696279 2.24157826]

previous val: [nan nan nan nan nan nan nan nan nan nan]
new ratings: [3.50647861 2.47234156 4.24101216 4.45509687 3.74521707 3.83915767
 3.69355639 3.22458816 4.50160864 2.24482926]

previous val: [nan nan nan nan nan nan nan nan nan nan]
new ratings: [3.4859839  2.43806161 4.10341191 4.45366968 3.74140184 3.84441659
 3.69318365 3.22138553 4.5164253  2.24578914]

previous val: [ 2. nan nan nan nan nan  2. nan nan nan]
new ratings: [2.         2.44600892 4.10869523 4.44995129 3.72701458 3.83117506
 2.         3.20721499 4.50044577 2.23452

In [131]:
def split_by_time(data, ratings_to_predict = None, user_test_size = None, experience_split = None):
    """Split ratings per user into train/test by recency.

    Each user's ratings are sorted by ``Timestamp`` (newest first); the first
    ``limit`` rows go to the test frame and the rest to the train frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings with at least ``UserID`` and ``Timestamp`` columns.
    ratings_to_predict : int, optional
        Fixed number of newest ratings per user to hold out.
    user_test_size : float, optional
        Fraction of each user's ratings to hold out, computed as
        ``round(fraction * n_ratings + 0.5)``. Overrides ``ratings_to_predict``
        when both are given.
    experience_split : optional
        When truthy, the per-user history length is recorded.

    Returns
    -------
    (pd_train, pd_test, user_to_experience, nr_of_users)
        ``user_to_experience`` has columns ``UserID`` and ``HistoryLen`` and is
        empty unless ``experience_split`` is truthy; ``nr_of_users`` is the
        number of distinct users seen.

    Notes
    -----
    NOTE(review): if neither size argument is given (or both are falsy), the
    slice bound is ``None``, so every rating ends up in BOTH returned frames.
    NOTE(review): ``round(x + 0.5)`` uses Python's round-half-to-even, so it is
    not a consistent ceiling (2.5 -> 2 but 3.5 -> 4); kept as-is so existing
    splits do not change.
    """
    # Collect per-user pieces and concatenate once at the end: concatenating
    # onto an empty frame inside the loop is quadratic and lets the empty
    # object-dtype frame influence the resulting column dtypes.
    test_parts = []
    train_parts = []
    experience_rows = []
    nr_of_users = 0

    for user_nr, ratings in data.groupby("UserID"):
        ratings = ratings.sort_values(by='Timestamp', ascending=False)
        limit = None

        if ratings_to_predict:
            limit = ratings_to_predict
        if user_test_size:
            nr_of_ratings = ratings.shape[0]
            limit = round(user_test_size * nr_of_ratings + 0.5)

        test_parts.append(ratings[:limit])
        train_parts.append(ratings[limit:])
        if experience_split:
            experience_rows.append({'UserID': user_nr, 'HistoryLen': ratings.shape[0]})
        nr_of_users += 1

    def _combine(parts):
        """Concatenate the pieces, or return an empty frame shaped like ``data``."""
        if parts:
            return pd.concat(parts)
        return data.iloc[0:0].copy()

    pd_train = _combine(train_parts)
    pd_test = _combine(test_parts)
    user_to_experience = pd.DataFrame(experience_rows, columns=["UserID", "HistoryLen"])

    return pd_train, pd_test, user_to_experience, nr_of_users


In [132]:
def get_acc(model, ratings_to_predict = None, user_test_size = None, experience_split = None):
    """Fit ``model`` on a time-based split of the global ``data`` and report RMSE.

    The split comes from ``split_by_time``. The model is fitted on the train
    part and evaluated on the test part with ``accuracy.rmse``.

    When ``experience_split`` is truthy, users are ranked by history length
    (longest first) and RMSE is also computed on the test rows of two groups:
    the first ``round(n_users * experience_split)`` users ("top_experience")
    and the users from position ``round(n_users * (1 - experience_split))``
    onward ("low_experience").

    Returns a dict with keys ``"testset"``, ``"low_experience"`` and
    ``"top_experience"``; the last two stay ``None`` without an experience split.
    """
    def as_testset(frame):
        # (user, item, true rating) triples in the form model.test() is given here.
        return list(zip(frame['UserID'], frame['MovieID'], frame['Rating']))

    train_df, test_df, experience_df, user_count = split_by_time(
        data, ratings_to_predict, user_test_size, experience_split
    )
    results = dict.fromkeys(["testset", "low_experience", "top_experience"])

    rating_reader = Reader(rating_scale=(1, 5))
    train_dataset = Dataset.load_from_df(train_df[['UserID', 'MovieID', 'Rating']], rating_reader)
    full_testset = as_testset(test_df)

    model.fit(train_dataset.build_full_trainset())
    results["testset"] = accuracy.rmse(model.test(full_testset))

    # Without an experience split there is nothing more to evaluate.
    if not experience_split:
        return results

    ranked_users = experience_df.sort_values(by="HistoryLen", ascending=False)
    top_cut = round(user_count * experience_split)
    low_cut = round(user_count * (1 - experience_split))
    top_user_ids = ranked_users[:top_cut]["UserID"]
    low_user_ids = ranked_users[low_cut:]["UserID"]

    top_rows = test_df[test_df["UserID"].isin(top_user_ids)]
    low_rows = test_df[test_df["UserID"].isin(low_user_ids)]

    top_predictions = model.test(as_testset(top_rows))
    low_predictions = model.test(as_testset(low_rows))

    # Evaluate top first, then low, so the printed RMSE lines keep their order.
    results["top_experience"] = accuracy.rmse(top_predictions)
    results["low_experience"] = accuracy.rmse(low_predictions)
    return results

model = SVD()
# Evaluate the same model instance (refitted inside get_acc on every call) for
# increasing held-out fractions, always with a 20% experience split.
for held_out_fraction in (0.1, 0.2, 0.5, 0.8, 0.99):
    print(get_acc(model, user_test_size = held_out_fraction, experience_split=0.2))

RMSE: 0.9889
RMSE: 0.9612
RMSE: 1.0609
{'testset': 0.9888632638091445, 'low_experience': 1.060880797257871, 'top_experience': 0.961187420318332}
RMSE: 0.9964
RMSE: 0.9798
RMSE: 1.0648
{'testset': 0.9963754244337171, 'low_experience': 1.064843640066458, 'top_experience': 0.9798404804655072}
RMSE: 1.0152
RMSE: 1.0154
RMSE: 1.0959
{'testset': 1.0152070524046635, 'low_experience': 1.0958663530910715, 'top_experience': 1.015360016145473}
RMSE: 1.0505
RMSE: 1.0563
RMSE: 1.0791
{'testset': 1.050532920057816, 'low_experience': 1.0791023133081654, 'top_experience': 1.0562605096022974}
RMSE: 1.1189
RMSE: 1.1338
RMSE: 1.1237
{'testset': 1.1189199512748638, 'low_experience': 1.1237318841237183, 'top_experience': 1.13382029902064}


In [151]:
def get_top_n(predictions, n):
    """Return each user's ``n`` highest-estimated items.

    ``predictions`` is an iterable of 5-tuples ``(uid, iid, _, est, _)``.
    The result maps every ``uid`` to a list of ``(iid, est)`` pairs sorted by
    ``est`` in descending order and truncated to ``n`` entries; users appear
    in the order they are first seen.
    """
    per_user = {}
    for uid, iid, _, est, _ in predictions:
        per_user.setdefault(uid, []).append((iid, est))

    return {
        uid: sorted(pairs, key=lambda pair: pair[1], reverse=True)[:n]
        for uid, pairs in per_user.items()
    }

def get_coverage(predictions, n, nr_of_movies):
    """Fraction of the catalogue that appears in at least one top-``n`` list.

    Builds the per-user top-``n`` lists with ``get_top_n``, counts the distinct
    item ids across them, and divides by ``nr_of_movies``.
    """
    top_n = get_top_n(predictions, n)
    distinct_items = {iid for picks in top_n.values() for iid, _ in picks}
    return len(distinct_items) / nr_of_movies

## computed per https://en.wikipedia.org/wiki/Diversity_index#Simpson_index
def get_diversity(predictions, n, nr_of_movies):
    """Simpson index of the items recommended across all top-``n`` lists.

    Counts how often each item appears in the per-user top-``n`` lists (from
    ``get_top_n``) and returns the sum of squared shares,
    ``sum((count / total) ** 2)``. Returns ``0`` when nothing is recommended.
    ``nr_of_movies`` is accepted but not used.
    """
    item_counts = {}
    total_picks = 0
    for picks in get_top_n(predictions, n).values():
        for iid, _ in picks:
            item_counts[iid] = item_counts.get(iid, 0) + 1
            total_picks += 1

    # Accumulate in a plain loop so the float additions happen in dict order.
    simpson_index = 0
    for count in item_counts.values():
        share = count / total_picks
        simpson_index += share ** 2
    return simpson_index

reader = Reader(rating_scale=(1, 5))
# `data` is a pandas DataFrame and has no build_full_trainset(); wrap it in a
# Surprise Dataset first (same construction as in get_acc).
trainset = Dataset.load_from_df(data[['UserID', 'MovieID', 'Rating']], reader).build_full_trainset()

model = SVD()
model.fit(trainset)

# Every (user, item) pair that is absent from the training ratings.
anti_testset = trainset.build_anti_testset()

predictions = model.test(anti_testset)

# Share of the nr_of_movies catalogue that shows up in some user's top 10.
coverage = get_coverage(predictions, 10, nr_of_movies)
print(f"Item coverage: {coverage:.2%}")

# Sum of squared recommendation shares over the top-10 lists (Simpson index):
# the value grows as recommendations concentrate on fewer items.
diversity = get_diversity(predictions, 10, nr_of_movies)
print(f"Item diversity: {diversity:.2%}")

Item coverage: 16.83%
Item diversity: 2.24%
