In [None]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 48 kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1619435 sha256=493dbee11bdee76a78fa226d9aeeb0189430c0cc1bb07512f1e798c512aa460e
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
from surprise import Dataset
from surprise import Reader

# Mount Google Drive into the Colab runtime so the CSV path below resolves.
drive.mount('/content/drive')
# Load the raw ratings table; later cells rely on its userId, movieId, rating and timestamp columns.
ratings = pd.read_csv('/content/drive/My Drive/IIR_orientation/HW2/ratings_small.csv')
# Alternative when running outside Colab with the CSV in the working directory:
# ratings = pd.read_csv('ratings_small.csv')

Mounted at /content/drive


In [None]:
# df is the ratings table from ratings_small.csv with the timestamp column removed.
# It is the reference table for the train/test split and the hit-rate helpers below.
df = ratings.drop(['timestamp'], axis=1)
# NOTE(review): the scale is declared as 0-5; confirm the smallest rating that actually occurs
# in the file and align the lower bound with it if it differs.
reader = Reader(rating_scale=(0, 5))

# Wrap the (userId, movieId, rating) columns, in that order, as a Surprise Dataset.
data = Dataset.load_from_df(df[["userId", "movieId", "rating"]], reader)

In [None]:
# Leave one out
from surprise.model_selection import GridSearchCV,LeaveOneOut

# Single leave-one-out split with a fixed random_state so the held-out ratings are reproducible.
LOOCV = LeaveOneOut(n_splits=1, random_state=1)
# n_splits=1, so the loop body runs once. testSet keeps the list of held-out
# (userId, movieId, rating) tuples produced by the splitter; trainSet is rebound inside the
# loop from the splitter's training object to a plain DataFrame.
for trainSet, testSet in LOOCV.split(data):
  # Keep the rows of df whose (userId, movieId, rating) tuple is NOT among the held-out tuples.
  # NOTE(review): apply(tuple, 1) converts row by row, which is slow on large tables.
  trainSet = df[~df[['userId','movieId','rating']].apply(tuple, 1).isin(testSet)]

# Training Set After LeaveOneOut

In [None]:
# Display the training rows (df minus the held-out ratings).
trainSet

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
...,...,...,...
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5


# Testing Set After LeaveOneOut

In [None]:
# Show the held-out tuples as a table, one row per entry of testSet.
pd.DataFrame(testSet, columns=['userId','movieId','rating'])

Unnamed: 0,userId,movieId,rating
0,1,1263,2.0
1,2,165,3.0
2,3,377,2.5
3,4,2114,5.0
4,5,5679,4.5
...,...,...,...
666,667,501,5.0
667,668,1221,5.0
668,669,2395,4.0
669,670,34,4.0


# Functions For Top-N Hit Rate

In [None]:
def getUser_100Movies(UserId):
  """Build the 100-movie candidate list used to evaluate one user's held-out rating.

  The list holds 99 movies sampled without replacement from the movies in ``df`` that the
  user has no rating for, followed by the user's held-out movie from ``testSet``. The
  held-out movie is always the last element.

  Args:
    UserId: user id as it appears in ``df['userId']`` and in the first field of the
      ``testSet`` tuples.

  Returns:
    A numpy array of 100 movie ids (99 sampled negatives + the held-out movie).

  Raises:
    KeyError: if ``testSet`` has no entry for ``UserId``.
    ValueError: raised by the sampler if fewer than 99 unrated movies exist for the user.
  """
  # Find the held-out entry by user id instead of by list position: positional lookup
  # (testSet[UserId-1]) is only right when ids are contiguous, 1-based and in list order.
  heldOutMovie = next((entry[1] for entry in testSet if entry[0] == UserId), None)
  if heldOutMovie is None:
    raise KeyError('no held-out rating in testSet for user %r' % (UserId,))

  # A private generator seeded with 1 draws the same sample as reseeding numpy's global
  # generator with 1 would, without resetting global random state on every call.
  rng = np.random.RandomState(1)

  ratedByUser = np.array(df[df['userId']==UserId]['movieId'])
  unratedMovies = np.setdiff1d(df['movieId'].unique(), ratedByUser)
  sampledNegatives = rng.choice(unratedMovies, 99, replace=False)
  return np.append(sampledNegatives, heldOutMovie)

In [None]:
def UserTopN_IsHit(UserId, N, model):
  """Tell whether the user's held-out movie ranks in the model's top N candidates.

  The candidates come from ``getUser_100Movies`` (held-out movie last). Each candidate is
  scored with ``model.predict(UserId, movieId).est`` and the candidates are ranked by that
  score in descending order.

  Args:
    UserId: user id passed to ``getUser_100Movies`` and to ``model.predict``.
    N: number of top-ranked candidates to keep.
    model: object exposing ``predict(user_id, movie_id)`` whose result has an ``est`` attribute.

  Returns:
    True if the held-out movie is among the N highest-scored candidates, else False.
  """
  candidates = getUser_100Movies(UserId)
  scored = pd.DataFrame({
      'movieId': candidates,
      'pred_rating': [model.predict(UserId, movieId).est for movieId in candidates],
  })
  topN = scored.sort_values(by='pred_rating', ascending=False).head(N)
  # The held-out movie is the last candidate by construction; return the membership test directly.
  return bool(candidates[-1] in topN['movieId'].to_numpy())

In [None]:
def TopN_HitRate(N, model):
  """Return the fraction of users whose held-out movie lands in the model's top N.

  Every distinct user id in ``df['userId']`` is evaluated once with ``UserTopN_IsHit``.

  Args:
    N: size of the top-ranked list checked for each user.
    model: forwarded unchanged to ``UserTopN_IsHit``.

  Returns:
    Hits divided by the number of distinct users, as a float; 0.0 when there are no users.
  """
  # Compute the distinct users once; they drive both the loop and the denominator.
  users = df['userId'].unique()
  if len(users) == 0:
    # No users to evaluate: report 0.0 rather than dividing by zero.
    return 0.0
  hits = sum(1 for UserId in users if UserTopN_IsHit(UserId, N, model))
  return hits/len(users)

# Use similarity metric
[similarity measures](https://surprise.readthedocs.io/en/stable/similarities.html)

In [None]:
from surprise import KNNWithMeans
from surprise.model_selection import GridSearchCV,LeaveOneOut

# Same splitter configuration as the earlier leave-one-out cell (n_splits=1, random_state=1).
LOOCV = LeaveOneOut(n_splits=1, random_state=1)

# Similarity settings to search: 2 measures x 3 min_support values x 2 orientations
# (item-based when user_based is False, user-based when True) = 12 combinations.
sim_options = {
    "name": ["msd", "cosine"],
    "min_support": [3, 4, 5],
    "user_based": [False, True],
}

param_grid = {"sim_options": sim_options}

# Grid-search KNNWithMeans over the combinations above, scoring RMSE and MAE on the split.
# NOTE(review): with refit=True the search object is given the full `data`; if the refit uses
# all of it (see the Surprise GridSearchCV docs), the model later scored by TopN_HitRate has
# already seen the held-out ratings in testSet. Confirm, and consider fitting the best
# estimator on the training split only before measuring hit rate.
gs_KNNWithMeans = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse", "mae"], cv=LOOCV, refit=True)
gs_KNNWithMeans.fit(data)

# Report the best RMSE and the parameter combination that achieved it.
print(gs_KNNWithMeans.best_score["rmse"])
print(gs_KNNWithMeans.best_params["rmse"])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
0.8865727574921929
{'sim_options'

In [None]:
# Spot-check one held-out rating: testSet lists user 3 / movie 377 with a true rating of 2.5.
# .est is the rating estimated by the tuned model for that pair.

gs_KNNWithMeans.predict(3,377).est

3.3052537913019875

# Top-N Hit Rate (similarity metric)

In [None]:
# Fraction of users whose held-out movie is in the KNN model's top 5 of their 100 candidates.
print('Top-5 Hit Rate:',TopN_HitRate(5, gs_KNNWithMeans))

Top-5 Hit Rate: 0.044709388971684055


In [None]:
# Same evaluation with a stricter cut-off: held-out movie must be in the top 3 of 100 candidates.
print('Top-3 Hit Rate:',TopN_HitRate(3, gs_KNNWithMeans))

Top-3 Hit Rate: 0.022354694485842028


# Use Matrix Factorization

In [None]:
from surprise import SVD


# SVD hyper-parameter grid: 2 epoch counts x 2 learning rates x 2 regularisation strengths
# = 8 combinations. This rebinds param_grid from the KNN search above.
# NOTE(review): the recorded best parameters (n_epochs=10, lr_all=0.005, reg_all=0.4) each sit
# on an edge of their range, so a wider grid may score better - worth checking.
param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
# Reuses the LOOCV splitter defined earlier; scores RMSE and MAE.
# NOTE(review): refit=True on the full `data` raises the same held-out-leakage question as in
# the KNN search - confirm before trusting the hit rates computed from gs_SVD.
gs_SVD = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=LOOCV, refit=True)

gs_SVD.fit(data)

# Report the best RMSE and the parameter combination that achieved it.
print(gs_SVD.best_score["rmse"])
print(gs_SVD.best_params["rmse"])

0.8966791757731263
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [None]:
# Spot-check: the tuned SVD model's estimated rating for user 1 on movie 110.
gs_SVD.predict(1,110).est

3.2663820816681466

# Top-N Hit Rate (Matrix Factorization)

In [None]:
# Fraction of users whose held-out movie is in the SVD model's top 5 of their 100 candidates.
print('Top-5 Hit Rate:',TopN_HitRate(5, gs_SVD))

Top-5 Hit Rate: 0.3055141579731744


In [None]:
# Same evaluation with a stricter cut-off: held-out movie must be in the top 3 of 100 candidates.
print('Top-3 Hit Rate:',TopN_HitRate(3, gs_SVD))

Top-3 Hit Rate: 0.2429210134128167


# Save Model

In [None]:
import joblib

# Persist both fitted grid-search objects to Google Drive (requires the drive mount above).
# joblib files are pickle-based: only load them back from a location you trust.
joblib.dump(gs_KNNWithMeans, '/content/drive/My Drive/IIR_orientation/HW2/KNNWithMeans_model.pkl')
joblib.dump(gs_SVD, '/content/drive/My Drive/IIR_orientation/HW2/SVD_model.pkl')


['/content/drive/My Drive/IIR_orientation/HW2/SVD_model.pkl']