# Lab 3: Matrix Factorization

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/biodatlab/xlab-recommendation/blob/notebook/03_matrix_factorization_movie.ipynb)

* Dataset: https://grouplens.org/datasets/movielens/latest/
  * สามารถ download dataset เองตรงๆได้จาก link ด้านบน แล้วเปลี่ยน `rating_path` และ `movies_path` ตามที่อยู่ของไฟล์ได้
  * ใน notebook นี้จะทำการ download dataset ที่ฝากไว้ใน google drive ด้วย `gdown`
* Objectives
  * แนะนำหนังใหม่ที่ user ไม่เคยดู จากหนังที่ user เคยให้คะแนน (rating)
* Notes
  * scikit-learn Non-Negative Matrix Factorization: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
  * scipy sparse matrix: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html
  * pandas Categorical: https://pandas.pydata.org/docs/reference/api/pandas.Categorical.html
  * gradio interface: https://www.gradio.app/docs/interface
  * gdown: https://github.com/wkentaro/gdown

## 0. Install Essential Libraries

In [None]:
! pip install pandas
! pip install numpy
! pip install scikit-learn
! pip install scipy
! pip install gradio
! pip install gdown

## 1. Data Preparation

In [None]:
# import essential library

import os
import os.path as op

import numpy as np
import pandas as pd
from sklearn.decomposition import NMF
from scipy.sparse import csr_matrix

In [None]:
# load dataset from drive

import gdown

# Shared Google Drive folder that holds the MovieLens CSV files for this lab.
url = "https://drive.google.com/drive/folders/1dtGM-kA_3ylXZntgWv1xMUSVlZhSW5jy?usp=sharing"
# Download the whole folder without sending browser cookies.
# NOTE(review): the next cell expects the files under
# "<cwd>/movielens-latest-dataset" - confirm gdown's default output location.
gdown.download_folder(url=url, use_cookies=False)

In [59]:
# read dataset
# Both paths are resolved against the current working directory; change
# `rating_path` / `movies_path` if the CSV files were downloaded manually.
rating_path = op.join(os.getcwd(), "movielens-latest-dataset/ratings.csv")
movies_path = op.join(os.getcwd(), "movielens-latest-dataset/movies.csv")

# Ids are read as strings so they behave as labels (later turned into
# categoricals) rather than as numbers.
rating_df = pd.read_csv(
    rating_path,
    usecols=["userId", "movieId", "rating"],
    dtype={"userId": str, "movieId": str},
)
movies_df = pd.read_csv(
    movies_path,
    usecols=["movieId", "title", "genres"],
    dtype={"movieId": str, "title": str},
)

In [60]:
# Preview the first rows of the ratings table (userId, movieId, rating).
rating_df.head()

   userId  movieId  rating   timestamp
0       1       31     2.5  1260759144
1       1     1029     3.0  1260759179
2       1     1061     3.0  1260759182
3       1     1129     2.0  1260759185
4       1     1172     4.0  1260759205


In [62]:
# Preview the first rows of the movies table (movieId, title, genres).
movies_df.head()

In [None]:
# Count the distinct userId values that appear in the ratings table.
rating_df["userId"].nunique()

In [None]:
# Attach each movie's title and genres to its ratings (used for display).
# The inner join keeps only ratings whose movieId also exists in movies_df.
rating_df = pd.merge(rating_df, movies_df, on="movieId", how="inner")

In [None]:
# Preview the ratings table after title and genres were merged in.

rating_df.head()

In [None]:
# Prepare the columns for the sparse matrix: ratings as floats, ids as
# pandas categoricals whose integer codes serve as row/column indices.
rating_df["rating"] = rating_df["rating"].astype(float)

for id_column in ("userId", "movieId"):
    rating_df[id_column] = rating_df[id_column].astype("category")

In [None]:
# Build the user x movie rating matrix in CSR format.
# Row index = userId category code, column index = movieId category code,
# stored value = rating.
rating_values = rating_df["rating"].astype(float)
user_codes = rating_df["userId"].cat.codes
movie_codes = rating_df["movieId"].cat.codes

X_rating = csr_matrix((rating_values, (user_codes, movie_codes))).tocsr()

## 2. Matrix Factorization

In [None]:
# Factorize the rating matrix with NMF using 10 latent components:
# W holds one row of component weights per user (matrix row),
# H holds one column of component weights per movie (matrix column).
model = NMF(n_components=10, init="nndsvd", verbose=1)

W = np.array(model.fit_transform(X_rating))
H = np.array(model.components_)

In [None]:
# Pick one user to walk through the recommendation steps.
selected_user_id = '21'
is_selected_user = rating_df["userId"] == selected_user_id
selected_user_df = rating_df.loc[is_selected_user]
# Display this user's ratings, highest first (the sorted copy is not stored).
selected_user_df.sort_values("rating", ascending=False)

In [76]:
# Reconstruct the selected user's row of the user-item matrix:
# that user's row of W multiplied by H gives one predicted score per movie.
selected_user_code = selected_user_df["userId"].cat.codes.iloc[0]
predicted_rating = W[selected_user_code] @ H

In [None]:
# Movie codes ordered from highest to lowest predicted rating:
# argsort ranks ascending, so the result is reversed.
sort_rating_index = np.argsort(predicted_rating)[::-1]

In [None]:
# select only the top k movies that the user has not rated yet

k = 20

# The goal is to recommend *new* movies, so drop the codes of movies the
# selected user already rated before taking the top k.
seen_movie_cat_code = selected_user_df["movieId"].cat.codes.unique()
unseen_rating_index = sort_rating_index[
    ~np.isin(sort_rating_index, seen_movie_cat_code)
]

selected_movie_cat_code = unseen_rating_index[:k]
selected_predict_rating = predicted_rating[selected_movie_cat_code]

# get movieId from categorical codes

rec_movie_id = pd.Categorical.from_codes(selected_movie_cat_code, categories=rating_df["movieId"].cat.categories)

In [None]:
# Recommendation table: movie id and predicted rating, enriched with the
# title and genres looked up in movies_df.
rec_df = pd.DataFrame(
    {
        "movieId": rec_movie_id,
        "predicted rating": selected_predict_rating,
    }
).merge(movies_df, how="inner", on="movieId")

In [None]:
# Preview the first rows of the recommendation table.
rec_df.head()

In [None]:
# interactive application input userId -> show previous ratings, recommend new movies

import gradio as gr

def recommend_movie(user_id):
    """Return a user's previous ratings and new movie recommendations.

    Relies on the notebook globals ``rating_df`` (with categorical
    ``userId`` / ``movieId`` columns), ``movies_df``, and the NMF factors
    ``W`` and ``H``.

    Args:
        user_id: User id as typed into the text box; surrounding whitespace
            is ignored.

    Returns:
        A tuple ``(previous, recommended)`` of DataFrames: up to 10 of the
        user's own ratings (highest first) and up to 10 movies the user has
        not rated yet, ordered by predicted rating (highest first).

    Raises:
        gr.Error: If ``user_id`` does not appear in ``rating_df``.
    """
    # Text boxes often carry stray spaces/newlines; ids are stored as strings.
    user_id = str(user_id).strip()

    user_mask = rating_df["userId"] == user_id
    if not user_mask.any():
        raise gr.Error("User id not found")

    selected_user_df = rating_df[user_mask].sort_values(by="rating", ascending=False)

    # The userId category code is the user's row index in W.
    user_code = selected_user_df["userId"].cat.codes.iloc[0]
    predicted_rating = np.dot(W[user_code], H)

    # Rank every movie code by predicted rating (high -> low), then drop the
    # movies this user already rated so only new movies are recommended.
    ranked_cat_code = np.argsort(predicted_rating)[::-1]
    seen_cat_code = selected_user_df["movieId"].cat.codes.unique()
    unseen_cat_code = ranked_cat_code[~np.isin(ranked_cat_code, seen_cat_code)]
    sort_rating_cat_code = unseen_cat_code[:20]
    selected_predict_rating = predicted_rating[sort_rating_cat_code]

    # Translate category codes back into the original movieId labels.
    rec_movie_id = pd.Categorical.from_codes(
        sort_rating_cat_code,
        categories=rating_df["movieId"].cat.categories,
    )

    rec_df = pd.DataFrame({
        "movieId": rec_movie_id,
        "predicted rating": selected_predict_rating,
    })
    rec_df = rec_df.merge(movies_df, how='inner', on='movieId')

    return selected_user_df.head(10), rec_df.head(10)


# Gradio UI: one text input for the user id, two tables for the results
# returned by recommend_movie (previous ratings, recommendations).
result_tables = [
    gr.DataFrame(label="Previous rate"),
    gr.DataFrame(label="Recommend"),
]

demo = gr.Interface(
    fn=recommend_movie,
    inputs="text",
    outputs=result_tables,
    examples=['21','50'],
)

demo.launch()