In [10]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 25 00:44:54 2019

@author: ziyingfeng
"""
import numpy as np
import pandas as pd
import heapq
from surprise import Dataset, Reader
from surprise import SVD, NMF
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

    Using Surprise package instead of Sklearn
    Because Surprise has its own SGD, therefore it can deal with sparse matrix
    For Sklearn, we need to fill the NAN with numbers it is not practical here

## Import Data

In [27]:
columns = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv('ml-100k/u.data', sep = '\t', names = columns)
print(data.head())

   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


In [28]:
# In Surprise package we don't need this part
# Here just generate a utility matrix for reference
utilityMatrix = pd.pivot_table(data, values = 'rating', 
                               index = 'user_id', 
                               columns = 'item_id', 
                               aggfunc = np.max)
print(utilityMatrix.shape)
movieUtilityMatrix = utilityMatrix.T
print(movieUtilityMatrix)

(943, 1682)
user_id  1    2    3    4    5    6    7    8    9    10   ...  934  935  936  \
item_id                                                    ...                  
1        5.0  4.0  NaN  NaN  4.0  4.0  NaN  NaN  NaN  4.0  ...  2.0  3.0  4.0   
2        3.0  NaN  NaN  NaN  3.0  NaN  NaN  NaN  NaN  NaN  ...  4.0  NaN  NaN   
3        4.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  4.0   
4        3.0  NaN  NaN  NaN  NaN  NaN  5.0  NaN  NaN  4.0  ...  5.0  NaN  NaN   
5        3.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
...      ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
1678     NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
1679     NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
1680     NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
1681     NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN  NaN   
1682     NaN  Na

## Convert Data

    Surprise has its own data input handle methods
    We need to use reader to read the dataframe and convert to trainset
    Since we are care doing item_based, care about similarity between items, we change the column order

In [3]:
reader = Reader(rating_scale = (1, 5))
item_based_data = Dataset.load_from_df(data[['item_id', 'user_id', 'rating']], reader)
full_trainset = item_based_data.build_full_trainset()
trainSet, testSet = train_test_split(item_based_data, test_size = 0.2, random_state = 10)

## Train and Test for the Missing Ratings

    Model choosing NMF or SVD for matrix factorization
    NMF (Non-negative Matrix Factorization)
    SVD (Singular Vector Decomposition)
    random_state is seed so that the result could be repeated

In [4]:
model_NMF = NMF(n_factors = 15, random_state = 10)
model_SVD = SVD(n_factors = 15, random_state = 10)

model_NMF.fit(trainSet)
model_SVD.fit(trainSet)

# Use both models to predict the NAN ratings
# Check the RMSE of the predictions, RMSE is calculated based on the non-NAN value in the testSet
predictions_NMF = model_NMF.test(testSet)
predictions_SVD = model_SVD.test(testSet)
print("NMF model RMSE is {:.3f}".format(accuracy.rmse(predictions_NMF, verbose = False)))
print("SVD model RMSE is {:.3f}".format(accuracy.rmse(predictions_SVD, verbose = False)))

NMF model RMSE is 0.962
SVD model RMSE is 0.928


## Similarity Matrix

    Use the model in the whole dataset, and use similarity to recommend similar item (movie)
    The default similarity computation method MSD (Mean Square Difference), here we choose Pearson Difference
    Since we have change the column order in the "Convert Data" part, so for 'user_based' we set True here

In [20]:
full_trainset = item_based_data.build_full_trainset()
model_NMF_fullTrainSet = NMF(n_factors = 15, random_state = 10)
model_NMF_fullTrainSet.sim_options = {'name':'pearson', 'user_based':True}
model_SVD_fullTrainSet = SVD(n_factors = 15, random_state = 10)
model_SVD_fullTrainSet.sim_options = {'name':'pearson', 'user_based':True}
model_NMF_fullTrainSet.fit(full_trainset)
model_SVD_fullTrainSet.fit(full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10faf3780>

In [22]:
simMatrix_NMF = model_NMF_fullTrainSet.compute_similarities() 
simMatrix_SVD = model_SVD_fullTrainSet.compute_similarities()
print('NMF similarity matrix shape: ', simMatrix_NMF.shape)
print('SVD similarity matrix shape: ', simMatrix_SVD.shape)

print('\nPrint the first 7 rows and 7 columns of the NMF similarity matrix:')
print(simMatrix_NMF[0:6][0:6])
print('\nPrint the first 7 rows and 7 columns of the SVD similarity matrix:')
print(simMatrix_SVD[0:6][0:6])

NMF similarity matrix shape:  (1682, 1682)
SVD similarity matrix shape:  (1682, 1682)

Print the first 7 rows and 7 columns of the NMF similarity matrix:
[[ 1.          0.42799905  0.         ...  0.          0.
   0.        ]
 [ 0.42799905  1.          0.13363062 ...  0.          0.
   0.        ]
 [ 0.          0.13363062  1.         ...  0.          0.
   0.        ]
 [ 0.07430376  0.2994642   0.78107061 ...  0.          0.
   0.        ]
 [ 0.56612358  0.24559122  0.         ...  0.          0.
   0.        ]
 [-0.10081339  0.22424013  0.         ...  0.          0.
   0.        ]]

Print the first 7 rows and 7 columns of the SVD similarity matrix:
[[ 1.          0.42799905  0.         ...  0.          0.
   0.        ]
 [ 0.42799905  1.          0.13363062 ...  0.          0.
   0.        ]
 [ 0.          0.13363062  1.         ...  0.          0.
   0.        ]
 [ 0.07430376  0.2994642   0.78107061 ...  0.          0.
   0.        ]
 [ 0.56612358  0.24559122  0.         ...  0.  

## Find Similar Items

In [23]:
# Import the user_id and movie_title mapping data
items = pd.read_csv('ml-100k/u.item', sep = '|', header = None, encoding = 'latin-1')
items = items.iloc[:, 0:2]
items.columns = ['item_id', 'movie_title']
print('Print part of the item_id and the corresponding movie_title:')
print(items.iloc[0:9, :])

# Select a movie and later we will recommend movies similar to this
print('\n Select a movie that I like, and find its item_id')
print(items[items['movie_title'].str.contains('Star Wars')])

Print part of the item_id and the corresponding movie_title:
   item_id                                        movie_title
0        1                                   Toy Story (1995)
1        2                                   GoldenEye (1995)
2        3                                  Four Rooms (1995)
3        4                                  Get Shorty (1995)
4        5                                     Copycat (1995)
5        6  Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6        7                              Twelve Monkeys (1995)
7        8                                        Babe (1995)
8        9                            Dead Man Walking (1995)

 Select a movie that I like, and find its item_id
    item_id       movie_title
49       50  Star Wars (1977)


In [24]:
# Similarity of this selected movie and the others
sim_selected = simMatrix_NMF[49]
print('\n Similarities based on the selected movie \n', sim_selected)


 Similarities based on the selected movie 
 [ 0.05365553  0.24856031 -0.875      ...  0.          0.
  0.        ]


In [25]:
# Find the indices of the k most similar items with heap
k = 10
k_sim_idx_heap = []
for i in range(len(sim_selected)):
    if i == 49:
        continue
    if len(k_sim_idx_heap) < k:
        heapq.heappush(k_sim_idx_heap, (sim_selected[i], i))
    else:
        if sim_selected[i] > k_sim_idx_heap[0][0]:
            heapq.heappop(k_sim_idx_heap)
            heapq.heappush(k_sim_idx_heap, (sim_selected[i], i))
print(k_sim_idx_heap)

[(1.0, 343), (1.0, 398), (1.0, 448), (1.0, 626), (1.0, 1085), (1.0, 1153), (1.0, 1160), (1.0, 1135), (1.0, 788), (1.0, 1308)]


In [26]:
# print out the corresponding movie_title
# since it is min heap, when printing we print from the last item to the first item
print('\n The recommemed movies are:')
for i in range(len(k_sim_idx_heap)-1, -1, -1):
    idx = k_sim_idx_heap[i][1]
    print(items.iloc[idx, 1])


 The recommemed movies are:
Very Natural Thing, A (1974)
Swimming with Sharks (1995)
Ghosts of Mississippi (1996)
Palookaville (1996)
Alphaville (1965)
It's My Party (1995)
Robin Hood: Prince of Thieves (1991)
Star Trek: The Motion Picture (1979)
Three Musketeers, The (1993)
Apostle, The (1997)
