In [0]:
!pip install surprise
from surprise import KNNBasic, Reader, Dataset
from collections import defaultdict
from operator import itemgetter
import csv, sys, os
import heapq

In [0]:
# Mount Google Drive at /content/drive (Colab helper); the CSV paths used by
# the Films class below live under '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
class Films:
    """Loads the ratings file and the movie-title file from Google Drive.

    Attributes:
        film_Label: dict mapping integer movie ID -> movie title.
        label_Film: dict mapping movie title -> integer movie ID.
        link1: absolute path of the ratings CSV handed to Surprise.
        link2: absolute path of the movies CSV (ID in column 0, title in column 1).
    """

    # Class-level defaults; download() replaces both dicts with fresh
    # per-instance dicts, so these only serve lookups made before download().
    film_Label = {}
    label_Film = {}
    link1 = '/content/drive/My Drive/ml-latest-small/ratings_after_wrangling.csv'
    link2 = '/content/drive/My Drive/ml-latest-small/movies.csv'

    def download(self):
        """Read both CSV files and return the ratings as a Surprise dataset.

        Side effects:
            Rebuilds ``self.film_Label`` and ``self.label_Film`` from the
            movies CSV.

        Returns:
            The object returned by ``Dataset.load_from_file`` for ``link1``.
        """
        # NOTE: the working directory is deliberately left alone. Both paths
        # are absolute, and inside a notebook sys.argv[0] refers to the kernel
        # launcher rather than this file, so chdir-ing to its directory either
        # moved the process somewhere unrelated or raised on an empty dirname.
        self.film_Label = {}
        self.label_Film = {}

        # Ratings rows are "user,item,rating,timestamp" with one header line.
        reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
        ratingsDataset = Dataset.load_from_file(self.link1, reader=reader)

        with open(self.link2, newline='', encoding='ISO-8859-1') as csvfile:
            movieReader = csv.reader(csvfile)
            next(movieReader, None)  # Skip header line (tolerates an empty file)
            for row in movieReader:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to index.
                    continue
                movieID = int(row[0])
                movieName = row[1]
                self.film_Label[movieID] = movieName
                # If two IDs share a title, the later row wins in this map.
                self.label_Film[movieName] = movieID

        return ratingsDataset

    def filmLabel(self, movieID):
        """Return the title stored for ``movieID``, or "" if it is unknown."""
        return self.film_Label.get(movieID, "")

# Dashboard

Load our data set and compute the user similarity matrix

In [0]:
# Run settings: raw ID of the user to recommend for, and the neighbourhood size.
USER = '70'
k = 10

# Load the ratings; this also fills the movie ID -> title lookup held by `ml`.
ml = Films()
data = ml.download()

# Train on every available rating.
trainSet = data.build_full_trainset()

# Similarity settings passed to the KNN model: cosine, between users.
sim_options = dict(name='cosine', user_based=True)

In [0]:
# Fit the user-based KNN model. fit() already builds the similarity matrix
# (it prints the "Computing the cosine similarity matrix..." message itself).
model = KNNBasic(sim_options=sim_options)
model.fit(trainSet)

# Reuse the matrix that fit() stored on the model instead of calling
# compute_similarities() again, which rebuilt the identical matrix a second time.
simsMatrix = model.sim


Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


Get the top k users most similar to our test user



In [0]:
# Translate the raw user ID into the trainset's inner ID and take that row of
# the similarity matrix.
testUserInnerID = trainSet.to_inner_uid(USER)
similarityRow = simsMatrix[testUserInnerID]

# Pair every other user's inner ID with its similarity score; the test user's
# own entry is left out.
similarUsers = [
    (innerID, score)
    for innerID, score in enumerate(similarityRow)
    if innerID != testUserInnerID
]

# Keep the k pairs with the highest score.
kNeighbors = heapq.nlargest(k, similarUsers, key=itemgetter(1))




Collect the items those neighbours rated and, for each item, add up the ratings (each divided by 5) weighted by the neighbour's similarity to the test user

In [0]:
# Accumulate a score per item: each neighbour contributes its rating divided
# by 5.0, multiplied by that neighbour's similarity to the test user.
candidates = defaultdict(float)
for innerID, userSimilarityScore in kNeighbors:
    for ratedItem, stars in trainSet.ur[innerID]:
        candidates[ratedItem] += (stars / 5.0) * userSimilarityScore

Build a dictionary of the items the test user has already rated

In [0]:
# Membership lookup keyed by the inner item IDs found in the test user's own
# ratings; the value 1 is only a placeholder.
watched = {itemID: 1 for itemID, _ in trainSet.ur[testUserInnerID]}

Print the highest-scoring items from similar users, skipping anything the test user has already rated:

In [0]:
# Walk the candidates from highest to lowest accumulated score and print the
# title and score of each one the test user has not rated.
pos = 0
ranked = sorted(candidates.items(), key=itemgetter(1), reverse=True)
for itemID, ratingSum in ranked:
    if itemID in watched:
        continue  # already rated by the test user
    movieID = trainSet.to_raw_iid(itemID)
    print(ml.filmLabel(int(movieID)), ratingSum)
    pos += 1
    # The counter is tested after printing, so the loop ends after six titles.
    if pos > 5:
        break

Fugitive, The (1993) 5.2
Die Hard: With a Vengeance (1995) 4.800000000000001
Twister (1996) 4.8
Firm, The (1993) 4.6
True Lies (1994) 4.5
Silence of the Lambs, The (1991) 4.4
